diff options
author | android-build-team Robot <android-build-team-robot@google.com> | 2019-05-11 23:20:42 +0000 |
---|---|---|
committer | android-build-team Robot <android-build-team-robot@google.com> | 2019-05-11 23:20:42 +0000 |
commit | ae9e769d90104bb1a680601e15c9befdd26e21cf (patch) | |
tree | 3f75dbf1ca3dc3455647615cde513037d4d06e0e | |
parent | 6908555f3ced722d3f7badd753b3813e13526c8b (diff) | |
parent | 22776ab2e71269213c6206f19e4b5d04a3384164 (diff) | |
download | platform_external_libaom-android-10.0.0_r2.tar.gz platform_external_libaom-android-10.0.0_r2.tar.bz2 platform_external_libaom-android-10.0.0_r2.zip |
Snap for 5558509 from 22776ab2e71269213c6206f19e4b5d04a3384164 to qt-releaseandroid-vts-10.0_r5android-vts-10.0_r4android-vts-10.0_r3android-vts-10.0_r2android-vts-10.0_r1android-cts-10.0_r5android-cts-10.0_r4android-cts-10.0_r3android-cts-10.0_r2android-cts-10.0_r1android-10.0.0_r6android-10.0.0_r5android-10.0.0_r46android-10.0.0_r4android-10.0.0_r3android-10.0.0_r2android-10.0.0_r17android-10.0.0_r11android-10.0.0_r10android-10.0.0_r1android10-tests-releaseandroid10-security-releaseandroid10-s3-releaseandroid10-s2-releaseandroid10-s1-releaseandroid10-release
Change-Id: Ibdbdbc94e55e035ddd9afffe44c26fc7fde20a98
288 files changed, 37114 insertions, 16681 deletions
@@ -122,7 +122,6 @@ aom_av1_decoder_sources = [ aom_av1_encoder_asm_sse2 = [ "libaom/av1/encoder/x86/dct_sse2.asm", "libaom/av1/encoder/x86/error_sse2.asm", - "libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm", ] aom_av1_encoder_asm_ssse3_x86_64 = [ @@ -132,8 +131,11 @@ aom_av1_encoder_asm_ssse3_x86_64 = [ aom_av1_encoder_intrin_avx2 = [ "libaom/av1/encoder/x86/av1_quantize_avx2.c", "libaom/av1/encoder/x86/av1_highbd_quantize_avx2.c", + "libaom/av1/encoder/x86/corner_match_avx2.c", "libaom/av1/encoder/x86/error_intrin_avx2.c", + "libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c", "libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c", + "libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c", "libaom/av1/encoder/x86/wedge_utils_avx2.c", "libaom/av1/encoder/x86/encodetxb_avx2.c", "libaom/av1/encoder/x86/rdopt_avx2.c", @@ -170,6 +172,8 @@ aom_av1_encoder_intrin_sse4_1 = [ "libaom/av1/encoder/x86/encodetxb_sse4.c", "libaom/av1/encoder/x86/highbd_fwd_txfm_sse4.c", "libaom/av1/encoder/x86/rdopt_sse4.c", + "libaom/av1/encoder/x86/temporal_filter_sse4.c", + "libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c", "libaom/av1/encoder/x86/pickrst_sse4.c", ] @@ -194,20 +198,25 @@ aom_av1_encoder_sources = [ "libaom/av1/encoder/encodeframe.c", "libaom/av1/encoder/encodemb.c", "libaom/av1/encoder/encodemv.c", + "libaom/av1/encoder/encode_strategy.c", "libaom/av1/encoder/encoder.c", "libaom/av1/encoder/encodetxb.c", "libaom/av1/encoder/ethread.c", "libaom/av1/encoder/extend.c", "libaom/av1/encoder/firstpass.c", "libaom/av1/encoder/global_motion.c", + "libaom/av1/encoder/gop_structure.c", "libaom/av1/encoder/hash.c", "libaom/av1/encoder/hash_motion.c", "libaom/av1/encoder/hybrid_fwd_txfm.c", + "libaom/av1/encoder/level.c", "libaom/av1/encoder/lookahead.c", "libaom/av1/encoder/mbgraph.c", "libaom/av1/encoder/mcomp.c", "libaom/av1/encoder/ml.c", "libaom/av1/encoder/palette.c", + "libaom/av1/encoder/partition_strategy.c", + "libaom/av1/encoder/pass2_strategy.c", 
"libaom/av1/encoder/pickcdef.c", "libaom/av1/encoder/picklpf.c", "libaom/av1/encoder/pickrst.c", @@ -220,7 +229,9 @@ aom_av1_encoder_sources = [ "libaom/av1/encoder/speed_features.c", "libaom/av1/encoder/temporal_filter.c", "libaom/av1/encoder/tokenize.c", + "libaom/av1/encoder/tpl_model.c", "libaom/av1/encoder/wedge_utils.c", + "libaom/av1/encoder/var_based_part.c", "libaom/third_party/fastfeat/fast.c", "libaom/third_party/fastfeat/fast_9.c", "libaom/third_party/fastfeat/nonmax.c", diff --git a/config/arm/config/aom_config.asm b/config/arm/config/aom_config.asm index b8fcd42..50338c1 100644 --- a/config/arm/config/aom_config.asm +++ b/config/arm/config/aom_config.asm @@ -13,7 +13,8 @@ ARCH_MIPS equ 0 ARCH_PPC equ 0 ARCH_X86 equ 0 ARCH_X86_64 equ 0 -CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1 +CONFIG_2PASS_PARTITION_SEARCH_LVL_END equ 3 +CONFIG_2PASS_PARTITION_SEARCH_LVL_START equ 1 CONFIG_ACCOUNTING equ 0 CONFIG_ANALYZER equ 0 CONFIG_AV1_DECODER equ 1 @@ -21,7 +22,8 @@ CONFIG_AV1_ENCODER equ 0 CONFIG_BIG_ENDIAN equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 -CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1 +CONFIG_COLLECT_COMPONENT_TIMING equ 0 +CONFIG_COLLECT_PARTITION_STATS equ 0 CONFIG_COLLECT_RD_STATS equ 0 CONFIG_DEBUG equ 0 CONFIG_DENOISE equ 1 @@ -29,11 +31,8 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_FILEOPTIONS equ 1 -CONFIG_FIX_GF_LENGTH equ 1 -CONFIG_FP_MB_STATS equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 -CONFIG_GLOBAL_MOTION_SEARCH equ 1 CONFIG_GPROF equ 0 CONFIG_INSPECTION equ 0 CONFIG_INTERNAL_STATS equ 0 @@ -44,16 +43,15 @@ CONFIG_MAX_DECODE_PROFILE equ 0 CONFIG_MISMATCH_DEBUG equ 0 CONFIG_MULTITHREAD equ 1 CONFIG_NORMAL_TILE_MODE equ 1 -CONFIG_ONE_PASS_SVM equ 0 CONFIG_OS_SUPPORT equ 1 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 -CONFIG_REDUCED_ENCODER_BORDER equ 0 CONFIG_RUNTIME_CPU_DETECT equ 0 CONFIG_SHARED equ 0 CONFIG_SHARP_SETTINGS equ 0 CONFIG_SIZE_LIMIT equ 1 
CONFIG_SPATIAL_RESAMPLING equ 1 +CONFIG_SPEED_STATS equ 0 CONFIG_STATIC equ 1 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 diff --git a/config/arm/config/aom_config.h b/config/arm/config/aom_config.h index 5418985..a3b86df 100644 --- a/config/arm/config/aom_config.h +++ b/config/arm/config/aom_config.h @@ -15,7 +15,8 @@ #define ARCH_PPC 0 #define ARCH_X86 0 #define ARCH_X86_64 0 -#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1 +#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3 +#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1 #define CONFIG_ACCOUNTING 0 #define CONFIG_ANALYZER 0 #define CONFIG_AV1_DECODER 1 @@ -23,7 +24,8 @@ #define CONFIG_BIG_ENDIAN 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define CONFIG_COLLECT_PARTITION_STATS 0 #define CONFIG_COLLECT_RD_STATS 0 #define CONFIG_DEBUG 0 #define CONFIG_DENOISE 1 @@ -31,11 +33,8 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_FILEOPTIONS 1 -#define CONFIG_FIX_GF_LENGTH 1 -#define CONFIG_FP_MB_STATS 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 -#define CONFIG_GLOBAL_MOTION_SEARCH 1 #define CONFIG_GPROF 0 #define CONFIG_INSPECTION 0 #define CONFIG_INTERNAL_STATS 0 @@ -46,16 +45,15 @@ #define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_MULTITHREAD 1 #define CONFIG_NORMAL_TILE_MODE 1 -#define CONFIG_ONE_PASS_SVM 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 -#define CONFIG_REDUCED_ENCODER_BORDER 0 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_SHARED 0 #define CONFIG_SHARP_SETTINGS 0 #define CONFIG_SIZE_LIMIT 1 #define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_SPEED_STATS 0 #define CONFIG_STATIC 1 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 diff --git a/config/arm/config/aom_dsp_rtcd.h b/config/arm/config/aom_dsp_rtcd.h index e3150f7..0b1a28a 100644 --- a/config/arm/config/aom_dsp_rtcd.h +++ 
b/config/arm/config/aom_dsp_rtcd.h @@ -1400,10 +1400,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define aom_v_predictor_8x8 aom_v_predictor_8x8_neon -void av1_round_shift_array_c(int32_t *arr, int size, int bit); -void av1_round_shift_array_neon(int32_t *arr, int size, int bit); -#define av1_round_shift_array av1_round_shift_array_neon - void aom_dsp_rtcd(void); #include "config/aom_config.h" diff --git a/config/arm/config/aom_scale_rtcd.h b/config/arm/config/aom_scale_rtcd.h index 7260bd3..067ddb4 100644 --- a/config/arm/config/aom_scale_rtcd.h +++ b/config/arm/config/aom_scale_rtcd.h @@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2); #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c +int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes); +#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c + void aom_scale_rtcd(void); #include "config/aom_config.h" diff --git a/config/arm/config/av1_rtcd.h b/config/arm/config/av1_rtcd.h index c58e511..6f42666 100644 --- a/config/arm/config/av1_rtcd.h +++ b/config/arm/config/av1_rtcd.h @@ -89,6 +89,22 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); #define av1_convolve_y_sr av1_convolve_y_sr_neon +void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int 
src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_neon + +void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_neon + +void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_neon + +void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams 
*filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_neon + void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy); #define av1_dr_prediction_z1 av1_dr_prediction_z1_c @@ -140,6 +156,18 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *d void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c +void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c + +void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c + +void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams 
*filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c + +void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c + void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd); #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c @@ -152,27 +180,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c -void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c - -void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c - void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c -void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c - -void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define 
av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c - -void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c - -void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c - void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c @@ -182,12 +192,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c -void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c - -void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c - void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c @@ -200,18 +204,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c -void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, 
ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c - -void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c - -void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c - -void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c - void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta); #define av1_highbd_warp_affine av1_highbd_warp_affine_c @@ -279,21 +271,9 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_inv_txfm_add av1_inv_txfm_add_neon -void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams 
*filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_2d av1_jnt_convolve_2d_neon - -void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_neon - -void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_x av1_jnt_convolve_x_neon - -void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int 
dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_y av1_jnt_convolve_y_neon +void av1_round_shift_array_c(int32_t *arr, int size, int bit); +void av1_round_shift_array_neon(int32_t *arr, int size, int bit); +#define av1_round_shift_array av1_round_shift_array_neon int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, diff --git a/config/arm64/config/aom_config.asm b/config/arm64/config/aom_config.asm index b8fcd42..50338c1 100644 --- a/config/arm64/config/aom_config.asm +++ b/config/arm64/config/aom_config.asm @@ -13,7 +13,8 @@ ARCH_MIPS equ 0 ARCH_PPC equ 0 ARCH_X86 equ 0 ARCH_X86_64 equ 0 -CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1 +CONFIG_2PASS_PARTITION_SEARCH_LVL_END equ 3 +CONFIG_2PASS_PARTITION_SEARCH_LVL_START equ 1 CONFIG_ACCOUNTING equ 0 CONFIG_ANALYZER equ 0 CONFIG_AV1_DECODER equ 1 @@ -21,7 +22,8 @@ CONFIG_AV1_ENCODER equ 0 CONFIG_BIG_ENDIAN equ 0 CONFIG_BITSTREAM_DEBUG equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 -CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1 +CONFIG_COLLECT_COMPONENT_TIMING equ 0 +CONFIG_COLLECT_PARTITION_STATS equ 0 CONFIG_COLLECT_RD_STATS equ 0 CONFIG_DEBUG equ 0 CONFIG_DENOISE equ 1 @@ -29,11 +31,8 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1 CONFIG_DIST_8X8 equ 0 CONFIG_ENTROPY_STATS equ 0 CONFIG_FILEOPTIONS equ 1 -CONFIG_FIX_GF_LENGTH equ 1 -CONFIG_FP_MB_STATS equ 0 CONFIG_GCC equ 1 CONFIG_GCOV equ 0 -CONFIG_GLOBAL_MOTION_SEARCH equ 1 CONFIG_GPROF equ 0 CONFIG_INSPECTION equ 0 CONFIG_INTERNAL_STATS equ 0 @@ -44,16 +43,15 @@ CONFIG_MAX_DECODE_PROFILE equ 0 CONFIG_MISMATCH_DEBUG equ 0 CONFIG_MULTITHREAD equ 1 CONFIG_NORMAL_TILE_MODE equ 1 -CONFIG_ONE_PASS_SVM equ 0 CONFIG_OS_SUPPORT equ 1 CONFIG_PIC equ 0 CONFIG_RD_DEBUG equ 0 -CONFIG_REDUCED_ENCODER_BORDER equ 0 CONFIG_RUNTIME_CPU_DETECT equ 0 
CONFIG_SHARED equ 0 CONFIG_SHARP_SETTINGS equ 0 CONFIG_SIZE_LIMIT equ 1 CONFIG_SPATIAL_RESAMPLING equ 1 +CONFIG_SPEED_STATS equ 0 CONFIG_STATIC equ 1 CONFIG_WEBM_IO equ 1 DECODE_HEIGHT_LIMIT equ 16384 diff --git a/config/arm64/config/aom_config.h b/config/arm64/config/aom_config.h index 5418985..a3b86df 100644 --- a/config/arm64/config/aom_config.h +++ b/config/arm64/config/aom_config.h @@ -15,7 +15,8 @@ #define ARCH_PPC 0 #define ARCH_X86 0 #define ARCH_X86_64 0 -#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1 +#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3 +#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1 #define CONFIG_ACCOUNTING 0 #define CONFIG_ANALYZER 0 #define CONFIG_AV1_DECODER 1 @@ -23,7 +24,8 @@ #define CONFIG_BIG_ENDIAN 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define CONFIG_COLLECT_PARTITION_STATS 0 #define CONFIG_COLLECT_RD_STATS 0 #define CONFIG_DEBUG 0 #define CONFIG_DENOISE 1 @@ -31,11 +33,8 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_FILEOPTIONS 1 -#define CONFIG_FIX_GF_LENGTH 1 -#define CONFIG_FP_MB_STATS 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 -#define CONFIG_GLOBAL_MOTION_SEARCH 1 #define CONFIG_GPROF 0 #define CONFIG_INSPECTION 0 #define CONFIG_INTERNAL_STATS 0 @@ -46,16 +45,15 @@ #define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_MULTITHREAD 1 #define CONFIG_NORMAL_TILE_MODE 1 -#define CONFIG_ONE_PASS_SVM 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 -#define CONFIG_REDUCED_ENCODER_BORDER 0 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_SHARED 0 #define CONFIG_SHARP_SETTINGS 0 #define CONFIG_SIZE_LIMIT 1 #define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_SPEED_STATS 0 #define CONFIG_STATIC 1 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 diff --git a/config/arm64/config/aom_dsp_rtcd.h b/config/arm64/config/aom_dsp_rtcd.h index 
e3150f7..0b1a28a 100644 --- a/config/arm64/config/aom_dsp_rtcd.h +++ b/config/arm64/config/aom_dsp_rtcd.h @@ -1400,10 +1400,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define aom_v_predictor_8x8 aom_v_predictor_8x8_neon -void av1_round_shift_array_c(int32_t *arr, int size, int bit); -void av1_round_shift_array_neon(int32_t *arr, int size, int bit); -#define av1_round_shift_array av1_round_shift_array_neon - void aom_dsp_rtcd(void); #include "config/aom_config.h" diff --git a/config/arm64/config/aom_scale_rtcd.h b/config/arm64/config/aom_scale_rtcd.h index 7260bd3..067ddb4 100644 --- a/config/arm64/config/aom_scale_rtcd.h +++ b/config/arm64/config/aom_scale_rtcd.h @@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2); #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c +int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes); +#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c + void aom_scale_rtcd(void); #include "config/aom_config.h" diff --git a/config/arm64/config/av1_rtcd.h b/config/arm64/config/av1_rtcd.h index c58e511..6f42666 100644 --- a/config/arm64/config/av1_rtcd.h +++ b/config/arm64/config/av1_rtcd.h @@ -89,6 +89,22 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); #define 
av1_convolve_y_sr av1_convolve_y_sr_neon +void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_neon + +void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_neon + +void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_neon + +void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int 
dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_neon + void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy); #define av1_dr_prediction_z1 av1_dr_prediction_z1_c @@ -140,6 +156,18 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *d void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c +void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c + +void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c + +void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int 
src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c + +void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c + void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd); #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c @@ -152,27 +180,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c -void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c - -void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c - void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c -void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c - -void av1_highbd_inv_txfm_add_32x16_c(const 
tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c - -void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c - -void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c - void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c @@ -182,12 +192,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c -void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c - -void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c - void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c @@ -200,18 +204,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c -void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const 
InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c - -void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c - -void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c - -void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c - void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta); #define av1_highbd_warp_affine av1_highbd_warp_affine_c @@ -279,21 +271,9 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_inv_txfm_add av1_inv_txfm_add_neon -void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int 
h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_2d av1_jnt_convolve_2d_neon - -void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_neon - -void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_x av1_jnt_convolve_x_neon - -void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void 
av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_y av1_jnt_convolve_y_neon +void av1_round_shift_array_c(int32_t *arr, int size, int bit); +void av1_round_shift_array_neon(int32_t *arr, int size, int bit); +#define av1_round_shift_array av1_round_shift_array_neon int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, diff --git a/config/x86/config/aom_config.asm b/config/x86/config/aom_config.asm index 4360c87..222e3bf 100644 --- a/config/x86/config/aom_config.asm +++ b/config/x86/config/aom_config.asm @@ -3,7 +3,7 @@ %define ARCH_PPC 0 %define ARCH_X86 1 %define ARCH_X86_64 0 -%define CONFIG_2PASS_PARTITION_SEARCH_LVL 1 +%define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1 %define CONFIG_ACCOUNTING 0 %define CONFIG_ANALYZER 0 %define CONFIG_AV1_DECODER 1 @@ -11,7 +11,8 @@ %define CONFIG_BIG_ENDIAN 0 %define CONFIG_BITSTREAM_DEBUG 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -%define CONFIG_COLLECT_INTER_MODE_RD_STATS 1 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 +%define CONFIG_COLLECT_PARTITION_STATS 0 %define CONFIG_COLLECT_RD_STATS 0 %define CONFIG_DEBUG 0 %define CONFIG_DENOISE 1 @@ -19,11 +20,8 @@ %define CONFIG_DIST_8X8 0 %define CONFIG_ENTROPY_STATS 0 %define CONFIG_FILEOPTIONS 1 -%define CONFIG_FIX_GF_LENGTH 1 -%define CONFIG_FP_MB_STATS 0 %define CONFIG_GCC 1 %define CONFIG_GCOV 0 -%define CONFIG_GLOBAL_MOTION_SEARCH 1 %define CONFIG_GPROF 0 %define CONFIG_INSPECTION 0 %define CONFIG_INTERNAL_STATS 0 @@ -34,16 +32,15 @@ %define CONFIG_MISMATCH_DEBUG 0 %define CONFIG_MULTITHREAD 1 %define CONFIG_NORMAL_TILE_MODE 1 -%define CONFIG_ONE_PASS_SVM 0 %define CONFIG_OS_SUPPORT 1 %define CONFIG_PIC 1 %define CONFIG_RD_DEBUG 0 -%define 
CONFIG_REDUCED_ENCODER_BORDER 0 %define CONFIG_RUNTIME_CPU_DETECT 0 %define CONFIG_SHARED 0 %define CONFIG_SHARP_SETTINGS 0 %define CONFIG_SIZE_LIMIT 1 %define CONFIG_SPATIAL_RESAMPLING 1 +%define CONFIG_SPEED_STATS 0 %define CONFIG_STATIC 1 %define CONFIG_WEBM_IO 1 %define DECODE_HEIGHT_LIMIT 16384 diff --git a/config/x86/config/aom_config.h b/config/x86/config/aom_config.h index e162899..db2edbd 100644 --- a/config/x86/config/aom_config.h +++ b/config/x86/config/aom_config.h @@ -15,7 +15,8 @@ #define ARCH_PPC 0 #define ARCH_X86 1 #define ARCH_X86_64 0 -#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1 +#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3 +#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1 #define CONFIG_ACCOUNTING 0 #define CONFIG_ANALYZER 0 #define CONFIG_AV1_DECODER 1 @@ -23,7 +24,8 @@ #define CONFIG_BIG_ENDIAN 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define CONFIG_COLLECT_PARTITION_STATS 0 #define CONFIG_COLLECT_RD_STATS 0 #define CONFIG_DEBUG 0 #define CONFIG_DENOISE 1 @@ -31,11 +33,8 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_FILEOPTIONS 1 -#define CONFIG_FIX_GF_LENGTH 1 -#define CONFIG_FP_MB_STATS 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 -#define CONFIG_GLOBAL_MOTION_SEARCH 1 #define CONFIG_GPROF 0 #define CONFIG_INSPECTION 0 #define CONFIG_INTERNAL_STATS 0 @@ -46,16 +45,15 @@ #define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_MULTITHREAD 1 #define CONFIG_NORMAL_TILE_MODE 1 -#define CONFIG_ONE_PASS_SVM 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_PIC 1 #define CONFIG_RD_DEBUG 0 -#define CONFIG_REDUCED_ENCODER_BORDER 0 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_SHARED 0 #define CONFIG_SHARP_SETTINGS 0 #define CONFIG_SIZE_LIMIT 1 #define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_SPEED_STATS 0 #define CONFIG_STATIC 1 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 diff --git 
a/config/x86/config/aom_dsp_rtcd.h b/config/x86/config/aom_dsp_rtcd.h index 8f11e0b..f84f313 100644 --- a/config/x86/config/aom_dsp_rtcd.h +++ b/config/x86/config/aom_dsp_rtcd.h @@ -1650,9 +1650,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2 -void av1_round_shift_array_c(int32_t *arr, int size, int bit); -#define av1_round_shift_array av1_round_shift_array_c - void aom_dsp_rtcd(void); #ifdef RTCD_C diff --git a/config/x86/config/aom_scale_rtcd.h b/config/x86/config/aom_scale_rtcd.h index b6e8149..65c184b 100644 --- a/config/x86/config/aom_scale_rtcd.h +++ b/config/x86/config/aom_scale_rtcd.h @@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2); #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c +int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes); +#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c + void aom_scale_rtcd(void); #ifdef RTCD_C diff --git a/config/x86/config/av1_rtcd.h b/config/x86/config/av1_rtcd.h index c5d7794..f788933 100644 --- a/config/x86/config/av1_rtcd.h +++ b/config/x86/config/av1_rtcd.h @@ -88,6 +88,23 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); #define av1_convolve_y_sr av1_convolve_y_sr_sse2 +void 
av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_ssse3 + +void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_sse2 + +void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams 
*filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_sse2 + +void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_sse2 + void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy); #define av1_dr_prediction_z1 av1_dr_prediction_z1_c @@ -143,6 +160,18 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *d void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_ssse3 +void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c + +void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams 
*filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c + +void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c + +void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c + void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd); #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c @@ -155,27 +184,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c -void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c - -void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c - void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_16x4 
av1_highbd_inv_txfm_add_16x4_c -void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c - -void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c - -void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c - -void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c - void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c @@ -185,12 +196,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c -void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c - -void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c - void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c @@ -203,18 +208,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, 
uint8_t *dest, int dest_stride, int bd); #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c -void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c - -void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c - -void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c - -void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c - void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta); #define av1_highbd_warp_affine av1_highbd_warp_affine_c @@ -283,22 +276,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, 
con void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_inv_txfm_add av1_inv_txfm_add_ssse3 -void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_2d av1_jnt_convolve_2d_ssse3 - -void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_sse2 - -void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t 
*dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_x av1_jnt_convolve_x_sse2 - -void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_y av1_jnt_convolve_y_sse2 +void av1_round_shift_array_c(int32_t *arr, int size, int bit); +#define av1_round_shift_array av1_round_shift_array_c int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, diff --git a/config/x86_64/config/aom_config.asm b/config/x86_64/config/aom_config.asm index 986dc75..43e7f74 100644 --- a/config/x86_64/config/aom_config.asm +++ b/config/x86_64/config/aom_config.asm @@ -3,7 +3,7 @@ %define ARCH_PPC 0 %define ARCH_X86 0 %define ARCH_X86_64 1 -%define CONFIG_2PASS_PARTITION_SEARCH_LVL 1 +%define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1 %define CONFIG_ACCOUNTING 0 %define CONFIG_ANALYZER 0 %define CONFIG_AV1_DECODER 1 @@ -11,7 +11,8 @@ %define CONFIG_BIG_ENDIAN 0 %define CONFIG_BITSTREAM_DEBUG 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -%define CONFIG_COLLECT_INTER_MODE_RD_STATS 1 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 +%define CONFIG_COLLECT_PARTITION_STATS 0 %define CONFIG_COLLECT_RD_STATS 0 %define CONFIG_DEBUG 0 %define CONFIG_DENOISE 1 @@ -19,11 +20,8 @@ %define CONFIG_DIST_8X8 0 %define CONFIG_ENTROPY_STATS 0 %define CONFIG_FILEOPTIONS 1 
-%define CONFIG_FIX_GF_LENGTH 1 -%define CONFIG_FP_MB_STATS 0 %define CONFIG_GCC 1 %define CONFIG_GCOV 0 -%define CONFIG_GLOBAL_MOTION_SEARCH 1 %define CONFIG_GPROF 0 %define CONFIG_INSPECTION 0 %define CONFIG_INTERNAL_STATS 0 @@ -34,16 +32,15 @@ %define CONFIG_MISMATCH_DEBUG 0 %define CONFIG_MULTITHREAD 1 %define CONFIG_NORMAL_TILE_MODE 1 -%define CONFIG_ONE_PASS_SVM 0 %define CONFIG_OS_SUPPORT 1 %define CONFIG_PIC 0 %define CONFIG_RD_DEBUG 0 -%define CONFIG_REDUCED_ENCODER_BORDER 0 %define CONFIG_RUNTIME_CPU_DETECT 0 %define CONFIG_SHARED 0 %define CONFIG_SHARP_SETTINGS 0 %define CONFIG_SIZE_LIMIT 1 %define CONFIG_SPATIAL_RESAMPLING 1 +%define CONFIG_SPEED_STATS 0 %define CONFIG_STATIC 1 %define CONFIG_WEBM_IO 1 %define DECODE_HEIGHT_LIMIT 16384 diff --git a/config/x86_64/config/aom_config.h b/config/x86_64/config/aom_config.h index 0f32913..610e8ca 100644 --- a/config/x86_64/config/aom_config.h +++ b/config/x86_64/config/aom_config.h @@ -15,7 +15,8 @@ #define ARCH_PPC 0 #define ARCH_X86 0 #define ARCH_X86_64 1 -#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1 +#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3 +#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1 #define CONFIG_ACCOUNTING 0 #define CONFIG_ANALYZER 0 #define CONFIG_AV1_DECODER 1 @@ -23,7 +24,8 @@ #define CONFIG_BIG_ENDIAN 0 #define CONFIG_BITSTREAM_DEBUG 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 +#define CONFIG_COLLECT_PARTITION_STATS 0 #define CONFIG_COLLECT_RD_STATS 0 #define CONFIG_DEBUG 0 #define CONFIG_DENOISE 1 @@ -31,11 +33,8 @@ #define CONFIG_DIST_8X8 0 #define CONFIG_ENTROPY_STATS 0 #define CONFIG_FILEOPTIONS 1 -#define CONFIG_FIX_GF_LENGTH 1 -#define CONFIG_FP_MB_STATS 0 #define CONFIG_GCC 1 #define CONFIG_GCOV 0 -#define CONFIG_GLOBAL_MOTION_SEARCH 1 #define CONFIG_GPROF 0 #define CONFIG_INSPECTION 0 #define CONFIG_INTERNAL_STATS 0 @@ -46,16 +45,15 @@ #define CONFIG_MISMATCH_DEBUG 0 #define 
CONFIG_MULTITHREAD 1 #define CONFIG_NORMAL_TILE_MODE 1 -#define CONFIG_ONE_PASS_SVM 0 #define CONFIG_OS_SUPPORT 1 #define CONFIG_PIC 0 #define CONFIG_RD_DEBUG 0 -#define CONFIG_REDUCED_ENCODER_BORDER 0 #define CONFIG_RUNTIME_CPU_DETECT 0 #define CONFIG_SHARED 0 #define CONFIG_SHARP_SETTINGS 0 #define CONFIG_SIZE_LIMIT 1 #define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_SPEED_STATS 0 #define CONFIG_STATIC 1 #define CONFIG_WEBM_IO 1 #define DECODE_HEIGHT_LIMIT 16384 diff --git a/config/x86_64/config/aom_dsp_rtcd.h b/config/x86_64/config/aom_dsp_rtcd.h index 8f11e0b..f84f313 100644 --- a/config/x86_64/config/aom_dsp_rtcd.h +++ b/config/x86_64/config/aom_dsp_rtcd.h @@ -1650,9 +1650,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2 -void av1_round_shift_array_c(int32_t *arr, int size, int bit); -#define av1_round_shift_array av1_round_shift_array_c - void aom_dsp_rtcd(void); #ifdef RTCD_C diff --git a/config/x86_64/config/aom_scale_rtcd.h b/config/x86_64/config/aom_scale_rtcd.h index b6e8149..65c184b 100644 --- a/config/x86_64/config/aom_scale_rtcd.h +++ b/config/x86_64/config/aom_scale_rtcd.h @@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2); #define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c +int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes); +#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c + void aom_scale_rtcd(void); #ifdef RTCD_C diff --git a/config/x86_64/config/av1_rtcd.h b/config/x86_64/config/av1_rtcd.h index 043595d..84673ba 100644 
--- a/config/x86_64/config/av1_rtcd.h +++ b/config/x86_64/config/av1_rtcd.h @@ -88,6 +88,23 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); #define av1_convolve_y_sr av1_convolve_y_sr_sse2 +void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_ssse3 + +void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_2d_copy 
av1_dist_wtd_convolve_2d_copy_sse2 + +void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_sse2 + +void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); +#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_sse2 + void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy); #define av1_dr_prediction_z1 av1_dr_prediction_z1_c @@ -146,6 +163,18 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *d void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); #define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_ssse3 +void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int 
src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c + +void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c + +void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c + +void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); +#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c + void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd); #define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c @@ -158,27 +187,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c -void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, 
uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c - -void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c - void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c -void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c - -void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c - -void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c - -void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c - void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c @@ -188,12 +199,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c -void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c - -void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t 
*dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); -#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c - void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c @@ -206,18 +211,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); #define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c -void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c - -void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c - -void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c - -void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd); -#define av1_highbd_jnt_convolve_y 
av1_highbd_jnt_convolve_y_c - void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta); #define av1_highbd_warp_affine av1_highbd_warp_affine_c @@ -286,22 +279,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param); #define av1_inv_txfm_add av1_inv_txfm_add_ssse3 -void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_2d av1_jnt_convolve_2d_ssse3 - -void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams 
*filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_sse2 - -void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_x av1_jnt_convolve_x_sse2 - -void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params); -#define av1_jnt_convolve_y av1_jnt_convolve_y_sse2 +void av1_round_shift_array_c(int32_t *arr, int size, int bit); +#define av1_round_shift_array av1_round_shift_array_c int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height, int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride, diff --git a/libaom/CMakeLists.txt b/libaom/CMakeLists.txt index f409892..2c35a0f 100644 --- a/libaom/CMakeLists.txt +++ b/libaom/CMakeLists.txt @@ -293,8 +293,11 @@ if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES) if(EMSCRIPTEN) add_preproc_definition(_POSIX_SOURCE) - append_link_flag_to_target("inspect" "-s TOTAL_MEMORY=402653184") + append_link_flag_to_target("inspect" "--emrun") + 
append_link_flag_to_target("inspect" "-s USE_PTHREADS=0") + append_link_flag_to_target("inspect" "-s WASM=1") append_link_flag_to_target("inspect" "-s MODULARIZE=1") + append_link_flag_to_target("inspect" "-s ALLOW_MEMORY_GROWTH=1") append_link_flag_to_target( "inspect" "-s \'EXTRA_EXPORTED_RUNTIME_METHODS=[\"UTF8ToString\"]\'") append_link_flag_to_target("inspect" diff --git a/libaom/PATENTS b/libaom/PATENTS index be491f5..493f616 100644 --- a/libaom/PATENTS +++ b/libaom/PATENTS @@ -57,10 +57,10 @@ Alliance for Open Media Patent License 1.0 2. Definitions. -2.1. Affiliate. “Affiliate” means an entity that directly or indirectly +2.1. Affiliate. "Affiliate" means an entity that directly or indirectly Controls, is Controlled by, or is under common Control of that party. -2.2. Control. “Control” means direct or indirect control of more than 50% of +2.2. Control. "Control" means direct or indirect control of more than 50% of the voting power to elect directors of that corporation, or for any other entity, the power to direct management of such entity. @@ -70,7 +70,7 @@ Alliance for Open Media Patent License 1.0 2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can be decoded by a Decoder only to the extent it produces such a bitstream. -2.5. Final Deliverable. “Final Deliverable” means the final version of a +2.5. Final Deliverable. "Final Deliverable" means the final version of a deliverable approved by the Alliance for Open Media as a Final Deliverable. @@ -79,9 +79,9 @@ Alliance for Open Media Patent License 1.0 Implementation also includes components of an Implementation only to the extent they are used as part of an Implementation. -2.7. License. “License” means this license. +2.7. License. "License" means this license. -2.8. Licensee. “Licensee” means any person or entity who exercises patent +2.8. Licensee. "Licensee" means any person or entity who exercises patent rights granted under this License. 2.9. Licensor. 
"Licensor" means (i) any Licensee that makes, sells, offers @@ -98,11 +98,11 @@ Alliance for Open Media Patent License 1.0 as if the Specification was a W3C Recommendation; or (ii) are infringed by the Reference Implementation. -2.11. Reference Implementation. “Reference Implementation” means an Encoder +2.11. Reference Implementation. "Reference Implementation" means an Encoder and/or Decoder released by the Alliance for Open Media as a Final Deliverable. -2.12. Specification. “Specification” means the specification designated by +2.12. Specification. "Specification" means the specification designated by the Alliance for Open Media as a Final Deliverable for which this License was issued. diff --git a/libaom/aom/aom_encoder.h b/libaom/aom/aom_encoder.h index 777236f..f8a7cec 100644 --- a/libaom/aom/aom_encoder.h +++ b/libaom/aom/aom_encoder.h @@ -406,8 +406,7 @@ typedef struct aom_codec_enc_cfg { * upscaling after the encode/decode process. Taking control of upscaling and * using restoration filters should allow it to outperform normal resizing. * - * Mode 0 is SUPERRES_NONE, mode 1 is SUPERRES_FIXED, mode 2 is - * SUPERRES_RANDOM and mode 3 is SUPERRES_QTHRESH. + * Valid values are 0 to 4 as defined in enum SUPERRES_MODE. */ unsigned int rc_superres_mode; @@ -862,6 +861,11 @@ aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx, */ aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx); +/*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */ +#define AOM_USAGE_GOOD_QUALITY (0) +/*!\brief usage parameter analogous to AV1 REALTIME mode. */ +#define AOM_USAGE_REALTIME (1) + /*!\brief Encode a frame * * Encodes a video frame at the given "presentation time." The presentation diff --git a/libaom/aom/aom_frame_buffer.h b/libaom/aom/aom_frame_buffer.h index fba4322..a715645 100644 --- a/libaom/aom/aom_frame_buffer.h +++ b/libaom/aom/aom_frame_buffer.h @@ -53,9 +53,9 @@ typedef struct aom_codec_frame_buffer { * data. 
The callback is triggered when the decoder needs a frame buffer to * decode a compressed image into. This function may be called more than once * for every call to aom_codec_decode. The application may set fb->priv to - * some data which will be passed back in the ximage and the release function - * call. |fb| is guaranteed to not be NULL. On success the callback must - * return 0. Any failure the callback must return a value less than 0. + * some data which will be passed back in the aom_image_t and the release + * function call. |fb| is guaranteed to not be NULL. On success the callback + * must return 0. Any failure the callback must return a value less than 0. * * \param[in] priv Callback's private data * \param[in] new_size Size in bytes needed by the buffer diff --git a/libaom/aom/aomcx.h b/libaom/aom/aomcx.h index 9aa77bb..da7498f 100644 --- a/libaom/aom/aomcx.h +++ b/libaom/aom/aomcx.h @@ -512,16 +512,25 @@ enum aome_enc_control_id { */ AV1E_SET_RENDER_SIZE, - /*!\brief Codec control function to set target level. - * - * 255: off (default); 0: only keep level stats; 10: target for level 1.0; - * 11: target for level 1.1; ... 62: target for level 6.2 - */ - AV1E_SET_TARGET_LEVEL, - - /*!\brief Codec control function to get bitstream level. - */ - AV1E_GET_LEVEL, + /*!\brief Control to set target sequence level index for a certain operating + * point(OP). + * Possible values are in the form of "ABxy"(pad leading zeros if less than + * 4 digits). + * AB: OP index. + * xy: Target level index for the OP. Can be values 0~23(corresponding to + * level 2.0 ~ 7.3) or 31(maximum level parameter, no level-based + * constraints). + * E.g. "0" means target level index 0 for the 0th OP; + * "111" means target level index 11 for the 1st OP; + * "1021" means target level index 21 for the 10th OP. + * If the target level is not specified for an OP, the maximum level parameter + * of 31 is used as default. 
+ */ + AV1E_SET_TARGET_SEQ_LEVEL_IDX, + + /*!\brief Codec control function to get sequence level index. + */ + AV1E_GET_SEQ_LEVEL_IDX, /*!\brief Codec control function to set intended superblock size. * @@ -561,12 +570,23 @@ enum aome_enc_control_id { */ AV1E_SET_ENABLE_RESTORATION, + /*!\brief Codec control function to predict with OBMC mode. + * + * 0 = do not allow OBMC mode + * 1 = allow OBMC mode + * + * By default, the encoder allows OBMC prediction mode. + * + */ + AV1E_SET_ENABLE_OBMC, + /*!\brief Codec control function to encode without trellis quantization. * * 0 = apply trellis quantization * 1 = do not apply trellis quantization + * 2 = disable trellis quantization partially * - * By default, the encoder applies trellis optimization on quantized + * By default, the encoder applies optimization on quantized * coefficients. * */ @@ -700,13 +720,59 @@ enum aome_enc_control_id { */ AV1E_SET_ANS_WINDOW_SIZE_LOG2, - /*!\brief Codec control function to turn on / off dual filter - * enabling/disabling. + /*!\brief Codec control function to enable/disable rectangular partitions. + * + * This will enable or disable usage of rectangular partitions. The default + * value is 1. + * + */ + AV1E_SET_ENABLE_RECT_PARTITIONS, + + /*!\brief Codec control function to enable/disable AB partitions. + * + * This will enable or disable usage of AB partitions. The default + * value is 1. + * + */ + AV1E_SET_ENABLE_AB_PARTITIONS, + + /*!\brief Codec control function to enable/disable 1:4 and 4:1 partitions. * - * This will enable or disable dual filter. The default value is 1 + * This will enable or disable usage of 1:4 and 4:1 partitions. The default + * value is 1. * */ - AV1E_SET_ENABLE_DF, + AV1E_SET_ENABLE_1TO4_PARTITIONS, + + /*!\brief Codec control function to set min partition size. + * + * This will set min partition size. The default value is 4 for 4x4. 
+ * valid values are [4, 8, 16, 32, 64, 128] + * min_partition_size is applied to both width and height of the partition. + * i.e, both width and height of a partition can not be smaller than + * the min_partition_size, except the partition at the picture boundary. + * + */ + AV1E_SET_MIN_PARTITION_SIZE, + + /*!\brief Codec control function to set max partition size. + * + * This will set max partition size. The default value is 128 for 128x128. + * valid values are [4, 8, 16, 32, 64, 128] + * max_partition_size is applied to both width and height of the partition. + * i.e, both width and height of a partition can not be larger than + * the max_partition_size. + */ + AV1E_SET_MAX_PARTITION_SIZE, + + /*!\brief Codec control function to turn on / off intra edge filter + * at sequence level. + * + * This will enable or disable usage of intra-edge filtering. The default + * value is 1. + * + */ + AV1E_SET_ENABLE_INTRA_EDGE_FILTER, /*!\brief Codec control function to turn on / off frame order hint for a * few tools: @@ -720,14 +786,42 @@ enum aome_enc_control_id { */ AV1E_SET_ENABLE_ORDER_HINT, - /*!\brief Codec control function to turn on / off joint compound mode + /*!\brief Codec control function to turn on / off 64-length transforms. + * + * This will enable or disable usage of length 64 transforms in any + * direction. The default value is 1. + * + */ + AV1E_SET_ENABLE_TX64, + + /*!\brief Codec control function to turn on / off flip and identity + * transforms. + * + * This will enable or disable usage of flip and identity transform + * types in any direction. The default value is 1. Including: + * FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, ADST_FLIPADST, + * FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST, + * H_FLIPADST + */ + AV1E_SET_ENABLE_FLIP_IDTX, + + /*!\brief Codec control function to set transform block size search method. + * + * This will set the transform block size search method. 
+ * 0: use Full RD search, 1: use Fast RD search, 2: always use largest + * allowed transform block size based on partition size. + */ + AV1E_SET_TX_SIZE_SEARCH_METHOD, + + /*!\brief Codec control function to turn on / off dist-wtd compound mode * at sequence level. * - * This will enable or disable joint compound mode. The default value is 1. - * If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced to 0. + * This will enable or disable distance-weighted compound mode. The default + * value is 1. If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced + * to 0. * */ - AV1E_SET_ENABLE_JNT_COMP, + AV1E_SET_ENABLE_DIST_WTD_COMP, /*!\brief Codec control function to turn on / off ref frame mvs (mfmv) usage * at sequence level. @@ -747,6 +841,86 @@ enum aome_enc_control_id { */ AV1E_SET_ALLOW_REF_FRAME_MVS, + /*!\brief Codec control function to turn on / off dual filter usage + * for a sequence. + * + * This will enable or disable use of dual interpolation filter. + * The default value is 1. + * + */ + AV1E_SET_ENABLE_DUAL_FILTER, + + /*!\brief Codec control function to turn on / off masked compound usage + * for a sequence. + * + * This will enable or disable usage of wedge and diff-wtd compound + * modes. The default value is 1. + * + */ + AV1E_SET_ENABLE_MASKED_COMP, + + /*!\brief Codec control function to turn on / off one sided compound usage + * for a sequence. + * + * This will enable or disable usage of one sided compound + * modes. The default value is 1. + * + */ + AV1E_SET_ENABLE_ONESIDED_COMP, + + /*!\brief Codec control function to turn on / off interintra compound + * for a sequence. + * + * This will enable or disable usage of inter-intra compound modes. + * The default value is 1. + * + */ + AV1E_SET_ENABLE_INTERINTRA_COMP, + + /*!\brief Codec control function to turn on / off smooth inter-intra + * mode for a sequence. + * + * This will enable or disable usage of smooth inter-intra mode. + * The default value is 1. 
+ * + */ + AV1E_SET_ENABLE_SMOOTH_INTERINTRA, + + /*!\brief Codec control function to turn on / off difference weighted + * compound. + * + * This will enable or disable usage of difference weighted compound. + * The default value is 1. + * + */ + AV1E_SET_ENABLE_DIFF_WTD_COMP, + + /*!\brief Codec control function to turn on / off interinter wedge + * compound. + * + * This will enable or disable usage of interinter wedge compound. + * The default value is 1. + * + */ + AV1E_SET_ENABLE_INTERINTER_WEDGE, + + /*!\brief Codec control function to turn on / off interintra wedge + * compound. + * + * This will enable or disable usage of interintra wedge compound. + * The default value is 1. + * + */ + AV1E_SET_ENABLE_INTERINTRA_WEDGE, + + /*!\brief Codec control function to turn on / off global motion usage + * for a sequence. + * + * This will enable or disable usage of global motion. The default value is 1. + * + */ + AV1E_SET_ENABLE_GLOBAL_MOTION, + /*!\brief Codec control function to turn on / off warped motion usage * at sequence level. * @@ -764,6 +938,39 @@ enum aome_enc_control_id { */ AV1E_SET_ALLOW_WARPED_MOTION, + /*!\brief Codec control function to turn on / off filter intra usage at + * sequence level. + * + * This will enable or disable usage of filter intra. The default value is 1. + * If AV1E_SET_ENABLE_FILTER_INTRA is 0, then this flag is forced to 0. + * + */ + AV1E_SET_ENABLE_FILTER_INTRA, + + /*!\brief Codec control function to turn on / off smooth intra modes usage. + * + * This will enable or disable usage of smooth, smooth_h and smooth_v intra + * modes. The default value is 1. + * + */ + AV1E_SET_ENABLE_SMOOTH_INTRA, + + /*!\brief Codec control function to turn on / off Paeth intra mode usage. + * + * This will enable or disable usage of Paeth intra mode. The default value + * is 1. + * + */ + AV1E_SET_ENABLE_PAETH_INTRA, + + /*!\brief Codec control function to turn on / off CFL uv intra mode usage. 
+ * + * This will enable or disable usage of chroma-from-luma intra mode. The + * default value is 1. + * + */ + AV1E_SET_ENABLE_CFL_INTRA, + /*!\brief Codec control function to turn on / off frame superresolution. * * This will enable or disable frame superresolution. The default value is 1 @@ -771,6 +978,15 @@ enum aome_enc_control_id { */ AV1E_SET_ENABLE_SUPERRES, + /*!\brief Codec control function to turn on/off palette mode */ + AV1E_SET_ENABLE_PALETTE, + + /*!\brief Codec control function to turn on/off intra block copy mode */ + AV1E_SET_ENABLE_INTRABC, + + /*!\brief Codec control function to turn on/off intra angle delta */ + AV1E_SET_ENABLE_ANGLE_DELTA, + /*!\brief Codec control function to set the delta q mode * * AV1 has a segment based feature that allows encoder to adaptively change @@ -828,6 +1044,54 @@ enum aome_enc_control_id { /*!\brief Sets the chroma subsampling y value */ AV1E_SET_CHROMA_SUBSAMPLING_Y, + + /*!\brief Control to use a reduced tx type set */ + AV1E_SET_REDUCED_TX_TYPE_SET, + + /*!\brief Control to use dct only for intra modes */ + AV1E_SET_INTRA_DCT_ONLY, + + /*!\brief Control to use dct only for inter modes */ + AV1E_SET_INTER_DCT_ONLY, + + /*!\brief Control to use default tx type only for intra modes */ + AV1E_SET_INTRA_DEFAULT_TX_ONLY, + + /*!\brief Control to use adaptive quantize_b */ + AV1E_SET_QUANT_B_ADAPT, + + /*!\brief Control to select maximum height for the GF group pyramid structure + * (valid values: 0 - 4) */ + AV1E_SET_GF_MAX_PYRAMID_HEIGHT, + + /*!\brief Control to select maximum reference frames allowed per frame + * (valid values: 3 - 7) */ + AV1E_SET_MAX_REFERENCE_FRAMES, + + /*!\brief Control to use reduced set of single and compound references. 
*/ + AV1E_SET_REDUCED_REFERENCE_SET, + + /*!\brief Control to set frequency of the cost updates for coefficients + * Possible values are: + * 0: Update at SB level (default) + * 1: Update at SB row level in tile + * 2: Update at tile level + */ + AV1E_SET_COEFF_COST_UPD_FREQ, + + /*!\brief Control to set frequency of the cost updates for mode + * Possible values are: + * 0: Update at SB level (default) + * 1: Update at SB row level in tile + * 2: Update at tile level + */ + AV1E_SET_MODE_COST_UPD_FREQ, + + /*!\brief Control to set bit mask that specifies which tier each of the 32 + * possible operating points conforms to. + * Bit value 0: Main Tier; 1: High Tier. + */ + AV1E_SET_TIER_MASK, }; /*!\brief aom 1-D scaling mode @@ -934,13 +1198,11 @@ AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *) AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *) #define AOM_CTRL_AOME_SET_SCALEMODE -AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, int) +AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, unsigned int) #define AOM_CTRL_AOME_SET_SPATIAL_LAYER_ID AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int) #define AOM_CTRL_AOME_SET_CPUUSED -AOM_CTRL_USE_TYPE(AOME_SET_DEVSF, int) -#define AOM_CTRL_AOME_SET_DEVSF AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int) #define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF @@ -961,12 +1223,12 @@ AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */ AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int) #define AOM_CTRL_AOME_SET_CQ_LEVEL -AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, int) +AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, unsigned int) #define AOM_CTRL_AV1E_SET_ROW_MT -AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, int) +AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, unsigned int) #define AOM_CTRL_AV1E_SET_TILE_COLUMNS -AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, int) +AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, unsigned int) #define AOM_CTRL_AV1E_SET_TILE_ROWS AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TPL_MODEL, unsigned int) @@ -997,6 +1259,9 @@ 
AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CDEF, unsigned int) AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RESTORATION, unsigned int) #define AOM_CTRL_AV1E_SET_ENABLE_RESTORATION +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_OBMC, unsigned int) +#define AOM_CTRL_AV1E_SET_ENABLE_OBMC + AOM_CTRL_USE_TYPE(AV1E_SET_DISABLE_TRELLIS_QUANT, unsigned int) #define AOM_CTRL_AV1E_SET_DISABLE_TRELLIS_QUANT @@ -1029,37 +1294,109 @@ AOM_CTRL_USE_TYPE(AV1E_SET_MTU, unsigned int) AOM_CTRL_USE_TYPE(AV1E_SET_TIMING_INFO_TYPE, int) /* aom_timing_info_type_t */ #define AOM_CTRL_AV1E_SET_TIMING_INFO_TYPE -AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DF, unsigned int) -#define AOM_CTRL_AV1E_SET_ENABLE_DF +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RECT_PARTITIONS, int) +#define AOM_CTRL_AV1E_SET_ENABLE_RECT_PARTITIONS -AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ORDER_HINT, unsigned int) +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_AB_PARTITIONS, int) +#define AOM_CTRL_AV1E_SET_ENABLE_AB_PARTITIONS + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_1TO4_PARTITIONS, int) +#define AOM_CTRL_AV1E_SET_ENABLE_1TO4_PARTITIONS + +AOM_CTRL_USE_TYPE(AV1E_SET_MIN_PARTITION_SIZE, int) +#define AOM_CTRL_AV1E_SET_MIN_PARTITION_SIZE + +AOM_CTRL_USE_TYPE(AV1E_SET_MAX_PARTITION_SIZE, int) +#define AOM_CTRL_AV1E_SET_MAX_PARTITION_SIZE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTRA_EDGE_FILTER + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ORDER_HINT, int) #define AOM_CTRL_AV1E_SET_ENABLE_ORDER_HINT -AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_JNT_COMP, unsigned int) -#define AOM_CTRL_AV1E_SET_ENABLE_JNT_COMP +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TX64, int) +#define AOM_CTRL_AV1E_SET_ENABLE_TX64 -AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_REF_FRAME_MVS, unsigned int) +AOM_CTRL_USE_TYPE(AV1E_SET_TX_SIZE_SEARCH_METHOD, int) +#define AOM_CTRL_AV1E_SET_TXSIZE_SEARCH_METHOD + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FLIP_IDTX, int) +#define AOM_CTRL_AV1E_SET_ENABLE_FLIP_IDTX + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_WTD_COMP, int) +#define 
AOM_CTRL_AV1E_SET_ENABLE_DIST_WTD_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_REF_FRAME_MVS, int) #define AOM_CTRL_AV1E_SET_ENABLE_REF_FRAME_MVS -AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_REF_FRAME_MVS, unsigned int) +AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_REF_FRAME_MVS, int) #define AOM_CTRL_AV1E_SET_ALLOW_REF_FRAME_MVS -AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_WARPED_MOTION, unsigned int) +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DUAL_FILTER, int) +#define AOM_CTRL_AV1E_SET_ENABLE_DUAL_FILTER + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_MASKED_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_MASKED_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ONESIDED_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_ONESIDED_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTERINTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIFF_WTD_COMP, int) +#define AOM_CTRL_AV1E_SET_ENABLE_DIFF_WTD_COMP + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTER_WEDGE, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTER_WEDGE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_WEDGE, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_WEDGE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_GLOBAL_MOTION, int) +#define AOM_CTRL_AV1E_SET_ENABLE_GLOBAL_MOTION + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_WARPED_MOTION, int) #define AOM_CTRL_AV1E_SET_ENABLE_WARPED_MOTION -AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_WARPED_MOTION, unsigned int) +AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_WARPED_MOTION, int) #define AOM_CTRL_AV1E_SET_ALLOW_WARPED_MOTION -AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, unsigned int) +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FILTER_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_FILTER_INTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PAETH_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_PAETH_INTRA + 
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CFL_INTRA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_CFL_INTRA + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, int) #define AOM_CTRL_AV1E_SET_ENABLE_SUPERRES +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PALETTE, int) +#define AOM_CTRL_AV1E_SET_ENABLE_PALETTE + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRABC, int) +#define AOM_CTRL_AV1E_SET_ENABLE_INTRABC + +AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ANGLE_DELTA, int) +#define AOM_CTRL_AV1E_SET_ENABLE_ANGLE_DELTA + AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int) #define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING -AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, unsigned int) +AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, int) #define AOM_CTRL_AV1E_SET_ERROR_RESILIENT_MODE -AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, unsigned int) +AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, int) #define AOM_CTRL_AV1E_SET_S_FRAME_MODE AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int) @@ -1107,14 +1444,8 @@ AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *) AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int) #define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE -AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_LEVEL, unsigned int) -#define AOM_CTRL_AV1E_SET_TARGET_LEVEL - -AOM_CTRL_USE_TYPE(AV1E_GET_LEVEL, int *) -#define AOM_CTRL_AV1E_GET_LEVEL - -AOM_CTRL_USE_TYPE(AV1E_SET_ANS_WINDOW_SIZE_LOG2, unsigned int) -#define AOM_CTRL_AV1E_SET_ANS_WINDOW_SIZE_LOG2 +AOM_CTRL_USE_TYPE(AV1E_GET_SEQ_LEVEL_IDX, int *) +#define AOM_CTRL_AV1E_GET_SEQ_LEVEL_IDX AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int) #define AOM_CTRL_AV1E_SET_SINGLE_TILE_DECODING @@ -1122,13 +1453,13 @@ AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int) AOM_CTRL_USE_TYPE(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) #define AOM_CTRL_AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST -AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, unsigned int) +AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, int) #define AOM_CTRL_AV1E_SET_FILM_GRAIN_TEST_VECTOR 
AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *) #define AOM_CTRL_AV1E_SET_FILM_GRAIN_TABLE -AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, int) +AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, unsigned int) #define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE #ifdef CONFIG_DENOISE @@ -1145,6 +1476,42 @@ AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_X, unsigned int) AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_Y, unsigned int) #define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_Y +AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_TX_TYPE_SET, int) +#define AOM_CTRL_AV1E_SET_REDUCED_TX_TYPE_SET + +AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DCT_ONLY, int) +#define AOM_CTRL_AV1E_SET_INTRA_DCT_ONLY + +AOM_CTRL_USE_TYPE(AV1E_SET_INTER_DCT_ONLY, int) +#define AOM_CTRL_AV1E_SET_INTER_DCT_ONLY + +AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DEFAULT_TX_ONLY, int) +#define AOM_CTRL_AV1E_SET_INTRA_DEFAULT_TX_ONLY + +AOM_CTRL_USE_TYPE(AV1E_SET_QUANT_B_ADAPT, int) +#define AOM_CTRL_AV1E_SET_QUANT_B_ADAPT + +AOM_CTRL_USE_TYPE(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, unsigned int) +#define AOM_CTRL_AV1E_SET_GF_MAX_PYRAMID_HEIGHT + +AOM_CTRL_USE_TYPE(AV1E_SET_MAX_REFERENCE_FRAMES, int) +#define AOM_CTRL_AV1E_SET_MAX_REFERENCE_FRAMES + +AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_REFERENCE_SET, int) +#define AOM_CTRL_AV1E_SET_REDUCED_REFERENCE_SET + +AOM_CTRL_USE_TYPE(AV1E_SET_COEFF_COST_UPD_FREQ, unsigned int) +#define AOM_CTRL_AV1E_SET_COEFF_COST_UPD_FREQ + +AOM_CTRL_USE_TYPE(AV1E_SET_MODE_COST_UPD_FREQ, unsigned int) +#define AOM_CTRL_AV1E_SET_MODE_COST_UPD_FREQ + +AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_SEQ_LEVEL_IDX, int) +#define AOM_CTRL_AV1E_SET_TARGET_SEQ_LEVEL_IDX + +AOM_CTRL_USE_TYPE(AV1E_SET_TIER_MASK, unsigned int) +#define AOM_CTRL_AV1E_SET_TIER_MASK + /*!\endcond */ /*! 
@} - end defgroup aom_encoder */ #ifdef __cplusplus diff --git a/libaom/aom_dsp/add_noise.c b/libaom/aom_dsp/add_noise.c index bfb3e7e..43587ca 100644 --- a/libaom/aom_dsp/add_noise.c +++ b/libaom/aom_dsp/add_noise.c @@ -40,7 +40,7 @@ void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16], } static double gaussian(double sigma, double mu, double x) { - return 1 / (sigma * sqrt(2.0 * 3.14159265)) * + return 1 / (sigma * sqrt(2.0 * PI)) * (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); } diff --git a/libaom/aom_dsp/aom_dsp.cmake b/libaom/aom_dsp/aom_dsp.cmake index a8490c4..abf6a60 100644 --- a/libaom/aom_dsp/aom_dsp.cmake +++ b/libaom/aom_dsp/aom_dsp.cmake @@ -194,6 +194,7 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c" "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c" "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c" + "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c" "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h" "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c" "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c") @@ -226,6 +227,7 @@ if(CONFIG_AV1_ENCODER) "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h" "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c" + "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c" "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c") @@ -361,6 +363,8 @@ function(setup_aom_dsp_targets) endif() endif() + target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>) + # Pass the new lib targets up to the parent scope instance of # $AOM_LIB_TARGETS. 
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) diff --git a/libaom/aom_dsp/aom_dsp_rtcd_defs.pl b/libaom/aom_dsp/aom_dsp_rtcd_defs.pl index 59d0620..f56a117 100755 --- a/libaom/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/libaom/aom_dsp/aom_dsp_rtcd_defs.pl @@ -466,10 +466,6 @@ specialize qw/aom_highbd_lpf_horizontal_4 sse2/; add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/; -# Helper functions. -add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit"; -specialize "av1_round_shift_array", qw/sse4_1 neon/; - # # Encoder functions. # @@ -522,10 +518,17 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; + add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_adaptive sse2/; + add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; 
specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; + add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_32x32_adaptive sse2/; + add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/aom_quantize_b_64x64 ssse3/; } # CONFIG_AV1_ENCODER if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { @@ -536,7 +539,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_highbd_quantize_b_32x32 sse2/; add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - + specialize qw/aom_highbd_quantize_b_64x64 sse2/; } # CONFIG_AV1_ENCODER # @@ -596,7 +599,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { ($w, $h) = @$_; add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; - add_proto qw/unsigned int/, "aom_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const 
uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param"; + add_proto qw/unsigned int/, "aom_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param"; } specialize qw/aom_sad128x128 avx2 sse2/; @@ -647,29 +650,29 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_sad16x64_avg sse2/; specialize qw/aom_sad64x16_avg sse2/; - specialize qw/aom_jnt_sad128x128_avg ssse3/; - specialize qw/aom_jnt_sad128x64_avg ssse3/; - specialize qw/aom_jnt_sad64x128_avg ssse3/; - specialize qw/aom_jnt_sad64x64_avg ssse3/; - specialize qw/aom_jnt_sad64x32_avg ssse3/; - specialize qw/aom_jnt_sad32x64_avg ssse3/; - specialize qw/aom_jnt_sad32x32_avg ssse3/; - specialize qw/aom_jnt_sad32x16_avg ssse3/; - specialize qw/aom_jnt_sad16x32_avg ssse3/; - specialize qw/aom_jnt_sad16x16_avg ssse3/; - specialize qw/aom_jnt_sad16x8_avg ssse3/; - specialize qw/aom_jnt_sad8x16_avg ssse3/; - specialize qw/aom_jnt_sad8x8_avg ssse3/; - specialize qw/aom_jnt_sad8x4_avg ssse3/; - specialize qw/aom_jnt_sad4x8_avg ssse3/; - specialize qw/aom_jnt_sad4x4_avg ssse3/; - - specialize qw/aom_jnt_sad4x16_avg ssse3/; - specialize qw/aom_jnt_sad16x4_avg ssse3/; - specialize qw/aom_jnt_sad8x32_avg ssse3/; - specialize qw/aom_jnt_sad32x8_avg ssse3/; - specialize qw/aom_jnt_sad16x64_avg ssse3/; - specialize qw/aom_jnt_sad64x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad128x128_avg ssse3/; + specialize qw/aom_dist_wtd_sad128x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad64x128_avg ssse3/; + specialize qw/aom_dist_wtd_sad64x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad64x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x8_avg ssse3/; + 
specialize qw/aom_dist_wtd_sad8x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad8x8_avg ssse3/; + specialize qw/aom_dist_wtd_sad8x4_avg ssse3/; + specialize qw/aom_dist_wtd_sad4x8_avg ssse3/; + specialize qw/aom_dist_wtd_sad4x4_avg ssse3/; + + specialize qw/aom_dist_wtd_sad4x16_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x4_avg ssse3/; + specialize qw/aom_dist_wtd_sad8x32_avg ssse3/; + specialize qw/aom_dist_wtd_sad32x8_avg ssse3/; + specialize qw/aom_dist_wtd_sad16x64_avg ssse3/; + specialize qw/aom_dist_wtd_sad64x16_avg ssse3/; add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height"; @@ -694,7 +697,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize "aom_highbd_sad${w}x${h}", qw/sse2/; specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/; } - add_proto qw/unsigned int/, "aom_highbd_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param"; + add_proto qw/unsigned int/, "aom_highbd_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param"; } specialize qw/aom_highbd_sad128x128 avx2/; specialize qw/aom_highbd_sad128x64 avx2/; @@ -839,6 +842,30 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_highbd_sad64x16x4d sse2/; # + # Avg + # + add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p"; + specialize qw/aom_avg_8x8 sse2/; + + add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p"; + specialize qw/aom_avg_4x4 sse2/; + + add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; + specialize qw/aom_minmax_8x8 sse2/; + + add_proto qw/void 
aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height"; + # TODO(kyslov@) bring back SSE2 by extending it to 128 block size + #specialize qw/aom_int_pro_row sse2/; + + add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width"; + # TODO(kyslov@) bring back SSE2 by extending it to 128 block size + #specialize qw/aom_int_pro_col sse2/; + + add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl"; + # TODO(kyslov@) bring back SSE2 by extending it to 128 block size + #specialize qw/aom_vector_var sse2/; + + # # hamadard transform and satd for implmenting temporal dependency model # add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; @@ -919,11 +946,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { int ref_stride, int subpel_search"; specialize qw/aom_comp_avg_upsampled_pred sse2/; - add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search"; - specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/; + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search"; + specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/; add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, @@ -942,11 +969,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search"; specialize 
qw/aom_highbd_comp_avg_upsampled_pred sse2/; - add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, + add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, int subpel_search"; - specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/; + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search"; + specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/; # @@ -972,7 +999,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param"; + add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param"; } specialize qw/aom_variance128x128 sse2 avx2 /; specialize qw/aom_variance128x64 sse2 
avx2 /; @@ -1044,30 +1071,30 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/; specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance64x64 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance64x32 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance32x64 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance32x32 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance32x16 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance16x32 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance16x16 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance16x8 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance8x16 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance8x8 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance8x4 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance4x8 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance4x4 ssse3/; - - specialize qw/aom_jnt_sub_pixel_avg_variance4x16 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance16x4 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance8x32 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance32x8 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance16x64 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance64x16 ssse3/; - - specialize qw/aom_jnt_sub_pixel_avg_variance128x128 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance128x64 ssse3/; - specialize qw/aom_jnt_sub_pixel_avg_variance64x128 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 ssse3/; + specialize 
qw/aom_dist_wtd_sub_pixel_avg_variance16x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4 ssse3/; + + specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/; + + specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64 ssse3/; + specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128 ssse3/; foreach $bd (8, 10, 12) { @@ -1099,7 +1126,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; } - add_proto qw/uint32_t/, "aom_highbd_${bd}_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param"; + add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param"; } } @@ -1188,8 +1215,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { # add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; - add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int 
height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; - specialize qw/aom_jnt_comp_avg_pred ssse3/; + add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param"; + specialize qw/aom_dist_wtd_comp_avg_pred ssse3/; add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/aom_highbd_12_variance128x128 sse2/; @@ -1355,12 +1382,21 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; - add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param"; - specialize qw/aom_highbd_jnt_comp_avg_pred sse2/; + add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param"; + specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/; # # Subpixel Variance # + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/; + + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/; + add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/; @@ -1397,6 +1433,15 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2/; + + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2/; + add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/; @@ -1433,6 +1478,15 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, 
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/; + + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/; + add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/; diff --git a/libaom/aom_dsp/avg.c b/libaom/aom_dsp/avg.c index 4d78c9c..43d2760 100644 --- a/libaom/aom_dsp/avg.c +++ b/libaom/aom_dsp/avg.c @@ -14,6 +14,40 @@ #include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" +void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + int i, j; + *min = 255; + *max = 0; + for (i = 0; i < 8; ++i, s += p, d += dp) { + for (j = 0; j < 8; ++j) { + int diff = abs(s[j] - d[j]); + *min = diff < *min ? diff : *min; + *max = diff > *max ? 
diff : *max; + } + } +} + +unsigned int aom_avg_4x4_c(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 4; ++i, s += p) + for (j = 0; j < 4; sum += s[j], ++j) { + } + + return (sum + 8) >> 4; +} + +unsigned int aom_avg_8x8_c(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 8; ++i, s += p) + for (j = 0; j < 8; sum += s[j], ++j) { + } + + return (sum + 32) >> 6; +} + // src_diff: first pass, 9 bit, dynamic range [-255, 255] // second pass, 12 bit, dynamic range [-2040, 2040] static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, @@ -146,3 +180,48 @@ int aom_satd_c(const tran_low_t *coeff, int length) { // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] return satd; } + +// Integer projection onto row vectors. +// height: value range {16, 32, 64, 128}. +void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, + const int ref_stride, const int height) { + int idx; + const int norm_factor = height >> 1; + for (idx = 0; idx < 16; ++idx) { + int i; + hbuf[idx] = 0; + // hbuf[idx]: 14 bit, dynamic range [0, 32640]. + for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride]; + // hbuf[idx]: 9 bit, dynamic range [0, 1020]. + hbuf[idx] /= norm_factor; + ++ref; + } +} + +// width: value range {16, 32, 64, 128}. +int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) { + int idx; + int16_t sum = 0; + // sum: 14 bit, dynamic range [0, 32640] + for (idx = 0; idx < width; ++idx) sum += ref[idx]; + return sum; +} + +// ref: [0 - 510] +// src: [0 - 510] +// bwl: {2, 3, 4, 5} +int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) { + int i; + int width = 4 << bwl; + int sse = 0, mean = 0, var; + + for (i = 0; i < width; ++i) { + int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits. + mean += diff; // mean: dynamic range 16 bits. + sse += diff * diff; // sse: dynamic range 26 bits. + } + + // (mean * mean): dynamic range 31 bits. 
+ var = sse - ((mean * mean) >> (bwl + 2)); + return var; +} diff --git a/libaom/aom_dsp/bitreader_buffer.c b/libaom/aom_dsp/bitreader_buffer.c index 984b217..d79feea 100644 --- a/libaom/aom_dsp/bitreader_buffer.c +++ b/libaom/aom_dsp/bitreader_buffer.c @@ -60,9 +60,9 @@ int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) { uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) { int leading_zeros = 0; - while (!aom_rb_read_bit(rb)) ++leading_zeros; + while (leading_zeros < 32 && !aom_rb_read_bit(rb)) ++leading_zeros; // Maximum 32 bits. - if (leading_zeros >= 32) return UINT32_MAX; + if (leading_zeros == 32) return UINT32_MAX; const uint32_t base = (1u << leading_zeros) - 1; const uint32_t value = aom_rb_read_literal(rb, leading_zeros); return base + value; diff --git a/libaom/aom_dsp/grain_synthesis.c b/libaom/aom_dsp/grain_synthesis.c index b96e1c3..4b94dbc 100644 --- a/libaom/aom_dsp/grain_synthesis.c +++ b/libaom/aom_dsp/grain_synthesis.c @@ -232,7 +232,6 @@ static int scaling_lut_y[256]; static int scaling_lut_cb[256]; static int scaling_lut_cr[256]; -static int grain_center; static int grain_min; static int grain_max; @@ -1077,7 +1076,7 @@ int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma, int overlap = params->overlap_flag; int bit_depth = params->bit_depth; - grain_center = 128 << (bit_depth - 8); + const int grain_center = 128 << (bit_depth - 8); grain_min = 0 - grain_center; grain_max = (256 << (bit_depth - 8)) - 1 - grain_center; diff --git a/libaom/aom_dsp/grain_synthesis.h b/libaom/aom_dsp/grain_synthesis.h index 7aee6f6..9155b39 100644 --- a/libaom/aom_dsp/grain_synthesis.h +++ b/libaom/aom_dsp/grain_synthesis.h @@ -20,6 +20,8 @@ extern "C" { #endif +#include <string.h> + #include "aom_dsp/aom_dsp_common.h" #include "aom/aom_image.h" @@ -28,6 +30,9 @@ extern "C" { * This structure contains input parameters for film grain synthesis */ typedef struct { + // This structure is compared element-by-element 
in the function + // av1_check_grain_params_equiv: this function must be updated if any changes + // are made to this structure. int apply_grain; int update_parameters; @@ -79,8 +84,73 @@ typedef struct { int grain_scale_shift; uint16_t random_seed; + // This structure is compared element-by-element in the function + // av1_check_grain_params_equiv: this function must be updated if any changes + // are made to this structure. } aom_film_grain_t; +/*!\brief Check if two film grain parameters structs are equivalent + * + * Check if two film grain parameters are equal, except for the + * update_parameters and random_seed elements which are ignored. + * + * \param[in] pa The first set of parameters to compare + * \param[in] pb The second set of parameters to compare + * \return Returns 1 if the params are equivalent, 0 otherwise + */ +static INLINE int av1_check_grain_params_equiv( + const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) { + if (pa->apply_grain != pb->apply_grain) return 0; + // Don't compare update_parameters + + if (pa->num_y_points != pb->num_y_points) return 0; + if (memcmp(pa->scaling_points_y, pb->scaling_points_y, + pa->num_y_points * 2 * sizeof(*pa->scaling_points_y)) != 0) + return 0; + + if (pa->num_cb_points != pb->num_cb_points) return 0; + if (memcmp(pa->scaling_points_cb, pb->scaling_points_cb, + pa->num_cb_points * 2 * sizeof(*pa->scaling_points_cb)) != 0) + return 0; + + if (pa->num_cr_points != pb->num_cr_points) return 0; + if (memcmp(pa->scaling_points_cr, pb->scaling_points_cr, + pa->num_cr_points * 2 * sizeof(*pa->scaling_points_cr)) != 0) + return 0; + + if (pa->scaling_shift != pb->scaling_shift) return 0; + if (pa->ar_coeff_lag != pb->ar_coeff_lag) return 0; + + const int num_pos = 2 * pa->ar_coeff_lag * (pa->ar_coeff_lag + 1); + if (memcmp(pa->ar_coeffs_y, pb->ar_coeffs_y, + num_pos * sizeof(*pa->ar_coeffs_y)) != 0) + return 0; + if (memcmp(pa->ar_coeffs_cb, pb->ar_coeffs_cb, + num_pos * sizeof(*pa->ar_coeffs_cb)) 
!= 0) + return 0; + if (memcmp(pa->ar_coeffs_cr, pb->ar_coeffs_cr, + num_pos * sizeof(*pa->ar_coeffs_cr)) != 0) + return 0; + + if (pa->ar_coeff_shift != pb->ar_coeff_shift) return 0; + + if (pa->cb_mult != pb->cb_mult) return 0; + if (pa->cb_luma_mult != pb->cb_luma_mult) return 0; + if (pa->cb_offset != pb->cb_offset) return 0; + + if (pa->cr_mult != pb->cr_mult) return 0; + if (pa->cr_luma_mult != pb->cr_luma_mult) return 0; + if (pa->cr_offset != pb->cr_offset) return 0; + + if (pa->overlap_flag != pb->overlap_flag) return 0; + if (pa->clip_to_restricted_range != pb->clip_to_restricted_range) return 0; + if (pa->bit_depth != pb->bit_depth) return 0; + if (pa->chroma_scaling_from_luma != pb->chroma_scaling_from_luma) return 0; + if (pa->grain_scale_shift != pb->grain_scale_shift) return 0; + + return 1; +} + /*!\brief Add film grain * * Add film grain to an image diff --git a/libaom/aom_dsp/noise_model.h b/libaom/aom_dsp/noise_model.h index 049d5be..5e7de9b 100644 --- a/libaom/aom_dsp/noise_model.h +++ b/libaom/aom_dsp/noise_model.h @@ -158,10 +158,10 @@ int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder, int stride, uint8_t *flat_blocks); // The noise shape indicates the allowed coefficients in the AR model. -typedef enum { +enum { AOM_NOISE_SHAPE_DIAMOND = 0, AOM_NOISE_SHAPE_SQUARE = 1 -} aom_noise_shape; +} UENUM1BYTE(aom_noise_shape); // The parameters of the noise model include the shape type, lag, the // bit depth of the input images provided, and whether the input images @@ -202,13 +202,13 @@ typedef struct { } aom_noise_model_t; /*!\brief Result of a noise model update. */ -typedef enum { +enum { AOM_NOISE_STATUS_OK = 0, AOM_NOISE_STATUS_INVALID_ARGUMENT, AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS, AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE, AOM_NOISE_STATUS_INTERNAL_ERROR, -} aom_noise_status_t; +} UENUM1BYTE(aom_noise_status_t); /*!\brief Initializes a noise model with the given parameters. 
* diff --git a/libaom/aom_dsp/prob.h b/libaom/aom_dsp/prob.h index d003a98..20ffdea 100644 --- a/libaom/aom_dsp/prob.h +++ b/libaom/aom_dsp/prob.h @@ -641,7 +641,7 @@ static INLINE uint8_t get_prob(unsigned int num, unsigned int den) { } } -static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) { +static INLINE void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) { int rate; int i, tmp; diff --git a/libaom/aom_dsp/quantize.c b/libaom/aom_dsp/quantize.c index 62dbd86..ced34b4 100644 --- a/libaom/aom_dsp/quantize.c +++ b/libaom/aom_dsp/quantize.c @@ -11,6 +11,98 @@ #include "aom_dsp/quantize.h" #include "aom_mem/aom_mem.h" +#include "av1/encoder/av1_quantize.h" + +void quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int i, non_zero_count = (int)n_coeffs, eob = -1; + (void)iscan; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + int prescan_add[2]; + for (i = 0; i < 2; ++i) + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? 
qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int prescan_add_val = prescan_add[rc != 0]; + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif // SKIP_EOB_FACTOR_ADJUST + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int tmp32; + + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) { + int64_t tmp = + clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale), + INT16_MIN, INT16_MAX); + tmp *= wt; + tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); // quantization + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> + AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + + if (tmp32) { + eob = i; +#if SKIP_EOB_FACTOR_ADJUST + if (first == -1) first = i; +#endif // SKIP_EOB_FACTOR_ADJUST + } + } + } +#if SKIP_EOB_FACTOR_ADJUST + if (eob >= 0 && first == eob) { + const int rc = scan[eob]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const qm_val_t wt = qm_ptr != NULL ? 
qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + eob = -1; + } + } + } +#endif // SKIP_EOB_FACTOR_ADJUST + *eob_ptr = eob + 1; +} void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -74,6 +166,94 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob + 1; } +void highbd_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale) { + int i, eob = -1; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + int dequant; + int idx_arr[4096]; + (void)iscan; + int idx = 0; + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + int prescan_add[2]; + for (i = 0; i < 2; ++i) + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. 
+ const int prescan_add_val = prescan_add[rc != 0]; + if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif // SKIP_EOB_FACTOR_ADJUST + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS); + const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + const int64_t tmpw = tmp1 * wt; + const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> + (16 - log_scale + AOM_QM_BITS)); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dequant = + (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS; + const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale; + dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign); + if (abs_qcoeff) { + eob = idx_arr[i]; +#if SKIP_EOB_FACTOR_ADJUST + if (first == -1) first = eob; +#endif // SKIP_EOB_FACTOR_ADJUST + } + } +#if SKIP_EOB_FACTOR_ADJUST + if (eob >= 0 && first == eob) { + const int rc = scan[eob]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const qm_val_t wt = qm_ptr != NULL ? 
qm_ptr[rc] : (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) && + coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + eob = -1; + } + } + } +#endif // SKIP_EOB_FACTOR_ADJUST + *eob_ptr = eob + 1; +} + void highbd_quantize_b_helper_c( const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, @@ -133,6 +313,80 @@ void highbd_quantize_b_helper_c( /* These functions should only be called when quantisation matrices are not used. */ +void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 0); +} + +void aom_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 1); +} + +void aom_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, 
const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, + NULL, NULL, 2); +} + +void aom_highbd_quantize_b_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, NULL, NULL, 0); +} + +void aom_highbd_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan, NULL, NULL, 1); +} + +void aom_highbd_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, + quant_ptr, quant_shift_ptr, qcoeff_ptr, + dqcoeff_ptr, dequant_ptr, eob_ptr, 
scan, + iscan, NULL, NULL, 2); +} + void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, diff --git a/libaom/aom_dsp/quantize.h b/libaom/aom_dsp/quantize.h index c55ab23..43c30ee 100644 --- a/libaom/aom_dsp/quantize.h +++ b/libaom/aom_dsp/quantize.h @@ -20,6 +20,66 @@ extern "C" { #endif +void quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void highbd_quantize_b_adaptive_helper_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const 
int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, + const qm_val_t *iqm_ptr, const int log_scale); + +void aom_highbd_quantize_b_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_highbd_quantize_b_32x32_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + +void aom_highbd_quantize_b_64x64_adaptive_c( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan); + void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, diff --git a/libaom/aom_dsp/sad.c b/libaom/aom_dsp/sad.c index 252e0e1..9169e78 100644 --- a/libaom/aom_dsp/sad.c +++ b/libaom/aom_dsp/sad.c @@ -54,12 +54,12 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \ return sad(src, src_stride, comp_pred, m, m, n); \ } \ - unsigned int aom_jnt_sad##m##x##n##_avg_c( \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_c( \ const uint8_t *src, int 
src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ uint8_t comp_pred[m * n]; \ - aom_jnt_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride, \ - jcp_param); \ + aom_dist_wtd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, \ + ref_stride, jcp_param); \ return sad(src, src_stride, comp_pred, m, m, n); \ } @@ -208,12 +208,13 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, ref, ref_stride); \ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ } \ - unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c( \ + unsigned int aom_highbd_dist_wtd_sad##m##x##n##_avg_c( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ uint16_t comp_pred[m * n]; \ - aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, \ - m, n, ref, ref_stride, jcp_param); \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), \ + second_pred, m, n, ref, ref_stride, \ + jcp_param); \ return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ } diff --git a/libaom/aom_dsp/variance.c b/libaom/aom_dsp/variance.c index 0f4990e..18a33c5 100644 --- a/libaom/aom_dsp/variance.c +++ b/libaom/aom_dsp/variance.c @@ -164,40 +164,40 @@ void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b, return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ } -#define SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - 
aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ - \ - return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ - } \ - uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \ - \ - return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + \ + return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ + } \ + uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const 
DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ } /* Identical to the variance call except it takes an additional parameter, sum, @@ -291,7 +291,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { @@ -424,9 +424,10 @@ void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm, } } -void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, - int height, const uint8_t *ref, int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { +void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { int i, j; const int fwd_offset = jcp_param->fwd_offset; const int bck_offset = jcp_param->bck_offset; @@ -443,11 +444,11 @@ void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, } } -void aom_jnt_comp_avg_upsampled_pred_c( +void aom_dist_wtd_comp_avg_upsampled_pred_c( MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS 
*jcp_param, int subpel_search) { + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { int i, j; const int fwd_offset = jcp_param->fwd_offset; const int bck_offset = jcp_param->bck_offset; @@ -688,125 +689,128 @@ void aom_highbd_var_filter_block2d_bil_second_pass( dst, dst_stride, sse); \ } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t 
aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * 
W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ - } \ - \ - uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - aom_highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_highbd_var_filter_block2d_bil_second_pass( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \ - \ - return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ - dst_stride, sse); \ +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), 
second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ + \ + return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + dst, dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t 
*second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ + } \ + \ + uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *dst, int dst_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + aom_highbd_var_filter_block2d_bil_first_pass( \ + src, fdata3, src_stride, 1, H + 1, W, 
bilinear_filters_2t[xoffset]); \ + aom_highbd_var_filter_block2d_bil_second_pass( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \ + W, H, CONVERT_TO_BYTEPTR(temp2), W, \ + jcp_param); \ + \ + return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \ + dst_stride, sse); \ } /* All three forms of the variance are available in the same sizes. */ @@ -880,7 +884,7 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd, const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { @@ -1018,10 +1022,10 @@ void aom_highbd_comp_avg_upsampled_pred_c( } } -void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { +void aom_highbd_dist_wtd_comp_avg_pred_c( + uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, + const uint8_t *ref8, int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { int i, j; const int fwd_offset = jcp_param->fwd_offset; const int bck_offset = jcp_param->bck_offset; @@ -1041,11 +1045,11 @@ void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8, } } -void aom_highbd_jnt_comp_avg_upsampled_pred_c( +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { int i, j; const int fwd_offset = 
jcp_param->fwd_offset; diff --git a/libaom/aom_dsp/variance.h b/libaom/aom_dsp/variance.h index 362da29..4550c17 100644 --- a/libaom/aom_dsp/variance.h +++ b/libaom/aom_dsp/variance.h @@ -50,15 +50,14 @@ typedef unsigned int (*aom_subp_avg_variance_fn_t)( const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, int b_stride, unsigned int *sse, const uint8_t *second_pred); -typedef unsigned int (*aom_jnt_sad_avg_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *second_pred, - const JNT_COMP_PARAMS *jcp_param); +typedef unsigned int (*aom_dist_wtd_sad_avg_fn_t)( + const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param); -typedef unsigned int (*aom_jnt_subp_avg_variance_fn_t)( +typedef unsigned int (*aom_dist_wtd_subp_avg_variance_fn_t)( const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, int b_stride, unsigned int *sse, const uint8_t *second_pred, - const JNT_COMP_PARAMS *jcp_param); + const DIST_WTD_COMP_PARAMS *jcp_param); typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, @@ -101,8 +100,8 @@ typedef struct aom_variance_vtable { aom_obmc_sad_fn_t osdf; aom_obmc_variance_fn_t ovf; aom_obmc_subpixvariance_fn_t osvf; - aom_jnt_sad_avg_fn_t jsdaf; - aom_jnt_subp_avg_variance_fn_t jsvaf; + aom_dist_wtd_sad_avg_fn_t jsdaf; + aom_dist_wtd_subp_avg_variance_fn_t jsvaf; } aom_variance_fn_ptr_t; void aom_highbd_var_filter_block2d_bil_first_pass( diff --git a/libaom/aom_dsp/x86/adaptive_quantize_sse2.c b/libaom/aom_dsp/x86/adaptive_quantize_sse2.c new file mode 100644 index 0000000..3822c27 --- /dev/null +++ b/libaom/aom_dsp/x86/adaptive_quantize_sse2.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/encoder/av1_quantize.h" +#include "aom_dsp/x86/quantize_x86.h" + +void aom_quantize_b_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + int index = 16; + int non_zero_count = (int)n_coeffs; + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob = zero, eob0, prescan0, prescan1, all_zero; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 0), + ROUND_POWER_OF_TWO(zbin_ptr[1], 0) }; + + int prescan_add[2]; + for (int i = 0; i < 2; ++i) + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + + // max buffer is of size 256 as this functions calls with + // maximum n_coeffs as 256 + int16_t prescan[256]; + memset(prescan, -1, n_coeffs * sizeof(int16_t)); + + // TODO(Aniket): Experiment the following loop with intrinsic + for (int i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = 1 << AOM_QM_BITS; + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = (coeff >> 31); + const 
int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int prescan_add_val = prescan_add[rc != 0]; + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + prescan[rc] = 0; + non_zero_count--; + } else { + break; + } + } +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + // Poor man's abs(). + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + prescan0 = _mm_loadu_si128((const __m128i *)prescan); + prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8)); + + cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin)); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin)); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, 
shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr); + store_coefficients(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + } + + // AC only loop. + // TODO(Aniket): Reduce the processing of coeff quatization + // based on eob logic + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index)); + prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8)); + + cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin)); + cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin)); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i 
*)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_coefficients(coeff0, dqcoeff_ptr + index); + store_coefficients(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = _mm_max_epi16(eob, eob0); + index += 16; + } + + *eob_ptr = accumulate_eob(eob); + +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const qm_val_t wt = (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} + +void aom_quantize_b_32x32_adaptive_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t 
*quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int index = 16; + int non_zero_count = (int)n_coeffs; + const int log_scale = 1; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i log_scale_vec = _mm_set1_epi16(log_scale); + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob = zero, eob0, prescan0, prescan1, all_zero; + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale), + ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) }; + + int prescan_add[2]; + for (int i = 0; i < 2; ++i) + prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7); + + // max buffer is of size 1024 as this functions calls with + // maximum n_coeffs as 1024 + int16_t prescan[1024]; + memset(prescan, -1, n_coeffs * sizeof(int16_t)); + + // TODO(Aniket): Experiment the following loop with intrinsic + for (int i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const qm_val_t wt = 1 << AOM_QM_BITS; + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int prescan_add_val = prescan_add[rc != 0]; + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + prescan[rc] = 0; + non_zero_count--; + } else { + break; + } + } +#if SKIP_EOB_FACTOR_ADJUST + int first = -1; +#endif + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. 
+ zbin = _mm_add_epi16(zbin, log_scale_vec); + round = _mm_add_epi16(round, log_scale_vec); + zbin = _mm_srli_epi16(zbin, log_scale); + round = _mm_srli_epi16(round, log_scale); + zbin = _mm_sub_epi16(zbin, one); + // Do DC and first 15 AC. + coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + prescan0 = _mm_loadu_si128((const __m128i *)prescan); + prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8)); + + cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin)); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin)); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold 
coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr, + &log_scale); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + 8, &log_scale); + + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + } + + // AC only loop. + // TODO(Aniket): Reduce the processing of coeff quatization + // based on eob logic + while (index < n_coeffs) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index)); + prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8)); + + cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin)); + cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin)); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + index += 16; + continue; + } + calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale); + calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale); 
+ + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, + dqcoeff_ptr + index, &log_scale); + calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8, &log_scale); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = _mm_max_epi16(eob, eob0); + index += 16; + } + + *eob_ptr = accumulate_eob(eob); + +#if SKIP_EOB_FACTOR_ADJUST + // TODO(Aniket): Experiment the following loop with intrinsic by combining + // with the quantization loop above + for (int i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + if (qcoeff) { + first = i; + break; + } + } + if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) { + const int rc = scan[(*eob_ptr - 1)]; + if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) { + const qm_val_t wt = (1 << AOM_QM_BITS); + const int coeff = coeff_ptr[rc] * wt; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST; + const int prescan_add_val = + ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7); + if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + *eob_ptr = 0; + } + } + } +#endif +} diff --git a/libaom/aom_dsp/x86/avg_intrin_sse2.c b/libaom/aom_dsp/x86/avg_intrin_sse2.c index 969e4e1..0c20261 100644 --- a/libaom/aom_dsp/x86/avg_intrin_sse2.c +++ b/libaom/aom_dsp/x86/avg_intrin_sse2.c @@ -16,6 +16,129 @@ #include "aom_dsp/x86/bitdepth_conversion_sse2.h" #include "aom_ports/mem.h" +void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, 
int dp, + int *min, int *max) { + __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; + u0 = _mm_setzero_si128(); + // Row 0 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff0 = _mm_max_epi16(diff, negdiff); + // Row 1 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(absdiff0, absdiff); + minabsdiff = _mm_min_epi16(absdiff0, absdiff); + // Row 2 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 3 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 4 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 5 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), 
u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 6 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + // Row 7 + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); + diff = _mm_subs_epi16(s0, d0); + negdiff = _mm_subs_epi16(u0, diff); + absdiff = _mm_max_epi16(diff, negdiff); + maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); + minabsdiff = _mm_min_epi16(minabsdiff, absdiff); + + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); + maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); + *max = _mm_extract_epi16(maxabsdiff, 0); + + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); + minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); + *min = _mm_extract_epi16(minabsdiff, 0); +} + +unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = 
_mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 32) >> 6; +} + +unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) { + __m128i s0, s1, u0; + unsigned int avg = 0; + u0 = _mm_setzero_si128(); + s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); + s0 = _mm_adds_epu16(s0, s1); + + s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); + avg = _mm_extract_epi16(s0, 0); + return (avg + 8) >> 4; +} + static void hadamard_col8_sse2(__m128i *in, int iter) { __m128i a0 = in[0]; __m128i a1 = in[1]; diff --git a/libaom/aom_dsp/x86/convolve_avx2.h b/libaom/aom_dsp/x86/convolve_avx2.h index 3cc0e23..4a1068e 100644 --- a/libaom/aom_dsp/x86/convolve_avx2.h +++ b/libaom/aom_dsp/x86/convolve_avx2.h @@ -34,31 +34,214 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = { 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, }; -DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = { - 3, 255, 4, 255, 
5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255, - 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; +#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \ + for (i = 0; i < (im_h - 2); i += 2) { \ + __m256i data = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + data = _mm256_inserti128_si256( \ + data, \ + _mm_loadu_si128( \ + (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \ + 1); \ + \ + __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } \ + \ + __m256i data_1 = _mm256_castsi128_si256( \ + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \ + \ + __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \ + \ + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + +#define CONVOLVE_SR_VERTICAL_FILTER_8TAP \ + __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i src_5 = 
_mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + __m256i s[8]; \ + s[0] = _mm256_unpacklo_epi16(src_0, src_1); \ + s[1] = _mm256_unpacklo_epi16(src_2, src_3); \ + s[2] = _mm256_unpacklo_epi16(src_4, src_5); \ + \ + s[4] = _mm256_unpackhi_epi16(src_0, src_1); \ + s[5] = _mm256_unpackhi_epi16(src_2, src_3); \ + s[6] = _mm256_unpackhi_epi16(src_4, src_5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + __m256i res_a = convolve(s, coeffs_v); \ + __m256i res_b = convolve(s + 4, coeffs_v); \ + \ + res_a = \ + _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \ + res_b = \ + _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \ + \ + const __m256i res_a_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + \ + const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \ + \ + const __m128i res_0 = _mm256_castsi256_si128(res_8b); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \ + \ + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \ + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \ + if (w - j > 4) { \ + _mm_storel_epi64(p_0, res_0); \ + _mm_storel_epi64(p_1, res_1); \ + } else if (w == 4) { \ + xx_storel_32(p_0, res_0); \ + xx_storel_32(p_1, res_1); \ + } else { \ + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); \ + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = 
s[6]; \ + s[6] = s[7]; \ + } -DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; +#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \ + for (i = 0; i < im_h; i += 2) { \ + __m256i data = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \ + if (i + 1 < im_h) \ + data = _mm256_inserti128_si256( \ + data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \ + src_h += (src_stride << 1); \ + __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \ + \ + res = \ + _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \ + \ + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \ + } +#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \ + __m256i s[8]; \ + __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \ + __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \ + __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \ + __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \ + __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \ + __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \ + \ + s[0] = _mm256_unpacklo_epi16(s0, s1); \ + s[1] = _mm256_unpacklo_epi16(s2, s3); \ + s[2] = _mm256_unpacklo_epi16(s4, s5); \ + \ + s[4] = _mm256_unpackhi_epi16(s0, s1); \ + s[5] = _mm256_unpackhi_epi16(s2, s3); \ + s[6] = _mm256_unpackhi_epi16(s4, s5); \ + \ + for (i = 0; i < h; i += 2) { \ + const int16_t *data = &im_block[i * im_stride]; \ + \ + const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \ + const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \ + \ + s[3] = _mm256_unpacklo_epi16(s6, s7); \ + s[7] = _mm256_unpackhi_epi16(s6, s7); \ + \ + const __m256i res_a = convolve(s, coeffs_y); \ + const __m256i res_a_round = _mm256_sra_epi32( \ + 
_mm256_add_epi32(res_a, round_const_v), round_shift_v); \ + \ + if (w - j > 4) { \ + const __m256i res_b = convolve(s + 4, coeffs_y); \ + const __m256i res_b_round = _mm256_sra_epi32( \ + _mm256_add_epi32(res_b, round_const_v), round_shift_v); \ + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = load_line2_avx2( \ + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ + const __m256i comp_avg_res = \ + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \ + _mm_storel_epi64( \ + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \ + } else { \ + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } else { \ + const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \ + const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \ + \ + if (do_average) { \ + const __m256i data_ref_0 = load_line2_avx2( \ + &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \ + \ + const __m256i comp_avg_res = \ + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \ + \ + const __m256i round_result = convolve_rounding( \ + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \ + \ + const __m256i res_8 = 
_mm256_packus_epi16(round_result, round_result); \ + const __m128i res_0 = _mm256_castsi256_si128(res_8); \ + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \ + \ + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \ + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \ + _mm_cvtsi128_si32(res_1); \ + \ + } else { \ + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \ + \ + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \ + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \ + res_1); \ + } \ + } \ + \ + s[0] = s[1]; \ + s[1] = s[2]; \ + s[2] = s[3]; \ + \ + s[4] = s[5]; \ + s[5] = s[6]; \ + s[6] = s[7]; \ + } static INLINE void prepare_coeffs_lowbd( const InterpFilterParams *const filter_params, const int subpel_q4, __m256i *const coeffs /* [4] */) { @@ -120,6 +303,17 @@ static INLINE __m256i convolve_lowbd(const __m256i *const s, return res; } +static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s, + const __m256i *const coeffs) { + const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]); + const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]); + + // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + const __m256i res = _mm256_add_epi16(res_45, res_23); + + return res; +} + static INLINE __m256i convolve(const __m256i *const s, const __m256i *const coeffs) { const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]); @@ -155,6 +349,17 @@ static INLINE __m256i convolve_lowbd_x(const __m256i data, return convolve_lowbd(s, coeffs); } +static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data, + const __m256i *const coeffs, + const __m256i *const filt) { + __m256i s[2]; + + s[0] = _mm256_shuffle_epi8(data, filt[0]); + s[1] = _mm256_shuffle_epi8(data, filt[1]); + + return convolve_lowbd_4tap(s, coeffs); +} + static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, const __m256i *const 
res, const int do_average) { @@ -172,9 +377,9 @@ static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst, static INLINE __m256i comp_avg(const __m256i *const data_ref_0, const __m256i *const res_unsigned, const __m256i *const wt, - const int use_jnt_comp_avg) { + const int use_dist_wtd_comp_avg) { __m256i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned); const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned); @@ -206,9 +411,9 @@ static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0, const __m256i *const res_unsigned, const __m256i *const wt0, const __m256i *const wt1, - const int use_jnt_comp_avg) { + const int use_dist_wtd_comp_avg) { __m256i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0); const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1); const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res); diff --git a/libaom/aom_dsp/x86/convolve_sse2.h b/libaom/aom_dsp/x86/convolve_sse2.h index 445d04b..385c7c7 100644 --- a/libaom/aom_dsp/x86/convolve_sse2.h +++ b/libaom/aom_dsp/x86/convolve_sse2.h @@ -78,9 +78,9 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s, static INLINE __m128i comp_avg(const __m128i *const data_ref_0, const __m128i *const res_unsigned, const __m128i *const wt, - const int use_jnt_comp_avg) { + const int use_dist_wtd_avg) { __m128i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_avg) { const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned); const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned); diff --git a/libaom/aom_dsp/x86/convolve_sse4_1.h b/libaom/aom_dsp/x86/convolve_sse4_1.h index 6b8388d..b1a3bb4 100644 --- a/libaom/aom_dsp/x86/convolve_sse4_1.h +++ b/libaom/aom_dsp/x86/convolve_sse4_1.h @@ -35,9 +35,9 @@ static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const 
data_ref_0, const __m128i *const res_unsigned, const __m128i *const wt0, const __m128i *const wt1, - const int use_jnt_comp_avg) { + const int use_dist_wtd_avg) { __m128i res; - if (use_jnt_comp_avg) { + if (use_dist_wtd_avg) { const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0); const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1); diff --git a/libaom/aom_dsp/x86/fft_avx2.c b/libaom/aom_dsp/x86/fft_avx2.c index 54da022..4cccc5f 100644 --- a/libaom/aom_dsp/x86/fft_avx2.c +++ b/libaom/aom_dsp/x86/fft_avx2.c @@ -11,6 +11,7 @@ #include <immintrin.h> +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/fft_common.h" diff --git a/libaom/aom_dsp/x86/fft_sse2.c b/libaom/aom_dsp/x86/fft_sse2.c index 12bdc3e..6f20a3c 100644 --- a/libaom/aom_dsp/x86/fft_sse2.c +++ b/libaom/aom_dsp/x86/fft_sse2.c @@ -11,6 +11,7 @@ s * PATENTS file, you can obtain it at www.aomedia.org/license/patent. #include <xmmintrin.h> +#include "config/aom_dsp_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/fft_common.h" diff --git a/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c b/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c index 097e077..70b91c6 100644 --- a/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c +++ b/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c @@ -727,8 +727,8 @@ void aom_highbd_lpf_horizontal_14_dual_sse2( _limit1, _thresh1, bd); for (i = 0; i < 6; i++) { - _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]); - _mm_store_si128((__m128i *)(s + i * pitch), q[i]); + _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]); + _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]); } } diff --git a/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c index 58e5f98..2f4ffd3 100644 --- a/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c @@ -146,3 +146,61 @@ void aom_highbd_quantize_b_32x32_sse2( } *eob_ptr = eob + 1; } + +void 
aom_highbd_quantize_b_64x64_sse2( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + __m128i zbins[2]; + __m128i nzbins[2]; + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2); + const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2); + (void)scan; + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); + zbins[1] = _mm_set1_epi32(zbin1_tmp); + + nzbins[0] = _mm_setzero_si128(); + nzbins[1] = _mm_setzero_si128(); + nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); + nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); + + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. 
+ for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; + } + *eob_ptr = eob + 1; +} diff --git a/libaom/aom_dsp/x86/highbd_variance_sse2.c b/libaom/aom_dsp/x86/highbd_variance_sse2.c index 226576b..fc5678d 100644 --- a/libaom/aom_dsp/x86/highbd_variance_sse2.c +++ b/libaom/aom_dsp/x86/highbd_variance_sse2.c @@ -287,30 +287,38 @@ DECLS(sse2); uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ - uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ - NULL); \ - if (w > wf) { \ - unsigned int sse2; \ + int se = 0; \ + unsigned int sse = 0; \ + unsigned int sse2; \ + int row_rep = (w > 64) ? 
2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \ + NULL, NULL); \ se += se2; \ sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ + if (w > wf) { \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + h, &sse2, NULL, NULL); \ + se += se2; \ + sse += sse2; \ + } \ } \ } \ *sse_ptr = sse; \ @@ -322,33 +330,42 @@ DECLS(sse2); const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ int64_t var; \ uint32_t sse; \ + uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ - int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ - NULL); \ - if (w > wf) { \ - uint32_t sse2; \ + int se = 0; \ + int row_rep = (w > 64) ? 
2 : 1; \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src += wd_64 * 64; \ + dst += wd_64 * 64; \ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ + src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + NULL); \ se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ + long_sse += sse; \ + if (w > wf) { \ + uint32_t sse2; \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ - sse += sse2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + h, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ } \ } \ se = ROUND_POWER_OF_TWO(se, 2); \ - sse = ROUND_POWER_OF_TWO(sse, 4); \ + sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \ *sse_ptr = sse; \ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \ return (var >= 0) ? (uint32_t)var : 0; \ @@ -364,35 +381,38 @@ DECLS(sse2); uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int row_rep = (w > 64) ? 2 : 1; \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? 
h - start_row : 16; \ - int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ - NULL); \ - se += se2; \ - long_sse += sse2; \ - if (w > wf) { \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ - &sse2, NULL, NULL); \ + uint16_t *src_tmp = src + (start_row * src_stride); \ + uint16_t *dst_tmp = dst + (start_row * dst_stride); \ + for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \ + src_tmp += wd_64 * 64; \ + dst_tmp += wd_64 * 64; \ + int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \ + height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ - if (w > wf * 2) { \ + if (w > wf) { \ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ - height, &sse2, NULL, NULL); \ - se += se2; \ - long_sse += sse2; \ - se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ - height, &sse2, NULL, NULL); \ + src_tmp + 16, src_stride, x_offset, y_offset, dst_tmp + 16, \ + dst_stride, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 32, src_stride, x_offset, y_offset, dst_tmp + 32, \ + dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \ + src_tmp + 48, src_stride, x_offset, y_offset, dst_tmp + 48, \ + dst_stride, height, &sse2, NULL, NULL); \ + se += se2; \ + long_sse += sse2; \ + } \ } \ } \ } \ @@ -403,22 
+423,25 @@ DECLS(sse2); return (var >= 0) ? (uint32_t)var : 0; \ } -#define FNS(opt) \ - FN(64, 64, 16, 6, 6, opt, (int64_t)); \ - FN(64, 32, 16, 6, 5, opt, (int64_t)); \ - FN(32, 64, 16, 5, 6, opt, (int64_t)); \ - FN(32, 32, 16, 5, 5, opt, (int64_t)); \ - FN(32, 16, 16, 5, 4, opt, (int64_t)); \ - FN(16, 32, 16, 4, 5, opt, (int64_t)); \ - FN(16, 16, 16, 4, 4, opt, (int64_t)); \ - FN(16, 8, 16, 4, 3, opt, (int64_t)); \ - FN(8, 16, 8, 3, 4, opt, (int64_t)); \ - FN(8, 8, 8, 3, 3, opt, (int64_t)); \ - FN(8, 4, 8, 3, 2, opt, (int64_t)); \ - FN(16, 4, 16, 4, 2, opt, (int64_t)); \ - FN(8, 32, 8, 3, 5, opt, (int64_t)); \ - FN(32, 8, 16, 5, 3, opt, (int64_t)); \ - FN(16, 64, 16, 4, 6, opt, (int64_t)); \ +#define FNS(opt) \ + FN(128, 128, 16, 7, 7, opt, (int64_t)); \ + FN(128, 64, 16, 7, 6, opt, (int64_t)); \ + FN(64, 128, 16, 6, 7, opt, (int64_t)); \ + FN(64, 64, 16, 6, 6, opt, (int64_t)); \ + FN(64, 32, 16, 6, 5, opt, (int64_t)); \ + FN(32, 64, 16, 5, 6, opt, (int64_t)); \ + FN(32, 32, 16, 5, 5, opt, (int64_t)); \ + FN(32, 16, 16, 5, 4, opt, (int64_t)); \ + FN(16, 32, 16, 4, 5, opt, (int64_t)); \ + FN(16, 16, 16, 4, 4, opt, (int64_t)); \ + FN(16, 8, 16, 4, 3, opt, (int64_t)); \ + FN(8, 16, 8, 3, 4, opt, (int64_t)); \ + FN(8, 8, 8, 3, 3, opt, (int64_t)); \ + FN(8, 4, 8, 3, 2, opt, (int64_t)); \ + FN(16, 4, 16, 4, 2, opt, (int64_t)); \ + FN(8, 32, 8, 3, 5, opt, (int64_t)); \ + FN(32, 8, 16, 5, 3, opt, (int64_t)); \ + FN(16, 64, 16, 4, 6, opt, (int64_t)); \ FN(64, 16, 16, 6, 4, opt, (int64_t)) FNS(sse2); @@ -603,7 +626,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd, const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + is_intrabc ? 
&cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { @@ -765,11 +788,11 @@ void aom_highbd_comp_avg_upsampled_pred_sse2( } } -static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, - const __m128i *w0, - const __m128i *w1, - const __m128i *r, - void *const result) { +static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1, + const __m128i *w0, + const __m128i *w1, + const __m128i *r, + void *const result) { assert(DIST_PRECISION_BITS <= 4); __m128i mult0 = _mm_mullo_epi16(*p0, *w0); __m128i mult1 = _mm_mullo_epi16(*p1, *w1); @@ -780,11 +803,10 @@ static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1, xx_storeu_128(result, shift); } -void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, - const uint8_t *pred8, int width, - int height, const uint8_t *ref8, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { +void aom_highbd_dist_wtd_comp_avg_pred_sse2( + uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, + const uint8_t *ref8, int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { int i; const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset; const uint16_t wt1 = (uint16_t)jcp_param->bck_offset; @@ -806,7 +828,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, __m128i p0 = xx_loadu_128(ref); __m128i p1 = xx_loadu_128(pred); - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); comp_pred += 8; pred += 8; @@ -823,7 +845,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); __m128i p1 = xx_loadu_128(pred); - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred); comp_pred += 8; pred += 8; @@ -832,11 +854,11 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8, } } -void 
aom_highbd_jnt_comp_avg_upsampled_pred_sse2( +void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); int n; @@ -860,7 +882,7 @@ void aom_highbd_jnt_comp_avg_upsampled_pred_sse2( __m128i p0 = xx_loadu_128(comp_pred16); __m128i p1 = xx_loadu_128(pred); - highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); + highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16); comp_pred16 += 8; pred += 8; diff --git a/libaom/aom_dsp/x86/intrapred_asm_sse2.asm b/libaom/aom_dsp/x86/intrapred_asm_sse2.asm index 9aece27..0eb6323 100644 --- a/libaom/aom_dsp/x86/intrapred_asm_sse2.asm +++ b/libaom/aom_dsp/x86/intrapred_asm_sse2.asm @@ -27,23 +27,6 @@ pw2_32: times 8 dw 16 SECTION .text -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - INIT_XMM sse2 cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset GET_GOT goffsetq diff --git a/libaom/aom_dsp/x86/intrapred_avx2.c b/libaom/aom_dsp/x86/intrapred_avx2.c index 5f3e7bb..17f35a0 100644 --- a/libaom/aom_dsp/x86/intrapred_avx2.c +++ b/libaom/aom_dsp/x86/intrapred_avx2.c @@ -1481,9 +1481,10 @@ static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst, void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t 
*above, const uint16_t *left, int upsample_above, - int dx, int dy) { + int dx, int dy, int bd) { (void)left; (void)dy; + (void)bd; switch (bw) { case 4: @@ -1511,8 +1512,8 @@ void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw, return; } -static void transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc, - uint16_t *dst, ptrdiff_t pitchDst) { +static void highbd_transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc, + uint16_t *dst, ptrdiff_t pitchDst) { __m128i r0, r1, r2, r3, r4, r5, r6, r7, r0_Lo, r1_Lo, r2_Lo, r3_Lo, r4_Lo, r5_Lo, r6_Lo; r0 = _mm_load_si128( @@ -1579,12 +1580,921 @@ static void transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc, _mm_storeu_si128((__m128i *)(dst + 7 * pitchDst), r3); } -static void transpose(const uint16_t *src, ptrdiff_t pitchSrc, uint16_t *dst, - ptrdiff_t pitchDst, int width, int height) { +static uint8_t HighbdLoadMaskx[8][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, +}; + +static uint8_t HighbdEvenOddMaskx4[8][16] = { + { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, + 15 }, // 0=0,1, 1=2,3, 2=4,5, 3=6,7, 4=8,9, 5=10,11, 6=12,13, 7=14,15, + // >7=0,1 + { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 }, + { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 }, + { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 0, 1, 0, 1 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 0, 1 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 0, 1 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15 } +}; + +static uint16_t 
HighbdEvenOddMaskx8_2[8][16] = { + { 0, 2, 4, 6, 8, 10, 12, 14 }, { 2, 2, 4, 6, 8, 10, 12, 14 }, + { 4, 4, 4, 6, 8, 10, 12, 14 }, { 6, 6, 6, 6, 8, 10, 12, 14 }, + { 8, 8, 8, 8, 8, 10, 12, 14 }, { 10, 10, 10, 10, 10, 10, 12, 14 }, + { 12, 12, 12, 12, 12, 12, 12, 14 }, { 14, 14, 14, 14, 14, 14, 14, 14 }, +}; + +static uint16_t HighbdBaseMask[17][16] = { + { + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + }, + { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, + 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, + 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, + 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 }, + { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 
0xffff } +}; + +static void highbd_dr_prediction_z2_Nx4_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // a assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16; + __m256i diff; + __m128i c3f, min_base_y128; + + a16 = _mm256_set1_epi32(16); + c3f = _mm_set1_epi32(0x3f); + min_base_y128 = _mm_set1_epi32(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + a0_x128 = _mm_shuffle_epi8(a0_x128, + *(__m128i *)HighbdEvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 8); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128( + _mm_slli_epi32( + _mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = 
_mm_srli_si128(a0_x128, 2); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + DECLARE_ALIGNED(32, int, base_y_c[4]); + r6 = _mm_set1_epi32(r << 6); + dy128 = _mm_set1_epi32(dy); + c1234 = _mm_setr_epi32(1, 2, 3, 4); + y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128)); + base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]]); + a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi32( + _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(res); + resx = _mm_packus_epi32(resx, resx); + + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi32(resy, resy); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + 
+static void highbd_dr_prediction_32bit_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256; + __m256i diff; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi32(16); + c3f = _mm256_set1_epi32(0x3f); + min_base_y256 = _mm256_set1_epi32(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx = _mm_setzero_si128(); + } else { + if (upsample_above) { + a0_x128 = _mm_setr_epi16( + above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]); + a1_x128 = _mm_setr_epi16( + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]], + above[base_x + 1 + 
HighbdEvenOddMaskx8_2[base_shift][1]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]); + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx, + (3 << 6) - y * dx, (4 << 6) - y * dx, + (5 << 6) - y * dx, (6 << 6) - y * dx, + (7 << 6) - y * dx), + c3f), + 1); + } + + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } + // y calc + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int, base_y_c[8]); + __m256i r6, c256, dy256, y_c256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + dy256 = _mm256_set1_epi32(dy); + c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8); + y_c256 = 
_mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + if (upsample_left) { + shift = _mm256_srli_epi32( + _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f), + 1); + } else { + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + } + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy = _mm256_castsi256_si128(_mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + } else { + resy = resx; + } + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_z2_Nx8_avx2( + int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] 
- above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m128i c3f, min_base_y128; + __m256i a0_x, a1_x, diff, a32, a16; + __m128i a0_x128, a1_x128; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + if (upsample_above) { + a0_x128 = _mm_setr_epi16( + above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]], + above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]); + a1_x128 = _mm_setr_epi16( + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][1]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]], + above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]); + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + 
_mm_slli_epi16( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], + left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1); + } else { + shifty = 
_mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm256_castsi256_si128(res); + resy = _mm256_extracti128_si256(res, 1); + + resxy = + _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void highbd_dr_prediction_32bit_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16; + __m256i diff, min_base_y256, c3f; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128; + + a16 = _mm256_set1_epi32(16); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi32(0x3f); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift; + __m256i resx[2], resy[2]; + __m256i resxy; + for (int j = 0; j < W; j += 16) { + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x + j) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x + j) - 1); + } + int base_min_diff = 
(min_base_x - base_x - j); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + resx[0] = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); + a1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu16_epi32(a0_x128); + a1_x = _mm256_cvtepu16_epi32(a1_x128); + + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_setr_epi32( + ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx, + ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx, + ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx, + ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx), + c3f), + 1); + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + } + int base_shift8 = 0; + if ((base_x + j + 8) < (min_base_x - 1)) { + base_shift8 = (min_base_x - (base_x + j + 8) - 1); + } + if (base_shift8 > 7) { + resx[1] = _mm256_setzero_si256(); + } else { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8 + j)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9 + j)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift8]); + + a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128); + a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128); + + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_setr_epi32( + ((8 + j) 
<< 6) - y * dx, ((9 + j) << 6) - y * dx, + ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx, + ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx, + ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx), + c3f), + 1); + + diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + resx[1] = _mm256_add_epi32(a32, b); + resx[1] = _mm256_srli_epi32(resx[1], 5); + resx[1] = _mm256_packus_epi32( + resx[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1))); + } + resx[0] = + _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]), + 1); // 16 16bit values + + // y calc + if ((base_x < min_base_x)) { + DECLARE_ALIGNED(32, int, base_y_c[16]); + __m256i r6, c256, dy256, y_c256, y_c_1_256, base_y_c256, mask256; + r6 = _mm256_set1_epi32(r << 6); + dy256 = _mm256_set1_epi32(dy); + c256 = _mm256_setr_epi32(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j, + 7 + j, 8 + j); + y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + c256 = _mm256_setr_epi32(9 + j, 10 + j, 11 + j, 12 + j, 13 + j, 14 + j, + 15 + j, 16 + j); + y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256)); + base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y); + mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]])); + a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + 
left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1])); + + shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[0] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16( + left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]], + left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]], + left[base_y_c[14]], left[base_y_c[15]])); + a1_y = _mm256_cvtepu16_epi32( + _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1], + left[base_y_c[10] + 1], left[base_y_c[11] + 1], + left[base_y_c[12] + 1], left[base_y_c[13] + 1], + left[base_y_c[14] + 1], left[base_y_c[15] + 1])); + shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1); + + diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resy[1] = _mm256_packus_epi32( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); + + resy[0] = + _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]), + 1); // 16 16bit values + } else { + resy[0] = resx[0]; + } + resxy = _mm256_blendv_epi8(resx[0], resy[0], + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +static void highbd_dr_prediction_z2_HxW_avx2( + int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, 
int upsample_above, int upsample_left, int dx, + int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16, c3f; + __m256i diff, min_base_y256; + + a16 = _mm256_set1_epi16(16); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift; + __m256i resx, resy; + __m256i resxy; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, shiftx; + + for (int j = 0; j < W; j += 16) { + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if ((base_x + j) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x + j) - 1); + } + int base_min_diff = (min_base_x - base_x - j); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); + a1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]); + + a0_x = _mm256_castsi128_si256(a0_x128); + a1_x = _mm256_castsi128_si256(a1_x128); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128(_mm_setr_epi16( + ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx, + ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx, + ((4 + j) << 6) - y * dx, 
((5 + j) << 6) - y * dx, + ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx), + _mm256_castsi256_si128(c3f)), + 1)); + } + + base_shift = 0; + if ((base_x + j + 8) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x + j + 8) - 1); + } + if (base_shift <= 7) { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j)); + a0_1_x128 = _mm_shuffle_epi8(a0_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift]); + a1_1_x128 = _mm_shuffle_epi8(a1_1_x128, + *(__m128i *)HighbdLoadMaskx[base_shift]); + + shiftx = _mm_srli_epi16( + _mm_and_si128( + _mm_setr_epi16( + ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx, + ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx, + ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx, + ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx), + _mm256_castsi256_si128(c3f)), + 1); + + a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1); + shift = _mm256_inserti128_si256(shift, shiftx, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + resx = _mm256_srli_epi16(res, 5); // 16 16-bit values + + // y calc + __m256i a0_y, a1_y, shifty; + if ((base_x < min_base_x)) { + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + dy256 = _mm256_set1_epi16(dy); + c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j, + 7 + j, 8 + j, 9 + j, 10 + j, 11 + j, 12 + j, + 13 + j, 14 + j, 15 + j, 16 + j); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = 
_mm256_cmpgt_epi16(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + a1_y = _mm256_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1], left[base_y_c[8] + 1], + left[base_y_c[9] + 1], left[base_y_c[10] + 1], + left[base_y_c[11] + 1], left[base_y_c[12] + 1], + left[base_y_c[13] + 1], left[base_y_c[14] + 1], + left[base_y_c[15] + 1]); + + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shifty); + res = _mm256_add_epi16(a32, b); + resy = _mm256_srli_epi16(res, 5); + } else { + resy = _mm256_setzero_si256(); + } + + resxy = _mm256_blendv_epi8(resx, resy, + *(__m256i *)HighbdBaseMask[base_min_diff]); + _mm256_storeu_si256((__m256i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + break; + case 8: + if (bd < 12) { + 
highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + default: + if (bd < 12) { + highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + } else { + highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, + dx, dy); + } + break; + } +} + +static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc, + uint16_t *dst, ptrdiff_t pitchDst, int width, + int height) { for (int j = 0; j < height; j += 8) for (int i = 0; i < width; i += 8) - transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc, dst + j * pitchDst + i, - pitchDst); + highbd_transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); } static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride, @@ -1649,7 +2559,7 @@ static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride, static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy) { - __m256i dstvec[8], d[16]; + __m256i dstvec[8], d[8]; highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, dy); @@ -1818,9 +2728,9 @@ static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride, static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy) { - uint16_t dstT[64 * 64]; + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]); highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); - transpose(dstT, 64, dst, stride, 64, 64); + highbd_transpose(dstT, 64, dst, stride, 64, 64); } static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride, @@ -1872,24 +2782,24 @@ static void 
highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride, int upsample_left, int dy) { uint16_t dstT[64 * 32]; highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); - transpose(dstT, 64, dst, stride, 32, 64); + highbd_transpose(dstT, 64, dst, stride, 32, 64); } static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy) { - uint16_t dstT[32 * 64]; + DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]); highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy); - transpose(dstT, 32, dst, stride, 64, 32); + highbd_transpose(dstT, 32, dst, stride, 64, 32); return; } static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int upsample_left, int dy) { - uint16_t dstT[64 * 16]; + DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]); highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); - transpose(dstT, 64, dst, stride, 16, 64); + highbd_transpose(dstT, 64, dst, stride, 16, 64); } static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride, @@ -1910,9 +2820,10 @@ static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride, void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, - int dx, int dy) { + int dx, int dy, int bd) { (void)above; (void)dx; + (void)bd; assert(dx == 1); assert(dy > 0); if (bw == bh) { @@ -2013,3 +2924,1716 @@ void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw, } return; } + +// Low bit depth functions +static uint8_t BaseMask[33][32] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, +}; + +static AOM_FORCE_INLINE void dr_prediction_z1_4xN_internal_avx2( + int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((N + 4) - 1) << upsample_above; + int x; + // a assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a32, a16; + __m256i diff, c3f; + __m128i a_mbase_x; + + a16 = _mm256_set1_epi16(16); + a_mbase_x = _mm_set1_epi8(above[max_base_x]); + c3f = _mm256_set1_epi16(0x3f); + + x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i res1, a0_128, a1_128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base) >> upsample_above; + if (base_max_diff <= 0) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 4 values + } + return; + } + if (base_max_diff > 4) base_max_diff = 4; + a0_128 = _mm_loadu_si128((__m128i *)(above + base)); + a1_128 = _mm_srli_si128(a0_128, 1); + + if (upsample_above) { + a0_128 = _mm_shuffle_epi8( + a0_128, + _mm_setr_epi8(0, 2, 4, 6, 1, 3, 5, 7, 
8, 10, 12, 14, 9, 11, 13, 15)); + a1_128 = _mm_srli_si128(a0_128, 4); + + shift = _mm256_srli_epi16( + _mm256_and_si256( + _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), + 1); + } else { + shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); + } + a0 = _mm256_cvtepu8_epi16(a0_128); + a1 = _mm256_cvtepu8_epi16(a1_128); + + diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + res1 = _mm256_castsi256_si128(res); + res1 = _mm_packus_epi16(res1, res1); + + dst[r] = + _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]); + x += dx; + } +} + +static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[16]; + + dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z1_8xN_internal_avx2( + int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) { + const int frac_bits = 6 - upsample_above; + const int max_base_x = ((8 + N) - 1) << upsample_above; + + int x; + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a1, a0_1, a1_1, a32, a16, diff, c3f; + __m128i a_mbase_x; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm_set1_epi8(above[max_base_x]); + c3f = _mm256_set1_epi32(0x3f); + + x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res, res1, shift; + __m128i res128; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base) >> upsample_above; + if 
(base_max_diff <= 0) { + for (int i = r; i < N; ++i) { + dst[i] = a_mbase_x; // save 16 values, 8 to be used furter + } + return; + } + if (base_max_diff > 8) base_max_diff = 8; + + a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + if (upsample_above) { + a0 = _mm256_permutevar8x32_epi32( + a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); + + a0_1 = + _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a0_1 = _mm256_permutevar8x32_epi32( + a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); + a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1)); + + a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1); + a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1); + + shift = _mm256_srli_epi32( + _mm256_and_si256( + _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), c3f), + 1); + } else { + shift = _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1); + } + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + res1 = _mm256_packus_epi32( + res, _mm256_castsi128_si256( + _mm256_extracti128_si256(res, 1))); // goto 16 bit + + res128 = _mm_packus_epi16(_mm256_castsi256_si128(res1), + _mm256_castsi256_si128(res1)); // goto 8 bit + + res128 = + _mm_blendv_epi8(a_mbase_x, res128, *(__m128i *)BaseMask[base_max_diff]); + dst[r] = res128; + x += dx; + } +} + +static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[32]; + + dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + 
_mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z1_16xN_internal_avx2( + int N, __m128i *dstvec, const uint8_t *above, int upsample_above, int dx) { + int x; + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((16 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, diff, a32, a16, c3f; + __m128i a_mbase_x; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm_set1_epi8((uint8_t)above[max_base_x]); + c3f = _mm256_set1_epi32(0x3f); + + x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2]; + __m128i res128[2]; + int base = x >> frac_bits; + int base_max_diff = (max_base_x - base); + if (base_max_diff <= 0) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 16 values + } + return; + } + __m256i shift = + _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1); + + a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base))); + a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + res128[0] = _mm_packus_epi16(_mm256_castsi256_si128(res[0]), + _mm256_castsi256_si128(res[0])); // goto 8 bit + + if (base_max_diff > 8) { + if (base_max_diff > 16) base_max_diff = 16; + a0_1 = + _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); + a1_1 = + 
_mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 9))); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + res128[1] = + _mm_packus_epi16(_mm256_castsi256_si128(res[1]), + _mm256_castsi256_si128(res[1])); // goto 8 bit + + } else { + res128[1] = a_mbase_x; + } + res128[0] = _mm_unpacklo_epi64(res128[0], res128[1]); // 16 8bit values + + dstvec[r] = _mm_blendv_epi8(a_mbase_x, res128[0], + *(__m128i *)BaseMask[base_max_diff]); + x += dx; + } +} +static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m128i dstvec[64]; + + dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2( + int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) { + int x; + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((32 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, c3f; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi8(above[max_base_x]); + c3f = _mm256_set1_epi32(0x3f); + + x = dx; + for (int r = 0; r < N; r++) { + __m256i b, res[2], res16[2]; + + int base = x >> frac_bits; + int base_max_diff = (max_base_x - 
base); + if (base_max_diff <= 0) { + for (int i = r; i < N; ++i) { + dstvec[i] = a_mbase_x; // save 32 values + } + return; + } + if (base_max_diff > 32) base_max_diff = 32; + __m256i shift = + _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1); + + for (int j = 0, jj = 0; j < 32; j += 16, jj++) { + int mdiff = base_max_diff - j; + if (mdiff <= 0) { + res16[jj] = a_mbase_x; + } else { + a0 = _mm256_cvtepu8_epi32( + _mm_loadu_si128((__m128i *)(above + base + j))); + a1 = _mm256_cvtepu8_epi32( + _mm_loadu_si128((__m128i *)(above + base + 1 + j))); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + + // goto 8 bit + res[0] = _mm256_packus_epi16(res[0], res[0]); + + if (mdiff > 8) { + a0_1 = _mm256_cvtepu8_epi32( + _mm_loadu_si128((__m128i *)(above + base + 8 + j))); + a1_1 = _mm256_cvtepu8_epi32( + _mm_loadu_si128((__m128i *)(above + base + 9 + j))); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + res[1] = _mm256_packus_epi16(res[1], res[1]); + // goto 8 bit + } else { + res[1] = a_mbase_x; + } + res16[jj] = _mm256_unpacklo_epi64(res[0], res[1]); // 16 8bit values + } + } + res16[1] = + _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]), + 1); // 32 8bit values + + dstvec[r] = _mm256_blendv_epi8( + a_mbase_x, res16[1], + *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values + x 
+= dx; + } +} + +static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + __m256i dstvec[64]; + dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx); + for (int i = 0; i < N; i++) { + _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); + } +} + +static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, int upsample_above, + int dx) { + int x; + // here upsample_above is 0 by design of av1_use_intra_edge_upsample + (void)upsample_above; + const int frac_bits = 6; + const int max_base_x = ((64 + N) - 1); + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0, a0_1, a1, a1_1, a32, a16; + __m256i a_mbase_x, diff, c3f; + __m128i max_base_x128, base_inc128, mask128; + + a16 = _mm256_set1_epi32(16); + a_mbase_x = _mm256_set1_epi8(above[max_base_x]); + max_base_x128 = _mm_set1_epi8(max_base_x); + c3f = _mm256_set1_epi32(0x3f); + + x = dx; + for (int r = 0; r < N; r++, dst += stride) { + __m256i b, res[2]; + __m128i res1; + + int base = x >> frac_bits; + if (base >= max_base_x) { + for (int i = r; i < N; ++i) { + _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values + _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); + dst += stride; + } + return; + } + + __m256i shift = + _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1); + + __m128i a0_128, a0_1_128, a1_128, a1_1_128; + for (int j = 0; j < 64; j += 16) { + int mdif = max_base_x - (base + j); + if (mdif <= 0) { + _mm_storeu_si128((__m128i *)(dst + j), + _mm256_castsi256_si128(a_mbase_x)); + } else { + a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); + a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); + a0 = _mm256_cvtepu8_epi32(a0_128); + a1 = 
_mm256_cvtepu8_epi32(a1_128); + + diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[0] = _mm256_add_epi32(a32, b); + res[0] = _mm256_srli_epi32(res[0], 5); + res[0] = _mm256_packus_epi32( + res[0], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); + // goto 8 bit + res[0] = _mm256_packus_epi16(res[0], res[0]); + + if (mdif > 8) { + a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j)); + a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j)); + a0_1 = _mm256_cvtepu8_epi32(a0_1_128); + a1_1 = _mm256_cvtepu8_epi32(a1_1_128); + + diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + b = _mm256_mullo_epi32(diff, shift); + + res[1] = _mm256_add_epi32(a32, b); + res[1] = _mm256_srli_epi32(res[1], 5); + res[1] = _mm256_packus_epi32( + res[1], + _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); + res[1] = _mm256_packus_epi16(res[1], res[1]); + + } else { + res[1] = a_mbase_x; + } + res1 = _mm_unpacklo_epi64( + _mm256_castsi256_si128(res[0]), + _mm256_castsi256_si128(res[1])); // 16 8bit values + + base_inc128 = _mm_setr_epi8( + base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, + base + j + 5, base + j + 6, base + j + 7, base + j + 8, + base + j + 9, base + j + 10, base + j + 11, base + j + 12, + base + j + 13, base + j + 14, base + j + 15); + + mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128), + _mm_setzero_si128()); + res1 = + _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), res1, mask128); + _mm_storeu_si128((__m128i *)(dst + j), res1); + } + } + x += dx; + } +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int 
upsample_above, int dx, int dy) { + (void)left; + (void)dy; + switch (bw) { + case 4: + dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 8: + dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 16: + dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 32: + dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + case 64: + dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx); + break; + default: break; + } + return; +} + +static uint8_t LoadMaskx[8][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, + { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, + { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, + { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, + { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 }, +}; + +static uint8_t EvenOddMaskx4[8][16] = { + { 0, 2, 4, 6, 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 1, 3, 5, 7, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 2, 4, 6, 8, 3, 5, 7, 9, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 3, 5, 7, 9, 4, 6, 8, 10, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 4, 6, 8, 10, 5, 7, 9, 11, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 5, 7, 9, 11, 6, 8, 10, 12, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 7, 9, 11, 13, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 8, 10, 12, 14, 0 } +}; + +static uint8_t EvenOddMaskx[8][16] = { + { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 0, 0, 0, 0 }, + { 0, 1, 3, 5, 7, 9, 11, 13, 15, 2, 4, 6, 8, 0, 0, 0 }, + { 0, 0, 2, 4, 6, 8, 10, 12, 14, 3, 5, 7, 9, 0, 0, 0 }, + { 0, 0, 0, 3, 5, 7, 9, 11, 13, 15, 4, 6, 8, 10, 0 }, + { 0, 0, 0, 0, 4, 6, 8, 10, 12, 14, 5, 7, 9, 11, 0, 0 }, + { 0, 0, 0, 0, 0, 5, 7, 9, 11, 13, 15, 6, 8, 10, 12, 0 }, + { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 14, 7, 9, 
11, 13, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 15, 8, 10, 12, 14 } +}; + +static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // a assert(dx > 0); + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a32, a16, diff; + __m128i c3f, min_base_y128; + + a16 = _mm256_set1_epi32(16); + c3f = _mm_set1_epi32(0x3f); + min_base_y128 = _mm_set1_epi32(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + __m128i a0_x128, a1_x128; + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 4) { + base_min_diff = 4; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 3) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx4[base_shift]); + a1_x128 = _mm_srli_si128(a0_x128, 4); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128( + _mm_slli_epi32( + _mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + 
a1_x128 = _mm_srli_si128(a0_x128, 1); + + shift = _mm256_castsi128_si256(_mm_srli_epi32( + _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_cvtepu8_epi32(a0_x128); + a1_x = _mm256_cvtepu8_epi32(a1_x128); + } + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int, base_y_c[4]); + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi32(r << 6); + dy128 = _mm_set1_epi32(dy); + c1234 = _mm_setr_epi32(1, 2, 3, 4); + y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128)); + base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]]); + a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi32( + _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1); + } + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi32(diff, shift); + res = _mm256_add_epi32(a32, b); + res = _mm256_srli_epi32(res, 5); + + resx = _mm256_castsi256_si128(res); + resx = _mm_packus_epi32(resx, resx); + resx = _mm_packus_epi16(resx, resx); + + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi32(resy, resy); + resy = _mm_packus_epi16(resy, resy); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); 
+ *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy); + dst += stride; + } +} + +static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, + int dx, int dy) { + const int min_base_x = -(1 << upsample_above); + const int min_base_y = -(1 << upsample_left); + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i diff, a32, a16; + __m256i a0_x, a1_x; + __m128i a0_x128, a1_x128, min_base_y128, c3f; + + a16 = _mm256_set1_epi16(16); + c3f = _mm_set1_epi16(0x3f); + min_base_y128 = _mm_set1_epi16(min_base_y); + + for (int r = 0; r < N; r++) { + __m256i b, res, shift; + __m128i resx, resy, resxy; + + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + int base_shift = 0; + if (base_x < (min_base_x - 1)) { + base_shift = (min_base_x - base_x - 1) >> upsample_above; + } + int base_min_diff = + (min_base_x - base_x + upsample_above) >> upsample_above; + if (base_min_diff > 8) { + base_min_diff = 8; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x = _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift)); + a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift)); + if (upsample_above) { + a0_x128 = + _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]); + a1_x128 = + _mm_shuffle_epi8(a1_x128, *(__m128i *)EvenOddMaskx[base_shift]); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128( + _mm_slli_epi16( + _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * 
dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + upsample_above), + c3f), + 1)); + } else { + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx, + (2 << 6) - y * dx, (3 << 6) - y * dx, + (4 << 6) - y * dx, (5 << 6) - y * dx, + (6 << 6) - y * dx, (7 << 6) - y * dx), + c3f), + 1)); + } + a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128)); + a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128)); + } + + // y calc + __m128i a0_y, a1_y, shifty; + if (base_x < min_base_x) { + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128; + r6 = _mm_set1_epi16(r << 6); + dy128 = _mm_set1_epi16(dy); + c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128)); + base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y); + mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128); + base_y_c128 = _mm_andnot_si128(mask128, base_y_c128); + _mm_store_si128((__m128i *)base_y_c, base_y_c128); + + a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]], + left[base_y_c[2]], left[base_y_c[3]], + left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]]); + a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1], + left[base_y_c[2] + 1], left[base_y_c[3] + 1], + left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1]); + + if (upsample_left) { + shifty = _mm_srli_epi16( + _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1); + } else { + shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1); + } + + a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1); + shift = _mm256_inserti128_si256(shift, shifty, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = 
_mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = _mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); + + resx = _mm_packus_epi16(_mm256_castsi256_si128(res), + _mm256_castsi256_si128(res)); + resy = _mm256_extracti128_si256(res, 1); + resy = _mm_packus_epi16(resy, resy); + + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storel_epi64((__m128i *)(dst), resxy); + dst += stride; + } +} + +static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst, + ptrdiff_t stride, const uint8_t *above, + const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_x = -1; + const int min_base_y = -1; + (void)upsample_above; + (void)upsample_left; + const int frac_bits_x = 6; + const int frac_bits_y = 6; + + // pre-filter above pixels + // store in temp buffers: + // above[x] * 32 + 16 + // above[x+1] - above[x] + // final pixels will be caluculated as: + // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 + __m256i a0_x, a1_x, a0_y, a1_y, a32, a16; + __m256i diff, min_base_y256, c3f, shifty; + __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, a0_1_x, a1_1_x, shiftx; + + a16 = _mm256_set1_epi16(16); + min_base_y256 = _mm256_set1_epi16(min_base_y); + c3f = _mm256_set1_epi16(0x3f); + + for (int r = 0; r < H; r++) { + __m256i b, res, shift; + __m128i resx, resy; + __m128i resxy; + for (int j = 0; j < W; j += 16) { + int y = r + 1; + int base_x = (-y * dx) >> frac_bits_x; + + int base_shift = 0; + if ((base_x + j) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x + j) - 1); + } + int base_min_diff = (min_base_x - base_x - j); + if (base_min_diff > 16) { + base_min_diff = 16; + } else { + if (base_min_diff < 0) base_min_diff = 0; + } + if (base_shift > 7) { + a0_x = _mm256_setzero_si256(); + a1_x 
= _mm256_setzero_si256(); + shift = _mm256_setzero_si256(); + } else { + a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j)); + a1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j)); + a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]); + + a0_x = _mm256_cvtepu8_epi16(a0_x128); + a1_x = _mm256_cvtepu8_epi16(a1_x128); + + shift = _mm256_castsi128_si256(_mm_srli_epi16( + _mm_and_si128(_mm_setr_epi16( + ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx, + ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx, + ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx, + ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx), + _mm256_castsi256_si128(c3f)), + 1)); + } + + base_shift = 0; + if ((base_x + j + 8) < (min_base_x - 1)) { + base_shift = (min_base_x - (base_x + j + 8) - 1); + } + if (base_shift <= 7) { + a0_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j)); + a1_1_x128 = + _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j)); + a0_1_x128 = + _mm_shuffle_epi8(a0_1_x128, *(__m128i *)LoadMaskx[base_shift]); + a1_1_x128 = + _mm_shuffle_epi8(a1_1_x128, *(__m128i *)LoadMaskx[base_shift]); + + a0_1_x = _mm_cvtepu8_epi16(a0_1_x128); + a1_1_x = _mm_cvtepu8_epi16(a1_1_x128); + + shiftx = _mm_srli_epi16( + _mm_and_si128( + _mm_setr_epi16( + ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx, + ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx, + ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx, + ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx), + _mm256_castsi256_si128(c3f)), + 1); + + a0_x = _mm256_inserti128_si256(a0_x, a0_1_x, 1); + a1_x = _mm256_inserti128_si256(a1_x, a1_1_x, 1); + shift = _mm256_inserti128_si256(shift, shiftx, 1); + } + + diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = 
_mm256_mullo_epi16(diff, shift); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resx = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + + // y calc + if ((base_x < min_base_x)) { + DECLARE_ALIGNED(32, int16_t, base_y_c[16]); + __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16; + r6 = _mm256_set1_epi16(r << 6); + dy256 = _mm256_set1_epi16(dy); + c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j, + 7 + j, 8 + j, 9 + j, 10 + j, 11 + j, 12 + j, + 13 + j, 14 + j, 15 + j, 16 + j); + mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256), + _mm256_srli_epi16(min_base_y256, 1)); + y_c256 = _mm256_sub_epi16(r6, mul16); + + base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y); + mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256); + base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256); + _mm256_store_si256((__m256i *)base_y_c, base_y_c256); /**/ + + a0_y = _mm256_setr_epi16( + left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]], + left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]], + left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]], + left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]], + left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]], + left[base_y_c[15]]); + a1_y = _mm256_setr_epi16( + left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1], + left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1], + left[base_y_c[6] + 1], left[base_y_c[7] + 1], left[base_y_c[8] + 1], + left[base_y_c[9] + 1], left[base_y_c[10] + 1], + left[base_y_c[11] + 1], left[base_y_c[12] + 1], + left[base_y_c[13] + 1], left[base_y_c[14] + 1], + left[base_y_c[15] + 1]); + + shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1); + + diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x] + a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32 + a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 + + b = 
_mm256_mullo_epi16(diff, shifty); + res = _mm256_add_epi16(a32, b); + res = _mm256_srli_epi16(res, 5); // 16 16-bit values + resy = _mm256_castsi256_si128(_mm256_packus_epi16( + res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)))); + + } else { + resy = _mm_setzero_si128(); + } + resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]); + _mm_storeu_si128((__m128i *)(dst + j), resxy); + } // for j + dst += stride; + } +} + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_above, int upsample_left, int dx, + int dy) { + assert(dx > 0); + assert(dy > 0); + switch (bw) { + case 4: + dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + case 8: + dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above, + upsample_left, dx, dy); + break; + default: + dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left, + upsample_above, upsample_left, dx, dy); + break; + } + return; +} + +// z3 functions +static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3; + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpackhi_epi8(x[0], x[1]); + w3 = _mm_unpackhi_epi8(x[2], x[3]); + + ww0 = _mm_unpacklo_epi16(w0, w1); + ww1 = _mm_unpacklo_epi16(w2, w3); + ww2 = _mm_unpackhi_epi16(w0, w1); + ww3 = _mm_unpackhi_epi16(w2, w3); + + w0 = _mm_unpacklo_epi32(ww0, ww1); + w2 = _mm_unpacklo_epi32(ww2, ww3); + w1 = _mm_unpackhi_epi32(ww0, ww1); + w3 = _mm_unpackhi_epi32(ww2, ww3); + + d[0] = _mm_unpacklo_epi64(w0, w2); + d[1] = _mm_unpackhi_epi64(w0, w2); + d[2] = _mm_unpacklo_epi64(w1, w3); + d[3] = _mm_unpackhi_epi64(w1, w3); + + d[4] = _mm_srli_si128(d[0], 8); + d[5] = _mm_srli_si128(d[1], 8); + d[6] = _mm_srli_si128(d[2], 8); + d[7] = _mm_srli_si128(d[3], 8); + + d[8] 
= _mm_srli_si128(d[0], 4); + d[9] = _mm_srli_si128(d[1], 4); + d[10] = _mm_srli_si128(d[2], 4); + d[11] = _mm_srli_si128(d[3], 4); + + d[12] = _mm_srli_si128(d[0], 12); + d[13] = _mm_srli_si128(d[1], 12); + d[14] = _mm_srli_si128(d[2], 12); + d[15] = _mm_srli_si128(d[3], 12); +} + +static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) { + __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m256i w10, w11, w12, w13, w14, w15; + + w0 = _mm256_unpacklo_epi8(x[0], x[1]); + w1 = _mm256_unpacklo_epi8(x[2], x[3]); + w2 = _mm256_unpacklo_epi8(x[4], x[5]); + w3 = _mm256_unpacklo_epi8(x[6], x[7]); + + w8 = _mm256_unpacklo_epi8(x[8], x[9]); + w9 = _mm256_unpacklo_epi8(x[10], x[11]); + w10 = _mm256_unpacklo_epi8(x[12], x[13]); + w11 = _mm256_unpacklo_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[0] = _mm256_unpacklo_epi64(w6, w14); + d[1] = _mm256_unpackhi_epi64(w6, w14); + d[2] = _mm256_unpacklo_epi64(w7, w15); + d[3] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm256_unpacklo_epi64(w6, w14); + d[5] = _mm256_unpackhi_epi64(w6, w14); + d[6] = _mm256_unpacklo_epi64(w7, w15); + d[7] = _mm256_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm256_unpackhi_epi8(x[0], x[1]); + w1 = _mm256_unpackhi_epi8(x[2], x[3]); + w2 = _mm256_unpackhi_epi8(x[4], x[5]); + w3 = _mm256_unpackhi_epi8(x[6], x[7]); + + w8 = 
_mm256_unpackhi_epi8(x[8], x[9]); + w9 = _mm256_unpackhi_epi8(x[10], x[11]); + w10 = _mm256_unpackhi_epi8(x[12], x[13]); + w11 = _mm256_unpackhi_epi8(x[14], x[15]); + + w4 = _mm256_unpacklo_epi16(w0, w1); + w5 = _mm256_unpacklo_epi16(w2, w3); + w12 = _mm256_unpacklo_epi16(w8, w9); + w13 = _mm256_unpacklo_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm256_unpacklo_epi64(w6, w14); + d[9] = _mm256_unpackhi_epi64(w6, w14); + d[10] = _mm256_unpacklo_epi64(w7, w15); + d[11] = _mm256_unpackhi_epi64(w7, w15); + + w4 = _mm256_unpackhi_epi16(w0, w1); + w5 = _mm256_unpackhi_epi16(w2, w3); + w12 = _mm256_unpackhi_epi16(w8, w9); + w13 = _mm256_unpackhi_epi16(w10, w11); + + w6 = _mm256_unpacklo_epi32(w4, w5); + w7 = _mm256_unpackhi_epi32(w4, w5); + w14 = _mm256_unpacklo_epi32(w12, w13); + w15 = _mm256_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm256_unpacklo_epi64(w6, w14); + d[13] = _mm256_unpackhi_epi64(w6, w14); + d[14] = _mm256_unpacklo_epi64(w7, w15); + d[15] = _mm256_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(x[0], x[1]); + w1 = _mm_unpacklo_epi8(x[2], x[3]); + w2 = _mm_unpacklo_epi8(x[4], x[5]); + w3 = _mm_unpacklo_epi8(x[6], x[7]); + + w8 = _mm_unpacklo_epi8(x[8], x[9]); + w9 = _mm_unpacklo_epi8(x[10], x[11]); + w10 = _mm_unpacklo_epi8(x[12], x[13]); + w11 = _mm_unpacklo_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line 
result + d[0] = _mm_unpacklo_epi64(w6, w14); + d[1] = _mm_unpackhi_epi64(w6, w14); + d[2] = _mm_unpacklo_epi64(w7, w15); + d[3] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[4] = _mm_unpacklo_epi64(w6, w14); + d[5] = _mm_unpackhi_epi64(w6, w14); + d[6] = _mm_unpacklo_epi64(w7, w15); + d[7] = _mm_unpackhi_epi64(w7, w15); + + // upper half + w0 = _mm_unpackhi_epi8(x[0], x[1]); + w1 = _mm_unpackhi_epi8(x[2], x[3]); + w2 = _mm_unpackhi_epi8(x[4], x[5]); + w3 = _mm_unpackhi_epi8(x[6], x[7]); + + w8 = _mm_unpackhi_epi8(x[8], x[9]); + w9 = _mm_unpackhi_epi8(x[10], x[11]); + w10 = _mm_unpackhi_epi8(x[12], x[13]); + w11 = _mm_unpackhi_epi8(x[14], x[15]); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + d[8] = _mm_unpacklo_epi64(w6, w14); + d[9] = _mm_unpackhi_epi64(w6, w14); + d[10] = _mm_unpacklo_epi64(w7, w15); + d[11] = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + d[12] = _mm_unpacklo_epi64(w6, w14); + d[13] = _mm_unpackhi_epi64(w6, w14); + d[14] = _mm_unpacklo_epi64(w7, w15); + d[15] = _mm_unpackhi_epi64(w7, w15); +} + +static void transpose_TX_8X8(const uint8_t *src, 
ptrdiff_t pitchSrc, + uint8_t *dst, ptrdiff_t pitchDst) { + __m128i r0, r1, r2, r3, r4, r5, r6, r7; + __m128i d0d1, d2d3, d4d5, d6d7; + r0 = _mm_loadl_epi64((__m128i *)(src + 0 * pitchSrc)); + r1 = _mm_loadl_epi64((__m128i *)(src + 1 * pitchSrc)); + r2 = _mm_loadl_epi64((__m128i *)(src + 2 * pitchSrc)); + r3 = _mm_loadl_epi64((__m128i *)(src + 3 * pitchSrc)); + r4 = _mm_loadl_epi64((__m128i *)(src + 4 * pitchSrc)); + r5 = _mm_loadl_epi64((__m128i *)(src + 5 * pitchSrc)); + r6 = _mm_loadl_epi64((__m128i *)(src + 6 * pitchSrc)); + r7 = _mm_loadl_epi64((__m128i *)(src + 7 * pitchSrc)); + + transpose8x8_sse2(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7, &d0d1, &d2d3, &d4d5, + &d6d7); + + _mm_storel_epi64((__m128i *)(dst + 0 * pitchDst), d0d1); + _mm_storel_epi64((__m128i *)(dst + 1 * pitchDst), _mm_srli_si128(d0d1, 8)); + _mm_storel_epi64((__m128i *)(dst + 2 * pitchDst), d2d3); + _mm_storel_epi64((__m128i *)(dst + 3 * pitchDst), _mm_srli_si128(d2d3, 8)); + _mm_storel_epi64((__m128i *)(dst + 4 * pitchDst), d4d5); + _mm_storel_epi64((__m128i *)(dst + 5 * pitchDst), _mm_srli_si128(d4d5, 8)); + _mm_storel_epi64((__m128i *)(dst + 6 * pitchDst), d6d7); + _mm_storel_epi64((__m128i *)(dst + 7 * pitchDst), _mm_srli_si128(d6d7, 8)); +} + +static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, + ptrdiff_t pitchDst, int width, int height) { + for (int j = 0; j < height; j += 8) + for (int i = 0; i < width; i += 8) + transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc, dst + j * pitchDst + i, + pitchDst); +} + +static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[4]; + + dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &d[0], &d[1], &d[2], &d[3]); + + *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]); + *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]); + 
*(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]); + *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]); + return; +} + +static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy); + transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], + &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2], + &d[3]); + + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8)); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8)); + _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8)); + _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]); + _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8)); +} + +static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[8]; + + dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy); + transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0], + &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + for (int i = 0; i < 8; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[4]; + + dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy); + transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], + &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0], + &d[1], &d[2], &d[3]); + _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 1 * stride), 
d[1]); + _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]); +} + +static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[8], d[8]; + + dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, dy); + transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3, + dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d, + d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7); + for (int i = 0; i < 8; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]); + _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride), + _mm_srli_si128(d[i], 8)); + } +} + +static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, dy); + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[4], d[16]; + + dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, dy); + transpose4x16_sse2(dstvec, d); + for (int i = 0; i < 16; i++) { + *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]); + } +} + +static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[8]; + + dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, dy); + for (int i = 4; i < 8; i++) { + d[i] = _mm_setzero_si128(); + } 
+ transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + + for (int i = 0; i < 4; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy); + for (int i = 8; i < 16; i++) { + dstvec[i] = _mm256_setzero_si256(); + } + transpose16x32_avx2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + i * stride), + _mm256_castsi256_si128(d[i])); + } + for (int i = 0; i < 16; i++) { + _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), + _mm256_extracti128_si256(d[i], 1)); + } +} + +static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, dy); + + transpose16x8_8x16_sse2( + &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5], + &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11], + &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2], + &d[3], &d[4], &d[5], &d[6], &d[7]); + transpose16x8_8x16_sse2( + &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16], + &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16], + &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16], + &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16], + &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8], + &d[6 + 8], &d[7 + 8]); + + for (int i = 0; i < 8; i++) { + _mm_storeu_si128((__m128i *)(dst + i * 
stride), d[i]); + _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]); + } +} + +static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[16], d[16]; + + dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, dy); + transpose16x16_sse2(dstvec, d); + + for (int i = 0; i < 16; i++) { + _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); + } +} + +static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[32], d[32]; + + dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + transpose16x32_avx2(dstvec + 16, d + 16); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + j * stride + 16), + _mm256_castsi256_si128(d[j + 16])); + } + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + _mm256_extracti128_si256(d[j], 1)); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), + _mm256_extracti128_si256(d[j + 16], 1)); + } +} + +static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]); + dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 64, 64); +} + +static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m256i dstvec[16], d[16]; + + dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy); + transpose16x32_avx2(dstvec, d); + // store + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride), + _mm256_castsi256_si128(d[j])); + _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), + 
_mm256_extracti128_si256(d[j], 1)); + } +} + +static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[32], d[16]; + + dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, dy); + for (int i = 0; i < 32; i += 16) { + transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 32]; + dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 32, 64); +} + +static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[32 * 64]; + dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy); + transpose(dstT, 32, dst, stride, 64, 32); + return; +} + +static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + uint8_t dstT[64 * 16]; + dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy); + transpose(dstT, 64, dst, stride, 16, 64); +} + +static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy) { + __m128i dstvec[64], d[16]; + + dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, dy); + for (int i = 0; i < 64; i += 16) { + transpose16x16_sse2((dstvec + i), d); + for (int j = 0; j < 16; j++) { + _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]); + } + } +} + +void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh, + const uint8_t *above, const uint8_t *left, + int upsample_left, int dx, int dy) { + (void)above; + (void)dx; + assert(dx == 1); + assert(dy > 0); + + if (bw == bh) { + switch (bw) { + case 4: + 
dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 64: + dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + if (bw < bh) { + if (bw + bw == bh) { + switch (bw) { + case 4: + dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bw) { + case 4: + dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } else { + if (bh + bh == bw) { + switch (bh) { + case 4: + dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy); + break; + case 32: + dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy); + break; + } + } else { + switch (bh) { + case 4: + dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy); + break; + case 8: + dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy); + break; + case 16: + dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy); + break; + } + } + } + } +} diff --git a/libaom/aom_dsp/x86/jnt_sad_ssse3.c b/libaom/aom_dsp/x86/jnt_sad_ssse3.c index c3c8824..2e3e2be 100644 --- 
a/libaom/aom_dsp/x86/jnt_sad_ssse3.c +++ b/libaom/aom_dsp/x86/jnt_sad_ssse3.c @@ -192,47 +192,47 @@ unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b, return res; } -#define jnt_sadMxN_sse2(m, n) \ - unsigned int aom_jnt_sad##m##x##n##_avg_ssse3( \ +#define dist_wtd_sadMxN_sse2(m, n) \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_ssse3( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ uint8_t comp_pred[m * n]; \ - aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ - jcp_param); \ + aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \ } -#define jnt_sadMxN_avx2(m, n) \ - unsigned int aom_jnt_sad##m##x##n##_avg_avx2( \ +#define dist_wtd_sadMxN_avx2(m, n) \ + unsigned int aom_dist_wtd_sad##m##x##n##_avg_avx2( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ uint8_t comp_pred[m * n]; \ - aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ - jcp_param); \ + aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \ + jcp_param); \ return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \ } /* clang-format off */ -jnt_sadMxN_sse2(128, 128) -jnt_sadMxN_sse2(128, 64) -jnt_sadMxN_sse2(64, 128) -jnt_sadMxN_sse2(64, 64) -jnt_sadMxN_sse2(64, 32) -jnt_sadMxN_sse2(32, 64) -jnt_sadMxN_sse2(32, 32) -jnt_sadMxN_sse2(32, 16) -jnt_sadMxN_sse2(16, 32) -jnt_sadMxN_sse2(16, 16) -jnt_sadMxN_sse2(16, 8) -jnt_sadMxN_sse2(8, 16) -jnt_sadMxN_sse2(8, 8) -jnt_sadMxN_sse2(8, 4) -jnt_sadMxN_sse2(4, 8) -jnt_sadMxN_sse2(4, 4) -jnt_sadMxN_sse2(4, 16) 
-jnt_sadMxN_sse2(16, 4) -jnt_sadMxN_sse2(8, 32) -jnt_sadMxN_sse2(32, 8) -jnt_sadMxN_sse2(16, 64) -jnt_sadMxN_sse2(64, 16) +dist_wtd_sadMxN_sse2(128, 128) +dist_wtd_sadMxN_sse2(128, 64) +dist_wtd_sadMxN_sse2(64, 128) +dist_wtd_sadMxN_sse2(64, 64) +dist_wtd_sadMxN_sse2(64, 32) +dist_wtd_sadMxN_sse2(32, 64) +dist_wtd_sadMxN_sse2(32, 32) +dist_wtd_sadMxN_sse2(32, 16) +dist_wtd_sadMxN_sse2(16, 32) +dist_wtd_sadMxN_sse2(16, 16) +dist_wtd_sadMxN_sse2(16, 8) +dist_wtd_sadMxN_sse2(8, 16) +dist_wtd_sadMxN_sse2(8, 8) +dist_wtd_sadMxN_sse2(8, 4) +dist_wtd_sadMxN_sse2(4, 8) +dist_wtd_sadMxN_sse2(4, 4) +dist_wtd_sadMxN_sse2(4, 16) +dist_wtd_sadMxN_sse2(16, 4) +dist_wtd_sadMxN_sse2(8, 32) +dist_wtd_sadMxN_sse2(32, 8) +dist_wtd_sadMxN_sse2(16, 64) +dist_wtd_sadMxN_sse2(64, 16) /* clang-format on */ diff --git a/libaom/aom_dsp/x86/jnt_variance_ssse3.c b/libaom/aom_dsp/x86/jnt_variance_ssse3.c index f9a41a2..c8b02f5 100644 --- a/libaom/aom_dsp/x86/jnt_variance_ssse3.c +++ b/libaom/aom_dsp/x86/jnt_variance_ssse3.c @@ -29,7 +29,7 @@ void aom_var_filter_block2d_bil_second_pass_ssse3( unsigned int pixel_step, unsigned int output_height, unsigned int output_width, const uint8_t *filter); -static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, +static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1, const __m128i *w, const __m128i *r, void *const result) { __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1); @@ -45,10 +45,10 @@ static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1, xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi)); } -void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param) { +void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param) { int i; const uint8_t w0 = (uint8_t)jcp_param->fwd_offset; const 
uint8_t w1 = (uint8_t)jcp_param->bck_offset; @@ -67,7 +67,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, __m128i p0 = xx_loadu_128(ref); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; @@ -85,7 +85,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; @@ -107,7 +107,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, row3[0], row3[1], row3[2], row3[3]); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; @@ -116,11 +116,11 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred, } } -void aom_jnt_comp_avg_upsampled_pred_ssse3( +void aom_dist_wtd_comp_avg_upsampled_pred_ssse3( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) { + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) { int n; int i; aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height, @@ -141,52 +141,52 @@ void aom_jnt_comp_avg_upsampled_pred_ssse3( __m128i p0 = xx_loadu_128(comp_pred); __m128i p1 = xx_loadu_128(pred); - compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred); + compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred); comp_pred += 16; pred += 16; } } -#define JNT_SUBPIX_AVG_VAR(W, H) \ - uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3( \ - const uint8_t *a, int a_stride, int 
xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - aom_var_filter_block2d_bil_first_pass_ssse3( \ - a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_ssse3( \ - fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ - \ - aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ - jcp_param); \ - \ - return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ +#define DIST_WTD_SUBPIX_AVG_VAR(W, H) \ + uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + aom_var_filter_block2d_bil_first_pass_ssse3( \ + a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \ + aom_var_filter_block2d_bil_second_pass_ssse3( \ + fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \ + \ + aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \ + jcp_param); \ + \ + return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \ } -JNT_SUBPIX_AVG_VAR(128, 128) -JNT_SUBPIX_AVG_VAR(128, 64) -JNT_SUBPIX_AVG_VAR(64, 128) -JNT_SUBPIX_AVG_VAR(64, 64) -JNT_SUBPIX_AVG_VAR(64, 32) -JNT_SUBPIX_AVG_VAR(32, 64) -JNT_SUBPIX_AVG_VAR(32, 32) -JNT_SUBPIX_AVG_VAR(32, 16) -JNT_SUBPIX_AVG_VAR(16, 32) -JNT_SUBPIX_AVG_VAR(16, 16) -JNT_SUBPIX_AVG_VAR(16, 8) -JNT_SUBPIX_AVG_VAR(8, 16) -JNT_SUBPIX_AVG_VAR(8, 8) -JNT_SUBPIX_AVG_VAR(8, 4) -JNT_SUBPIX_AVG_VAR(4, 8) -JNT_SUBPIX_AVG_VAR(4, 4) -JNT_SUBPIX_AVG_VAR(4, 16) -JNT_SUBPIX_AVG_VAR(16, 4) -JNT_SUBPIX_AVG_VAR(8, 32) -JNT_SUBPIX_AVG_VAR(32, 8) 
-JNT_SUBPIX_AVG_VAR(16, 64) -JNT_SUBPIX_AVG_VAR(64, 16) +DIST_WTD_SUBPIX_AVG_VAR(128, 128) +DIST_WTD_SUBPIX_AVG_VAR(128, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 128) +DIST_WTD_SUBPIX_AVG_VAR(64, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 64) +DIST_WTD_SUBPIX_AVG_VAR(32, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 32) +DIST_WTD_SUBPIX_AVG_VAR(16, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 16) +DIST_WTD_SUBPIX_AVG_VAR(8, 8) +DIST_WTD_SUBPIX_AVG_VAR(8, 4) +DIST_WTD_SUBPIX_AVG_VAR(4, 8) +DIST_WTD_SUBPIX_AVG_VAR(4, 4) +DIST_WTD_SUBPIX_AVG_VAR(4, 16) +DIST_WTD_SUBPIX_AVG_VAR(16, 4) +DIST_WTD_SUBPIX_AVG_VAR(8, 32) +DIST_WTD_SUBPIX_AVG_VAR(32, 8) +DIST_WTD_SUBPIX_AVG_VAR(16, 64) +DIST_WTD_SUBPIX_AVG_VAR(64, 16) diff --git a/libaom/aom_dsp/x86/loopfilter_sse2.c b/libaom/aom_dsp/x86/loopfilter_sse2.c index 26f249e..c021f50 100644 --- a/libaom/aom_dsp/x86/loopfilter_sse2.c +++ b/libaom/aom_dsp/x86/loopfilter_sse2.c @@ -16,237 +16,69 @@ #include "aom_dsp/x86/synonyms.h" #include "aom_ports/mem.h" #include "aom_ports/emmintrin_compat.h" +#include "aom_dsp/x86/lpf_common_sse2.h" static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); } -static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, - __m128i *x2, __m128i *x3, - __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3) { - // input - // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx - // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx - // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx - // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx - // output - // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - - __m128i w0, w1; - - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - w1 = 
_mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - *d0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - - *d1 = _mm_srli_si128(*d0, - 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - *d2 = _mm_srli_si128(*d0, - 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - *d3 = _mm_srli_si128(*d0, - 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx -} - -static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *d0, __m128i *d1, - __m128i *d2, __m128i *d3, __m128i *d4, - __m128i *d5, __m128i *d6, - __m128i *d7) { - // input - // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx - // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx - // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx - // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx - // output - // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx - // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx - // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx - // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx - - __m128i w0, w1, ww0, ww1; - +// this function treats its input as 2 parallel 8x4 matrices, transposes each of +// them to 4x8 independently while flipping the second matrix horizontally. 
+// Used for 14 taps pq pairs creation +static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *q0p0, + __m128i *q1p1, __m128i *q2p2, + __m128i *q3p3, __m128i *q4p4, + __m128i *q5p5, __m128i *q6p6, + __m128i *q7p7) { + __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; w0 = _mm_unpacklo_epi8( *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 w1 = _mm_unpacklo_epi8( *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + w2 = _mm_unpackhi_epi8( + *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 + w3 = _mm_unpackhi_epi8( + *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 ww0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ww1 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - - *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx - *d1 = _mm_srli_si128(ww0, - 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx - *d2 = _mm_srli_si128(ww0, - 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx - *d3 = _mm_srli_si128(ww0, - 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx - - *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx - *d5 = _mm_srli_si128(ww1, - 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx - *d6 = _mm_srli_si128(ww1, - 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx - *d7 = _mm_srli_si128(ww1, - 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx -} - -static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *x4, __m128i *x5, - __m128i *x6, __m128i *x7, __m128i *d0, - __m128i *d1, __m128i *d2, - __m128i *d3) { - // input - // x0 00 01 02 03 04 05 06 07 - // x1 10 11 12 13 14 15 16 17 - // x2 20 21 22 23 24 25 26 27 - // x3 30 31 32 33 34 35 36 37 - // x4 40 41 42 43 44 45 46 47 - // x5 50 51 52 53 54 55 56 57 - // x6 60 61 
62 63 64 65 66 67 - // x7 70 71 72 73 74 75 76 77 - // output - // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx - // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx - // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx - // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx - - __m128i w0, w1, w2, w3, w4, w5; - - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - w2 = _mm_unpacklo_epi8( - *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - - w3 = _mm_unpacklo_epi8( - *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - - w4 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - w5 = _mm_unpacklo_epi16( - w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - - *d0 = _mm_unpacklo_epi32( - w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - *d1 = _mm_srli_si128(*d0, 8); - *d2 = _mm_unpackhi_epi32( - w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - *d3 = _mm_srli_si128(*d2, 8); -} - -static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *x4, __m128i *x5, - __m128i *x6, __m128i *x7, __m128i *d0d1, - __m128i *d2d3, __m128i *d4d5, - __m128i *d6d7) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7; - // x0 00 01 02 03 04 05 06 07 - // x1 10 11 12 13 14 15 16 17 - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - - // x2 20 21 22 23 24 25 26 27 - // x3 30 31 32 33 34 35 36 37 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - - // x4 40 41 42 43 44 45 46 47 - // x5 50 51 52 53 54 55 56 57 - w2 = _mm_unpacklo_epi8( - *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - - // x6 60 61 62 63 64 65 66 67 - // x7 70 71 72 73 74 75 76 77 - w3 = _mm_unpacklo_epi8( - *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - - 
w4 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - w5 = _mm_unpacklo_epi16( - w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - - *d0d1 = _mm_unpacklo_epi32( - w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - *d2d3 = _mm_unpackhi_epi32( - w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - - w6 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - w7 = _mm_unpackhi_epi16( - w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 - - *d4d5 = _mm_unpacklo_epi32( - w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 - *d6d7 = _mm_unpackhi_epi32( - w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 -} + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + ww2 = _mm_unpacklo_epi16( + w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 + ww3 = _mm_unpackhi_epi16( + w2, + w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 -static INLINE void transpose16x8_8x16_sse2( - __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, - __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, - __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, - __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, - __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; - __m128i w10, w11, w12, w13, w14, w15; - - w0 = _mm_unpacklo_epi8(*x0, *x1); - w1 = _mm_unpacklo_epi8(*x2, *x3); - w2 = _mm_unpacklo_epi8(*x4, *x5); - w3 = _mm_unpacklo_epi8(*x6, *x7); - - w8 = _mm_unpacklo_epi8(*x8, *x9); - w9 = _mm_unpacklo_epi8(*x10, *x11); - w10 = _mm_unpacklo_epi8(*x12, *x13); - w11 = _mm_unpacklo_epi8(*x14, *x15); - - w4 = _mm_unpacklo_epi16(w0, w1); - w5 = _mm_unpacklo_epi16(w2, w3); - w12 = _mm_unpacklo_epi16(w8, w9); - w13 = _mm_unpacklo_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = 
_mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store first 4-line result - *d0 = _mm_unpacklo_epi64(w6, w14); - *d1 = _mm_unpackhi_epi64(w6, w14); - *d2 = _mm_unpacklo_epi64(w7, w15); - *d3 = _mm_unpackhi_epi64(w7, w15); - - w4 = _mm_unpackhi_epi16(w0, w1); - w5 = _mm_unpackhi_epi16(w2, w3); - w12 = _mm_unpackhi_epi16(w8, w9); - w13 = _mm_unpackhi_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store second 4-line result - *d4 = _mm_unpacklo_epi64(w6, w14); - *d5 = _mm_unpackhi_epi64(w6, w14); - *d6 = _mm_unpacklo_epi64(w7, w15); - *d7 = _mm_unpackhi_epi64(w7, w15); + *q7p7 = _mm_unpacklo_epi32( + ww0, + _mm_srli_si128( + ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx + *q6p6 = _mm_unpackhi_epi32( + _mm_slli_si128(ww0, 4), + ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx + *q5p5 = _mm_unpackhi_epi32( + ww0, + _mm_slli_si128( + ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx + *q4p4 = _mm_unpacklo_epi32( + _mm_srli_si128(ww0, 12), + ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx + *q3p3 = _mm_unpacklo_epi32( + ww1, + _mm_srli_si128( + ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx + *q2p2 = _mm_unpackhi_epi32( + _mm_slli_si128(ww1, 4), + ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx + *q1p1 = _mm_unpackhi_epi32( + ww1, + _mm_slli_si128( + ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx + *q0p0 = _mm_unpacklo_epi32( + _mm_srli_si128(ww1, 12), + ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx } // this function treats its input as 2 parallel 8x4 matrices, transposes each of @@ -306,116 +138,6 @@ static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1, *pq3 = _mm_unpackhi_epi64(d2, d3); // pq } -static INLINE void transpose8x16_16x8_sse2( - __m128i *x0, __m128i *x1, __m128i *x2, __m128i 
*x3, __m128i *x4, - __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, - __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, - __m128i *d12d13, __m128i *d14d15) { - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; - __m128i w10, w11, w12, w13, w14, w15; - - w0 = _mm_unpacklo_epi8(*x0, *x1); - w1 = _mm_unpacklo_epi8(*x2, *x3); - w2 = _mm_unpacklo_epi8(*x4, *x5); - w3 = _mm_unpacklo_epi8(*x6, *x7); - - w8 = _mm_unpackhi_epi8(*x0, *x1); - w9 = _mm_unpackhi_epi8(*x2, *x3); - w10 = _mm_unpackhi_epi8(*x4, *x5); - w11 = _mm_unpackhi_epi8(*x6, *x7); - - w4 = _mm_unpacklo_epi16(w0, w1); - w5 = _mm_unpacklo_epi16(w2, w3); - w12 = _mm_unpacklo_epi16(w8, w9); - w13 = _mm_unpacklo_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store first 4-line result - *d0d1 = _mm_unpacklo_epi64(w6, w14); - *d2d3 = _mm_unpackhi_epi64(w6, w14); - *d4d5 = _mm_unpacklo_epi64(w7, w15); - *d6d7 = _mm_unpackhi_epi64(w7, w15); - - w4 = _mm_unpackhi_epi16(w0, w1); - w5 = _mm_unpackhi_epi16(w2, w3); - w12 = _mm_unpackhi_epi16(w8, w9); - w13 = _mm_unpackhi_epi16(w10, w11); - - w6 = _mm_unpacklo_epi32(w4, w5); - w7 = _mm_unpackhi_epi32(w4, w5); - w14 = _mm_unpacklo_epi32(w12, w13); - w15 = _mm_unpackhi_epi32(w12, w13); - - // Store second 4-line result - *d8d9 = _mm_unpacklo_epi64(w6, w14); - *d10d11 = _mm_unpackhi_epi64(w6, w14); - *d12d13 = _mm_unpacklo_epi64(w7, w15); - *d14d15 = _mm_unpackhi_epi64(w7, w15); -} - -// this function treats its input as 2 parallel 8x4 matrices, transposes each of -// them to 4x8 independently while flipping the second matrix horizontaly. 
Used -// for 14 taps pq pairs creation -static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2, - __m128i *x3, __m128i *q0p0, - __m128i *q1p1, __m128i *q2p2, - __m128i *q3p3, __m128i *q4p4, - __m128i *q5p5, __m128i *q6p6, - __m128i *q7p7) { - __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3; - w0 = _mm_unpacklo_epi8( - *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - w1 = _mm_unpacklo_epi8( - *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - w2 = _mm_unpackhi_epi8( - *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115 - w3 = _mm_unpackhi_epi8( - *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315 - - ww0 = _mm_unpacklo_epi16( - w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - ww1 = _mm_unpackhi_epi16( - w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 - ww2 = _mm_unpacklo_epi16( - w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311 - ww3 = _mm_unpackhi_epi16( - w2, - w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315 - - *q7p7 = _mm_unpacklo_epi32( - ww0, - _mm_srli_si128( - ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx - *q6p6 = _mm_unpackhi_epi32( - _mm_slli_si128(ww0, 4), - ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx - *q5p5 = _mm_unpackhi_epi32( - ww0, - _mm_slli_si128( - ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx - *q4p4 = _mm_unpacklo_epi32( - _mm_srli_si128(ww0, 12), - ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx - *q3p3 = _mm_unpacklo_epi32( - ww1, - _mm_srli_si128( - ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx - *q2p2 = _mm_unpackhi_epi32( - _mm_slli_si128(ww1, 4), - ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx - *q1p1 = _mm_unpackhi_epi32( - ww1, - _mm_slli_si128( - ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx - *q0p0 = _mm_unpacklo_epi32( - _mm_srli_si128(ww1, 12), - ww2); // 07 
17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx -} - static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0, __m128i *hev, __m128i *mask, __m128i *qs1qs0, __m128i *ps1ps0) { diff --git a/libaom/aom_dsp/x86/lpf_common_sse2.h b/libaom/aom_dsp/x86/lpf_common_sse2.h index 8970fe7..6ed2cbf 100644 --- a/libaom/aom_dsp/x86/lpf_common_sse2.h +++ b/libaom/aom_dsp/x86/lpf_common_sse2.h @@ -212,4 +212,284 @@ static INLINE void highbd_transpose8x16_sse2( d4 + 1, d5 + 1, d6 + 1, d7 + 1); } +// Low bit depth functions +static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, + __m128i *x2, __m128i *x3, + __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + *d0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + + *d1 = _mm_srli_si128(*d0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(*d0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(*d0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *d0, __m128i *d1, + __m128i *d2, __m128i *d3, __m128i *d4, + __m128i *d5, __m128i *d6, + __m128i *d7) { + // input + // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx + // x1 10 11 12 13 14 15 
16 17 xx xx xx xx xx xx xx xx + // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx + // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx + // output + // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx + + __m128i w0, w1, ww0, ww1; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + ww0 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + ww1 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + + *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx + *d1 = _mm_srli_si128(ww0, + 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx + *d2 = _mm_srli_si128(ww0, + 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx + *d3 = _mm_srli_si128(ww0, + 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx + + *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx + *d5 = _mm_srli_si128(ww1, + 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx + *d6 = _mm_srli_si128(ww1, + 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx + *d7 = _mm_srli_si128(ww1, + 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx +} + +static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0, + __m128i *d1, __m128i *d2, + __m128i *d3) { + // input + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + // x4 40 41 42 43 44 45 46 47 
+ // x5 50 51 52 53 54 55 56 57 + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + // output + // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx + // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx + // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx + + __m128i w0, w1, w2, w3, w4, w5; + + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d1 = _mm_srli_si128(*d0, 8); + *d2 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + *d3 = _mm_srli_si128(*d2, 8); +} + +static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, + __m128i *x3, __m128i *x4, __m128i *x5, + __m128i *x6, __m128i *x7, __m128i *d0d1, + __m128i *d2d3, __m128i *d4d5, + __m128i *d6d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7; + // x0 00 01 02 03 04 05 06 07 + // x1 10 11 12 13 14 15 16 17 + w0 = _mm_unpacklo_epi8( + *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + + // x2 20 21 22 23 24 25 26 27 + // x3 30 31 32 33 34 35 36 37 + w1 = _mm_unpacklo_epi8( + *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + + // x4 40 41 42 43 44 45 46 47 + // x5 50 51 52 53 54 55 56 57 + w2 = _mm_unpacklo_epi8( + *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + + // x6 60 61 62 63 64 65 66 67 + // x7 70 71 72 73 74 75 76 77 + w3 = _mm_unpacklo_epi8( + *x6, *x7); // 60 70 
61 71 62 72 63 73 64 74 65 75 66 76 67 77 + + w4 = _mm_unpacklo_epi16( + w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + w5 = _mm_unpacklo_epi16( + w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + + *d0d1 = _mm_unpacklo_epi32( + w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + *d2d3 = _mm_unpackhi_epi32( + w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + + w6 = _mm_unpackhi_epi16( + w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + w7 = _mm_unpackhi_epi16( + w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + + *d4d5 = _mm_unpacklo_epi32( + w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + *d6d7 = _mm_unpackhi_epi32( + w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 +} + +static INLINE void transpose16x8_8x16_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, + __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, + __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, + __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpacklo_epi8(*x8, *x9); + w9 = _mm_unpacklo_epi8(*x10, *x11); + w10 = _mm_unpacklo_epi8(*x12, *x13); + w11 = _mm_unpacklo_epi8(*x14, *x15); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0 = _mm_unpacklo_epi64(w6, w14); + *d1 = _mm_unpackhi_epi64(w6, w14); + *d2 = _mm_unpacklo_epi64(w7, w15); + *d3 = 
_mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d4 = _mm_unpacklo_epi64(w6, w14); + *d5 = _mm_unpackhi_epi64(w6, w14); + *d6 = _mm_unpacklo_epi64(w7, w15); + *d7 = _mm_unpackhi_epi64(w7, w15); +} + +static INLINE void transpose8x16_16x8_sse2( + __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, + __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, + __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, + __m128i *d12d13, __m128i *d14d15) { + __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; + __m128i w10, w11, w12, w13, w14, w15; + + w0 = _mm_unpacklo_epi8(*x0, *x1); + w1 = _mm_unpacklo_epi8(*x2, *x3); + w2 = _mm_unpacklo_epi8(*x4, *x5); + w3 = _mm_unpacklo_epi8(*x6, *x7); + + w8 = _mm_unpackhi_epi8(*x0, *x1); + w9 = _mm_unpackhi_epi8(*x2, *x3); + w10 = _mm_unpackhi_epi8(*x4, *x5); + w11 = _mm_unpackhi_epi8(*x6, *x7); + + w4 = _mm_unpacklo_epi16(w0, w1); + w5 = _mm_unpacklo_epi16(w2, w3); + w12 = _mm_unpacklo_epi16(w8, w9); + w13 = _mm_unpacklo_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store first 4-line result + *d0d1 = _mm_unpacklo_epi64(w6, w14); + *d2d3 = _mm_unpackhi_epi64(w6, w14); + *d4d5 = _mm_unpacklo_epi64(w7, w15); + *d6d7 = _mm_unpackhi_epi64(w7, w15); + + w4 = _mm_unpackhi_epi16(w0, w1); + w5 = _mm_unpackhi_epi16(w2, w3); + w12 = _mm_unpackhi_epi16(w8, w9); + w13 = _mm_unpackhi_epi16(w10, w11); + + w6 = _mm_unpacklo_epi32(w4, w5); + w7 = _mm_unpackhi_epi32(w4, w5); + w14 = _mm_unpacklo_epi32(w12, w13); + w15 = _mm_unpackhi_epi32(w12, w13); + + // Store second 4-line result + *d8d9 = 
_mm_unpacklo_epi64(w6, w14); + *d10d11 = _mm_unpackhi_epi64(w6, w14); + *d12d13 = _mm_unpacklo_epi64(w7, w15); + *d14d15 = _mm_unpackhi_epi64(w7, w15); +} + #endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ diff --git a/libaom/aom_dsp/x86/quantize_sse2.c b/libaom/aom_dsp/x86/quantize_sse2.c index d3de6e2..ebef1fb 100644 --- a/libaom/aom_dsp/x86/quantize_sse2.c +++ b/libaom/aom_dsp/x86/quantize_sse2.c @@ -18,28 +18,6 @@ #include "aom/aom_integer.h" #include "aom_dsp/x86/quantize_x86.h" -static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { - assert(sizeof(tran_low_t) == 4); - - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -} - -static INLINE void store_coefficients(__m128i coeff_vals, - tran_low_t *coeff_ptr) { - assert(sizeof(tran_low_t) == 4); - - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -} - void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, diff --git a/libaom/aom_dsp/x86/quantize_ssse3.c b/libaom/aom_dsp/x86/quantize_ssse3.c new file mode 100644 index 0000000..25980a0 --- /dev/null +++ b/libaom/aom_dsp/x86/quantize_ssse3.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <tmmintrin.h> +#include <emmintrin.h> +#include <xmmintrin.h> + +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/quantize_x86.h" + +static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round, + const __m128i quant, + const __m128i *shift) { + __m128i tmp, qcoeff, tmp1; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + tmp = _mm_mullo_epi16(qcoeff, *shift); + tmp = _mm_srli_epi16(tmp, 14); + tmp1 = _mm_mulhi_epi16(qcoeff, *shift); + tmp1 = _mm_slli_epi16(tmp1, 2); + *coeff = _mm_or_si128(tmp, tmp1); +} + +static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff, + const __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff) { + // Un-sign to bias rounding like C. + const __m128i coeff = _mm_abs_epi16(qcoeff); + + const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); + const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + // "Divide" by 4. 
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +} + +void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i two = _mm_set1_epi16(2); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1, all_zero; + __m128i eob = zero, eob0; + + (void)scan; + (void)n_coeffs; + + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Shift with rounding. + zbin = _mm_add_epi16(zbin, two); + round = _mm_add_epi16(round, two); + zbin = _mm_srli_epi16(zbin, 2); + round = _mm_srli_epi16(round, 2); + zbin = _mm_sub_epi16(zbin, one); + // Do DC and first 15 AC. 
+ coeff0 = load_coefficients(coeff_ptr); + coeff1 = load_coefficients(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr); + store_coefficients(qcoeff1, qcoeff_ptr + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr); + dequant = _mm_unpackhi_epi64(dequant, dequant); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); + } + + // AC only loop. 
+ for (index = 16; index < 1024; index += 16) { + coeff0 = load_coefficients(coeff_ptr + index); + coeff1 = load_coefficients(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); + continue; + } + calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift); + calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_coefficients(qcoeff0, qcoeff_ptr + index); + store_coefficients(qcoeff1, qcoeff_ptr + index + 8); + + calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); + + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libaom/aom_dsp/x86/quantize_x86.h b/libaom/aom_dsp/x86/quantize_x86.h index 4eed7dd..b2de01b 100644 --- a/libaom/aom_dsp/x86/quantize_x86.h +++ b/libaom/aom_dsp/x86/quantize_x86.h @@ -32,6 +32,11 @@ static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { return _mm_sub_epi16(a, sign); } +static INLINE 
__m128i invert_sign_32_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi32(a, sign); +} + static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, const __m128i quant, const __m128i shift) { __m128i tmp, qcoeff; @@ -41,10 +46,53 @@ static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, *coeff = _mm_mulhi_epi16(qcoeff, shift); } +static INLINE void calculate_qcoeff_log_scale(__m128i *coeff, + const __m128i round, + const __m128i quant, + const __m128i *shift, + const int *log_scale) { + __m128i tmp, tmp1, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + tmp = _mm_mullo_epi16(qcoeff, *shift); + tmp = _mm_srli_epi16(tmp, (16 - *log_scale)); + tmp1 = _mm_mulhi_epi16(qcoeff, *shift); + tmp1 = _mm_slli_epi16(tmp1, *log_scale); + *coeff = _mm_or_si128(tmp, tmp1); +} + static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { return _mm_mullo_epi16(qcoeff, dequant); } +static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff, + __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff, + const int *log_scale) { + // calculate abs + __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15); + __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign); + + const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, zero); + const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, zero); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale); + + dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0); + dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i 
*)(dqcoeff + 4), dqcoeff32_1); +} + // Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing // to zbin to add 1 to the index in 'scan'. static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, @@ -75,3 +123,23 @@ static INLINE int16_t accumulate_eob(__m128i eob) { eob = _mm_max_epi16(eob, eob_shuffled); return _mm_extract_epi16(eob, 1); } + +static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr)); + const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4)); + return _mm_packs_epi32(coeff1, coeff2); +} + +static INLINE void store_coefficients(__m128i coeff_vals, + tran_low_t *coeff_ptr) { + assert(sizeof(tran_low_t) == 4); + + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); +} diff --git a/libaom/aom_dsp/x86/sse_avx2.c b/libaom/aom_dsp/x86/sse_avx2.c index fa45687..42df981 100644 --- a/libaom/aom_dsp/x86/sse_avx2.c +++ b/libaom/aom_dsp/x86/sse_avx2.c @@ -21,12 +21,11 @@ static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, const uint8_t *b) { const __m256i v_a0 = yy_loadu_256(a); const __m256i v_b0 = yy_loadu_256(b); - const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0)); - const __m256i v_a01_w = - _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1)); - const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0)); - const __m256i v_b01_w = - _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1)); + const __m256i zero = _mm256_setzero_si256(); + const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, 
zero); + const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); + const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); + const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); @@ -35,15 +34,13 @@ static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { int64_t sum; - const __m256i sum0_4x64 = - _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all)); - const __m256i sum1_4x64 = - _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1)); + __m256i zero = _mm256_setzero_si256(); + const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); + const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), _mm256_extracti128_si256(sum_4x64, 1)); const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); - xx_storel_64(&sum, sum_1x64); return sum; } @@ -86,7 +83,6 @@ static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } - static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, __m256i *sum) { const __m128i v_a0 = xx_loadl_64(a); @@ -98,12 +94,12 @@ static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); } - int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height) { int32_t y = 0; int64_t sse = 0; __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); switch (width) { case 
4: do { @@ -126,14 +122,26 @@ int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, case 16: do { const __m128i v_a0 = xx_loadu_128(a); + const __m128i v_a1 = xx_loadu_128(a + a_stride); const __m128i v_b0 = xx_loadu_128(b); - const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0); - const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0); - const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); - sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w)); - a += a_stride; - b += b_stride; - y += 1; + const __m128i v_b1 = xx_loadu_128(b + b_stride); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; } while (y < height); sse = summary_all_avx2(&sum); break; diff --git a/libaom/aom_dsp/x86/txfm_common_avx2.h b/libaom/aom_dsp/x86/txfm_common_avx2.h index 8a40508..06a77e7 100644 --- a/libaom/aom_dsp/x86/txfm_common_avx2.h +++ b/libaom/aom_dsp/x86/txfm_common_avx2.h @@ -168,6 +168,36 @@ static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31); } +static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in, + __m256i *const out) { + const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]); + const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]); + const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]); + const __m256i 
a3 = _mm256_unpacklo_epi16(in[6], in[7]); + const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]); + const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]); + const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]); + const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]); + + const __m256i b0 = _mm256_unpacklo_epi32(a0, a1); + const __m256i b1 = _mm256_unpacklo_epi32(a2, a3); + const __m256i b2 = _mm256_unpacklo_epi32(a4, a5); + const __m256i b3 = _mm256_unpacklo_epi32(a6, a7); + const __m256i b4 = _mm256_unpackhi_epi32(a0, a1); + const __m256i b5 = _mm256_unpackhi_epi32(a2, a3); + const __m256i b6 = _mm256_unpackhi_epi32(a4, a5); + const __m256i b7 = _mm256_unpackhi_epi32(a6, a7); + + out[0] = _mm256_unpacklo_epi64(b0, b1); + out[1] = _mm256_unpackhi_epi64(b0, b1); + out[2] = _mm256_unpacklo_epi64(b4, b5); + out[3] = _mm256_unpackhi_epi64(b4, b5); + out[4] = _mm256_unpacklo_epi64(b2, b3); + out[5] = _mm256_unpackhi_epi64(b2, b3); + out[6] = _mm256_unpacklo_epi64(b6, b7); + out[7] = _mm256_unpackhi_epi64(b6, b7); +} + static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) { for (int i = 0; i < size; ++i) { out[size - i - 1] = in[i]; @@ -236,6 +266,66 @@ static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input, } } +static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) { + const __m256i scale_rounding = + pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1)); + const __m256i b = _mm256_madd_epi16(a, scale_rounding); + return _mm256_srai_epi32(b, NewSqrt2Bits); +} + +static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a, + int32_t *const b) { + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_lo = _mm256_unpacklo_epi16(a, one); + const __m256i a_hi = _mm256_unpackhi_epi16(a, one); + const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2); + const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31); + _mm_store_si128((__m128i *)b, 
_mm256_castsi256_si128(b_lo)); + _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi)); + _mm256_store_si256((__m256i *)(b + 64), temp); +} + +static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2( + const __m256i *const in, int32_t *const out, const int stride, + const int out_size) { + for (int i = 0; i < out_size; ++i) { + store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride); + } +} + +static INLINE void pack_reg(const __m128i *in1, const __m128i *in2, + __m256i *out) { + out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1); + out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1); + out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1); + out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1); + out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1); + out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1); + out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1); + out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1); +} + +static INLINE void extract_reg(const __m256i *in, __m128i *out1) { + out1[0] = _mm256_castsi256_si128(in[0]); + out1[1] = _mm256_castsi256_si128(in[1]); + out1[2] = _mm256_castsi256_si128(in[2]); + out1[3] = _mm256_castsi256_si128(in[3]); + out1[4] = _mm256_castsi256_si128(in[4]); + out1[5] = _mm256_castsi256_si128(in[5]); + out1[6] = _mm256_castsi256_si128(in[6]); + out1[7] = _mm256_castsi256_si128(in[7]); + + out1[8] = _mm256_extracti128_si256(in[0], 0x01); + out1[9] = _mm256_extracti128_si256(in[1], 0x01); + out1[10] = _mm256_extracti128_si256(in[2], 0x01); + out1[11] = _mm256_extracti128_si256(in[3], 0x01); + out1[12] = _mm256_extracti128_si256(in[4], 0x01); + out1[13] = _mm256_extracti128_si256(in[5], 0x01); + out1[14] = _mm256_extracti128_si256(in[6], 0x01); + out1[15] = _mm256_extracti128_si256(in[7], 0x01); +} + #ifdef 
__cplusplus } #endif diff --git a/libaom/aom_dsp/x86/variance_sse2.c b/libaom/aom_dsp/x86/variance_sse2.c index c831e3e..f3efc15 100644 --- a/libaom/aom_dsp/x86/variance_sse2.c +++ b/libaom/aom_dsp/x86/variance_sse2.c @@ -494,7 +494,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm, const int ref_num = 0; const int is_intrabc = is_intrabc_block(mi); const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf; + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num]; const int is_scaled = av1_is_scaled(sf); if (is_scaled) { diff --git a/libaom/aom_ports/mem.h b/libaom/aom_ports/mem.h index 3ffea3c..9e3d424 100644 --- a/libaom/aom_ports/mem.h +++ b/libaom/aom_ports/mem.h @@ -66,4 +66,34 @@ #define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1)) #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1)) +/*!\brief force enum to be unsigned 1 byte*/ +#define UENUM1BYTE(enumvar) \ + ; \ + typedef uint8_t enumvar + +/*!\brief force enum to be signed 1 byte*/ +#define SENUM1BYTE(enumvar) \ + ; \ + typedef int8_t enumvar + +/*!\brief force enum to be unsigned 2 byte*/ +#define UENUM2BYTE(enumvar) \ + ; \ + typedef uint16_t enumvar + +/*!\brief force enum to be signed 2 byte*/ +#define SENUM2BYTE(enumvar) \ + ; \ + typedef int16_t enumvar + +/*!\brief force enum to be unsigned 4 byte*/ +#define UENUM4BYTE(enumvar) \ + ; \ + typedef uint32_t enumvar + +/*!\brief force enum to be unsigned 4 byte*/ +#define SENUM4BYTE(enumvar) \ + ; \ + typedef int32_t enumvar + #endif // AOM_AOM_PORTS_MEM_H_ diff --git a/libaom/aom_ports/x86.h b/libaom/aom_ports/x86.h index 52ee49c..8c18448 100644 --- a/libaom/aom_ports/x86.h +++ b/libaom/aom_ports/x86.h @@ -222,11 +222,26 @@ static INLINE int x86_simd_caps(void) { return flags & mask; } -// Note: -// 32-bit CPU cycle counter is light-weighted for most function performance -// measurement. 
For large function (CPU time > a couple of seconds), 64-bit -// counter should be used. -// 32-bit CPU cycle counter +// Fine-Grain Measurement Functions +// +// If you are a timing a small region of code, access the timestamp counter +// (TSC) via: +// +// unsigned int start = x86_tsc_start(); +// ... +// unsigned int end = x86_tsc_end(); +// unsigned int diff = end - start; +// +// The start/end functions introduce a few more instructions than using +// x86_readtsc directly, but prevent the CPU's out-of-order execution from +// affecting the measurement (by having earlier/later instructions be evaluated +// in the time interval). See the white paper, "How to Benchmark Code +// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by +// Gabriele Paoloni for more information. +// +// If you are timing a large function (CPU time > a couple of seconds), use +// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The +// out-of-order leakage that can occur is minimal compared to total runtime. static INLINE unsigned int x86_readtsc(void) { #if defined(__GNUC__) && __GNUC__ unsigned int tsc; @@ -263,6 +278,41 @@ static INLINE uint64_t x86_readtsc64(void) { #endif } +// 32-bit CPU cycle counter with a partial fence against out-of-order execution. 
+static INLINE unsigned int x86_readtscp(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tscp; + __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tscp; + asm volatile("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(_MSC_VER) + unsigned int ui; + return (unsigned int)__rdtscp(&ui); +#else +#if ARCH_X86_64 + return (unsigned int)__rdtscp(); +#else + __asm rdtscp; +#endif +#endif +} + +static INLINE unsigned int x86_tsc_start(void) { + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return x86_readtsc(); +} + +static INLINE unsigned int x86_tsc_end(void) { + uint32_t v = x86_readtscp(); + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return v; +} + #if defined(__GNUC__) && __GNUC__ #define x86_pause_hint() __asm__ __volatile__("pause \n\t") #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) diff --git a/libaom/aom_scale/aom_scale.cmake b/libaom/aom_scale/aom_scale.cmake index 197dea6..3199733 100644 --- a/libaom/aom_scale/aom_scale.cmake +++ b/libaom/aom_scale/aom_scale.cmake @@ -34,5 +34,9 @@ function(setup_aom_scale_targets) "AOM_SCALE_INTRIN_DSPR2" "aom") endif() + target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>) + + # Pass the new lib targets up to the parent scope instance of + # $AOM_LIB_TARGETS. 
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_scale PARENT_SCOPE) endfunction() diff --git a/libaom/aom_scale/aom_scale_rtcd.pl b/libaom/aom_scale/aom_scale_rtcd.pl index 27378c7..eef6f16 100644 --- a/libaom/aom_scale/aom_scale_rtcd.pl +++ b/libaom/aom_scale/aom_scale_rtcd.pl @@ -26,6 +26,8 @@ if (aom_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") { add_proto qw/void aom_vertical_band_2_1_scale_i/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width"; } +add_proto qw/int aom_yv12_realloc_with_new_border/, "struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes"; + add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes"; add_proto qw/void aom_yv12_copy_frame/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes"; diff --git a/libaom/aom_scale/generic/yv12config.c b/libaom/aom_scale/generic/yv12config.c index 7cf3c4f..a5ad1a7 100644 --- a/libaom/aom_scale/generic/yv12config.c +++ b/libaom/aom_scale/generic/yv12config.c @@ -46,37 +46,16 @@ int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) { return 0; } -int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, - int ss_x, int ss_y, int use_highbitdepth, - int border, int byte_alignment, - aom_codec_frame_buffer_t *fb, - aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) { -#if CONFIG_SIZE_LIMIT - if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1; -#endif - - /* Only support allocating buffers that have a border that's a multiple - * of 32. The border restriction is required to get 16-byte alignment of - * the start of the chroma rows without introducing an arbitrary gap - * between planes, which would break the semantics of things like - * aom_img_set_rect(). 
*/ - if (border & 0x1f) return -3; - +static int realloc_frame_buffer_aligned( + YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, + int use_highbitdepth, int border, int byte_alignment, + aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb, + void *cb_priv, const int y_stride, const uint64_t yplane_size, + const uint64_t uvplane_size, const int aligned_width, + const int aligned_height, const int uv_width, const int uv_height, + const int uv_stride, const int uv_border_w, const int uv_border_h) { if (ybf) { const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment; - const int aligned_width = (width + 7) & ~7; - const int aligned_height = (height + 7) & ~7; - const int y_stride = ((aligned_width + 2 * border) + 31) & ~31; - const uint64_t yplane_size = - (aligned_height + 2 * border) * (uint64_t)y_stride + byte_alignment; - const int uv_width = aligned_width >> ss_x; - const int uv_height = aligned_height >> ss_y; - const int uv_stride = y_stride >> ss_x; - const int uv_border_w = border >> ss_x; - const int uv_border_h = border >> ss_y; - const uint64_t uvplane_size = - (uv_height + 2 * uv_border_h) * (uint64_t)uv_stride + byte_alignment; - const uint64_t frame_size = (1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size); @@ -120,6 +99,7 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, // Allocation to hold larger frame, or first allocation. 
aom_free(ybf->buffer_alloc); ybf->buffer_alloc = NULL; + ybf->buffer_alloc_sz = 0; if (frame_size != (size_t)frame_size) return -1; @@ -190,6 +170,111 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, return -2; } +static int calc_stride_and_planesize(const int ss_x, const int ss_y, + const int aligned_width, + const int aligned_height, const int border, + const int byte_alignment, int *y_stride, + int *uv_stride, uint64_t *yplane_size, + uint64_t *uvplane_size, + const int uv_height) { + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * aom_img_set_rect(). */ + if (border & 0x1f) return -3; + *y_stride = ((aligned_width + 2 * border) + 31) & ~31; + *yplane_size = + (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment; + + *uv_stride = *y_stride >> ss_x; + *uvplane_size = (uv_height + 2 * (border >> ss_y)) * (uint64_t)(*uv_stride) + + byte_alignment; + return 0; +} + +int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, int use_highbitdepth, + int border, int byte_alignment, + aom_codec_frame_buffer_t *fb, + aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1; +#endif + + if (ybf) { + int y_stride = 0; + int uv_stride = 0; + uint64_t yplane_size = 0; + uint64_t uvplane_size = 0; + const int aligned_width = (width + 7) & ~7; + const int aligned_height = (height + 7) & ~7; + const int uv_width = aligned_width >> ss_x; + const int uv_height = aligned_height >> ss_y; + const int uv_border_w = border >> ss_x; + const int uv_border_h = border >> ss_y; + + int error = calc_stride_and_planesize( + ss_x, ss_y, aligned_width, aligned_height, border, 
byte_alignment, + &y_stride, &uv_stride, &yplane_size, &uvplane_size, uv_height); + if (error) return error; + return realloc_frame_buffer_aligned( + ybf, width, height, ss_x, ss_y, use_highbitdepth, border, + byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size, + aligned_width, aligned_height, uv_width, uv_height, uv_stride, + uv_border_w, uv_border_h); + } + return -2; +} + +// TODO(anyone): This function allocates memory for +// lookahead buffer considering height and width is +// aligned to 128. Currently variance calculation of +// simple_motion_search_get_best_ref() function is done +// for full sb size (i.e integral multiple of max sb +// size = 128 or 64). Hence partial sbs need up to 127 +// pixels beyond frame boundary. 128 aligned limitation of +// lookahead buffer can be removed if variance calculation +// is adjusted for partial sbs + +// NOTE: Chroma width and height need not be aligned to +// 128 since variance calculation happens only for luma plane +int aom_realloc_lookahead_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, int use_highbitdepth, + int border, int byte_alignment, + aom_codec_frame_buffer_t *fb, + aom_get_frame_buffer_cb_fn_t cb, + void *cb_priv) { + if (ybf) { + int y_stride = 0; + int uv_stride = 0; + uint64_t yplane_size = 0; + uint64_t uvplane_size = 0; + const int aligned_128_width = (width + 127) & ~127; + const int aligned_128_height = (height + 127) & ~127; + const int aligned_width = (width + 7) & ~7; + const int aligned_height = (height + 7) & ~7; + const int uv_64_height = aligned_128_height >> ss_y; + const int uv_width = aligned_width >> ss_x; + const int uv_height = aligned_height >> ss_y; + const int uv_border_w = border >> ss_x; + const int uv_border_h = border >> ss_y; + + int error = calc_stride_and_planesize( + ss_x, ss_y, aligned_128_width, aligned_128_height, border, + byte_alignment, &y_stride, &uv_stride, &yplane_size, &uvplane_size, + uv_64_height); + if (error) 
return error; + + return realloc_frame_buffer_aligned( + ybf, width, height, ss_x, ss_y, use_highbitdepth, border, + byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size, + aligned_width, aligned_height, uv_width, uv_height, uv_stride, + uv_border_w, uv_border_h); + } + return -2; +} + int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, int use_highbitdepth, int border, int byte_alignment) { diff --git a/libaom/aom_scale/generic/yv12extend.c b/libaom/aom_scale/generic/yv12extend.c index 127ca23..6e9cfff 100644 --- a/libaom/aom_scale/generic/yv12extend.c +++ b/libaom/aom_scale/generic/yv12extend.c @@ -434,3 +434,28 @@ void aom_yv12_partial_coloc_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, aom_yv12_partial_copy_v_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart, vstart); } + +int aom_yv12_realloc_with_new_border_c(YV12_BUFFER_CONFIG *ybf, int new_border, + int byte_alignment, int num_planes) { + if (ybf) { + if (new_border == ybf->border) return 0; + YV12_BUFFER_CONFIG new_buf; + memset(&new_buf, 0, sizeof(new_buf)); + const int error = aom_alloc_frame_buffer( + &new_buf, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x, + ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border, + byte_alignment); + if (error) return error; + // Copy image buffer + aom_yv12_copy_frame(ybf, &new_buf, num_planes); + + // Extend up to new border + aom_extend_frame_borders(&new_buf, num_planes); + + // Now free the old buffer and replace with the new + aom_free_frame_buffer(ybf); + memcpy(ybf, &new_buf, sizeof(new_buf)); + return 0; + } + return -2; +} diff --git a/libaom/aom_scale/yv12config.h b/libaom/aom_scale/yv12config.h index 10c6ad5..04a1c04 100644 --- a/libaom/aom_scale/yv12config.h +++ b/libaom/aom_scale/yv12config.h @@ -24,15 +24,10 @@ extern "C" { #define AOMINNERBORDERINPIXELS 160 #define AOM_INTERP_EXTEND 4 - -// TODO(jingning): Use unified inter predictor for encoder and -// decoder during 
the development process. Revisit the frame border -// to improve the decoder performance. -#if CONFIG_REDUCED_ENCODER_BORDER -#define AOM_BORDER_IN_PIXELS 160 -#else #define AOM_BORDER_IN_PIXELS 288 -#endif // CONFIG_REDUCED_ENCODER_BORDER +#define AOM_ENC_NO_SCALE_BORDER 160 +#define AOM_ENC_LOOKAHEAD_BORDER 64 +#define AOM_DEC_BORDER_IN_PIXELS 64 typedef struct yv12_buffer_config { union { @@ -102,7 +97,7 @@ typedef struct yv12_buffer_config { aom_color_primaries_t color_primaries; aom_transfer_characteristics_t transfer_characteristics; aom_matrix_coefficients_t matrix_coefficients; - int monochrome; + uint8_t monochrome; aom_chroma_sample_position_t chroma_sample_position; aom_color_range_t color_range; int render_width; @@ -130,6 +125,14 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border, int byte_alignment, aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb, void *cb_priv); + +int aom_realloc_lookahead_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, + int ss_x, int ss_y, int use_highbitdepth, + int border, int byte_alignment, + aom_codec_frame_buffer_t *fb, + aom_get_frame_buffer_cb_fn_t cb, + void *cb_priv); + int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf); #ifdef __cplusplus diff --git a/libaom/apps/aomdec.c b/libaom/apps/aomdec.c index 58ac172..549c4da 100644 --- a/libaom/apps/aomdec.c +++ b/libaom/apps/aomdec.c @@ -484,6 +484,7 @@ static int main_loop(int argc, const char **argv_) { input.webm_ctx = &webm_ctx; #endif struct ObuDecInputContext obu_ctx = { NULL, NULL, 0, 0, 0 }; + int is_ivf = 0; obu_ctx.avx_ctx = &aom_input_ctx; input.obu_ctx = &obu_ctx; @@ -610,8 +611,10 @@ static int main_loop(int argc, const char **argv_) { #endif input.aom_input_ctx->filename = fn; input.aom_input_ctx->file = infile; - if (file_is_ivf(input.aom_input_ctx)) + if (file_is_ivf(input.aom_input_ctx)) { input.aom_input_ctx->file_type = FILE_TYPE_IVF; + is_ivf = 1; + } #if CONFIG_WEBM_IO else if 
(file_is_webm(input.webm_ctx, input.aom_input_ctx)) input.aom_input_ctx->file_type = FILE_TYPE_WEBM; @@ -661,6 +664,10 @@ static int main_loop(int argc, const char **argv_) { } fourcc_interface = get_aom_decoder_by_fourcc(aom_input_ctx.fourcc); + + if (is_ivf && !fourcc_interface) + fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc); + if (interface && fourcc_interface && interface != fourcc_interface) warn("Header indicates codec: %s\n", fourcc_interface->name); else @@ -844,7 +851,7 @@ static int main_loop(int argc, const char **argv_) { } // Default to codec bit depth if output bit depth not set unsigned int output_bit_depth; - if (!fixed_output_bit_depth && single_file && !do_md5) { + if (!fixed_output_bit_depth && single_file) { output_bit_depth = img->bit_depth; } else { output_bit_depth = fixed_output_bit_depth; diff --git a/libaom/apps/aomenc.c b/libaom/apps/aomenc.c index 4680d3a..08bf08d 100644 --- a/libaom/apps/aomenc.c +++ b/libaom/apps/aomenc.c @@ -144,16 +144,14 @@ static const arg_def_t pass_arg = ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)"); static const arg_def_t fpf_name = ARG_DEF(NULL, "fpf", 1, "First pass statistics file name"); -#if CONFIG_FP_MB_STATS -static const arg_def_t fpmbf_name = - ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name"); -#endif static const arg_def_t limit = ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames"); static const arg_def_t skip = ARG_DEF(NULL, "skip", 1, "Skip the first n input frames"); static const arg_def_t good_dl = ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline"); +static const arg_def_t rt_dl = + ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline"); static const arg_def_t quietarg = ARG_DEF("q", "quiet", 0, "Do not print encode progress"); static const arg_def_t verbosearg = @@ -219,6 +217,7 @@ static const arg_def_t *main_args[] = { &help, &limit, &skip, &good_dl, + &rt_dl, &quietarg, &verbosearg, &psnrarg, @@ -263,9 +262,9 @@ static const arg_def_t 
global_error_resilient = "Enable global error resiliency features"); static const arg_def_t lag_in_frames = ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag"); -static const arg_def_t large_scale_tile = - ARG_DEF(NULL, "large-scale-tile", 1, - "Large scale tile coding (0: off (default), 1: on)"); +static const arg_def_t large_scale_tile = ARG_DEF( + NULL, "large-scale-tile", 1, + "Large scale tile coding (0: off (default), 1: on (ivf output only))"); static const arg_def_t monochrome = ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)"); static const arg_def_t full_still_picture_hdr = ARG_DEF( @@ -415,7 +414,7 @@ static const arg_def_t cpu_used_av1 = ARG_DEF(NULL, "cpu-used", 1, "CPU Used (0..8)"); static const arg_def_t rowmtarg = ARG_DEF(NULL, "row-mt", 1, - "Enable row based multi-threading (0: off (default), 1: on)"); + "Enable row based multi-threading (0: off, 1: on (default))"); static const arg_def_t tile_cols = ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"); static const arg_def_t tile_rows = @@ -437,10 +436,121 @@ static const arg_def_t enable_restoration = ARG_DEF(NULL, "enable-restoration", 1, "Enable the loop restoration filter (0: false, " "1: true (default))"); +static const arg_def_t enable_rect_partitions = + ARG_DEF(NULL, "enable-rect-partitions", 1, + "Enable rectangular partitions " + "(0: false, 1: true (default))"); +static const arg_def_t enable_ab_partitions = + ARG_DEF(NULL, "enable-ab-partitions", 1, + "Enable ab partitions (0: false, 1: true (default))"); +static const arg_def_t enable_1to4_partitions = + ARG_DEF(NULL, "enable-1to4-partitions", 1, + "Enable 1:4 and 4:1 partitions " + "(0: false, 1: true (default))"); +static const arg_def_t min_partition_size = + ARG_DEF(NULL, "min-partition-size", 4, + "Set min partition size " + "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)"); +static const arg_def_t max_partition_size = + ARG_DEF(NULL, "max-partition-size", 128, + 
"Set max partition size " + "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)"); +static const arg_def_t enable_dual_filter = + ARG_DEF(NULL, "enable-dual-filter", 1, + "Enable dual filter " + "(0: false, 1: true (default))"); +static const arg_def_t enable_intra_edge_filter = + ARG_DEF(NULL, "enable-intra-edge-filter", 1, + "Enable intra edge filtering " + "(0: false, 1: true (default))"); +static const arg_def_t enable_order_hint = + ARG_DEF(NULL, "enable-order-hint", 1, + "Enable order hint " + "(0: false, 1: true (default))"); +static const arg_def_t enable_tx64 = + ARG_DEF(NULL, "enable-tx64", 1, + "Enable 64-pt transform (0: false, 1: true (default))"); +static const arg_def_t tx_size_search_method = + ARG_DEF(NULL, "tx-size-search-method", 0, + "Set transform block size search method " + "(0: Full RD (default), 1: Fast RD, 2: use largest allowed)"); +static const arg_def_t enable_flip_idtx = + ARG_DEF(NULL, "enable-flip-idtx", 1, + "Enable extended transform type (0: false, 1: true (default)) " + "including FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, " + "ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, " + "H_ADST, V_FLIPADST, H_FLIPADST"); +static const arg_def_t enable_dist_wtd_comp = + ARG_DEF(NULL, "enable-dist-wtd-comp", 1, + "Enable distance-weighted compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_masked_comp = + ARG_DEF(NULL, "enable-masked-comp", 1, + "Enable masked (wedge/diff-wtd) compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_onesided_comp = + ARG_DEF(NULL, "enable-onesided-comp", 1, + "Enable one sided compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_interintra_comp = + ARG_DEF(NULL, "enable-interintra-comp", 1, + "Enable interintra compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_smooth_interintra = + ARG_DEF(NULL, "enable-smooth-interintra", 1, + "Enable smooth interintra mode " + "(0: false, 1: true 
(default))"); +static const arg_def_t enable_diff_wtd_comp = + ARG_DEF(NULL, "enable-diff-wtd-comp", 1, + "Enable difference-weighted compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_interinter_wedge = + ARG_DEF(NULL, "enable-interinter-wedge", 1, + "Enable interinter wedge compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_interintra_wedge = + ARG_DEF(NULL, "enable-interintra-wedge", 1, + "Enable interintra wedge compound " + "(0: false, 1: true (default))"); +static const arg_def_t enable_global_motion = + ARG_DEF(NULL, "enable-global-motion", 1, + "Enable global motion " + "(0: false, 1: true (default))"); +static const arg_def_t enable_warped_motion = + ARG_DEF(NULL, "enable-warped-motion", 1, + "Enable local warped motion " + "(0: false, 1: true (default))"); +static const arg_def_t enable_filter_intra = + ARG_DEF(NULL, "enable-filter-intra", 1, + "Enable filter intra prediction mode " + "(0: false, 1: true (default))"); +static const arg_def_t enable_smooth_intra = + ARG_DEF(NULL, "enable-smooth-intra", 1, + "Enable smooth intra prediction modes " + "(0: false, 1: true (default))"); +static const arg_def_t enable_paeth_intra = + ARG_DEF(NULL, "enable-paeth-intra", 1, + "Enable Paeth intra prediction mode (0: false, 1: true (default))"); +static const arg_def_t enable_cfl_intra = + ARG_DEF(NULL, "enable-cfl-intra", 1, + "Enable chroma from luma intra prediction mode " + "(0: false, 1: true (default))"); +static const arg_def_t enable_obmc = ARG_DEF( + NULL, "enable-obmc", 1, "Enable OBMC (0: false, 1: true (default))"); +static const arg_def_t enable_palette = + ARG_DEF(NULL, "enable-palette", 1, + "Enable palette prediction mode (0: false, 1: true (default))"); +static const arg_def_t enable_intrabc = + ARG_DEF(NULL, "enable-intrabc", 1, + "Enable intra block copy prediction mode " + "(0: false, 1: true (default))"); +static const arg_def_t enable_angle_delta = + ARG_DEF(NULL, "enable-angle-delta", 1, + 
"Enable intra angle delta (0: false, 1: true (default))"); static const arg_def_t disable_trellis_quant = ARG_DEF(NULL, "disable-trellis-quant", 1, "Disable trellis optimization of quantized coefficients (0: false (" - "default) 1: true)"); + "default) 1: true 2: partial true)"); static const arg_def_t enable_qm = ARG_DEF(NULL, "enable-qm", 1, "Enable quantisation matrices (0: false (default), 1: true)"); @@ -448,6 +558,25 @@ static const arg_def_t qm_min = ARG_DEF( NULL, "qm-min", 1, "Min quant matrix flatness (0..15), default is 8"); static const arg_def_t qm_max = ARG_DEF( NULL, "qm-max", 1, "Max quant matrix flatness (0..15), default is 15"); +static const arg_def_t reduced_tx_type_set = ARG_DEF( + NULL, "reduced-tx-type-set", 1, "Use reduced set of transform types"); +static const arg_def_t use_intra_dct_only = + ARG_DEF(NULL, "use-intra-dct-only", 1, "Use DCT only for INTRA modes"); +static const arg_def_t use_inter_dct_only = + ARG_DEF(NULL, "use-inter-dct-only", 1, "Use DCT only for INTER modes"); +static const arg_def_t use_intra_default_tx_only = + ARG_DEF(NULL, "use-intra-default-tx-only", 1, + "Use Default-transform only for INTRA modes"); +static const arg_def_t quant_b_adapt = + ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b"); +static const arg_def_t coeff_cost_upd_freq = + ARG_DEF(NULL, "coeff-cost-upd-freq", 1, + "Update freq for coeff costs" + "0: SB, 1: SB Row per Tile, 2: Tile"); +static const arg_def_t mode_cost_upd_freq = + ARG_DEF(NULL, "mode-cost-upd-freq", 1, + "Update freq for mode costs" + "0: SB, 1: SB Row per Tile, 2: Tile"); #if CONFIG_DIST_8X8 static const arg_def_t enable_dist_8x8 = ARG_DEF(NULL, "enable-dist-8x8", 1, @@ -515,6 +644,25 @@ static const arg_def_t min_gf_interval = ARG_DEF( static const arg_def_t max_gf_interval = ARG_DEF( NULL, "max-gf-interval", 1, "max gf/arf frame interval (default 0, indicating in-built behavior)"); +static const arg_def_t gf_max_pyr_height = + ARG_DEF(NULL, "gf-max-pyr-height", 1, + 
"maximum height for GF group pyramid structure (0 to 4 (default))"); +static const arg_def_t max_reference_frames = ARG_DEF( + NULL, "max-reference-frames", 1, + "maximum number of reference frames allowed per frame (3 to 7 (default))"); +static const arg_def_t reduced_reference_set = + ARG_DEF(NULL, "reduced-reference-set", 1, + "Use reduced set of single and compound references (0: off " + "(default), 1: on)"); +static const arg_def_t target_seq_level_idx = + ARG_DEF(NULL, "target-seq-level-idx", 1, + "Target sequence level index. " + "Possible values are in the form of \"ABxy\"(pad leading zeros if " + "less than 4 digits). " + "AB: Operating point(OP) index; " + "xy: Target level index for the OP. " + "E.g. \"0\" means target level index 0 for the 0th OP; " + "\"1021\" means target level index 21 for the 10th OP."); static const struct arg_enum_list color_primaries_enum[] = { { "bt709", AOM_CICP_CP_BT_709 }, @@ -620,6 +768,12 @@ static const struct arg_enum_list superblock_size_enum[] = { static const arg_def_t superblock_size = ARG_DEF_ENUM( NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum); +static const arg_def_t set_tier_mask = + ARG_DEF(NULL, "set-tier-mask", 1, + "Set bit mask to specify which tier each of the 32 possible " + "operating points conforms to. 
" + "Bit value 0(defualt): Main Tier; 1: High Tier."); + static const arg_def_t *av1_args[] = { &cpu_used_av1, &auto_altref, &sharpness, @@ -638,10 +792,46 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1, &lossless, &enable_cdef, &enable_restoration, + &enable_rect_partitions, + &enable_ab_partitions, + &enable_1to4_partitions, + &min_partition_size, + &max_partition_size, + &enable_dual_filter, + &enable_intra_edge_filter, + &enable_order_hint, + &enable_tx64, + &tx_size_search_method, + &enable_flip_idtx, + &enable_dist_wtd_comp, + &enable_masked_comp, + &enable_onesided_comp, + &enable_interintra_comp, + &enable_smooth_interintra, + &enable_diff_wtd_comp, + &enable_interinter_wedge, + &enable_interintra_wedge, + &enable_global_motion, + &enable_warped_motion, + &enable_filter_intra, + &enable_smooth_intra, + &enable_paeth_intra, + &enable_cfl_intra, + &enable_obmc, + &enable_palette, + &enable_intrabc, + &enable_angle_delta, &disable_trellis_quant, &enable_qm, &qm_min, &qm_max, + &reduced_tx_type_set, + &use_intra_dct_only, + &use_inter_dct_only, + &use_intra_default_tx_only, + &quant_b_adapt, + &coeff_cost_upd_freq, + &mode_cost_upd_freq, #if CONFIG_DIST_8X8 &enable_dist_8x8, #endif @@ -659,6 +849,7 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1, &input_chroma_sample_position, &min_gf_interval, &max_gf_interval, + &gf_max_pyr_height, &superblock_size, &num_tg, &mtu_size, @@ -668,8 +859,12 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1, #if CONFIG_DENOISE &denoise_noise_level, &denoise_block_size, -#endif +#endif // CONFIG_DENOISE + &max_reference_frames, + &reduced_reference_set, &enable_ref_frame_mvs, + &target_seq_level_idx, + &set_tier_mask, &bitdeptharg, &inbitdeptharg, &input_chroma_subsampling_x, @@ -696,10 +891,46 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED, AV1E_SET_LOSSLESS, AV1E_SET_ENABLE_CDEF, AV1E_SET_ENABLE_RESTORATION, + AV1E_SET_ENABLE_RECT_PARTITIONS, + AV1E_SET_ENABLE_AB_PARTITIONS, + 
AV1E_SET_ENABLE_1TO4_PARTITIONS, + AV1E_SET_MIN_PARTITION_SIZE, + AV1E_SET_MAX_PARTITION_SIZE, + AV1E_SET_ENABLE_DUAL_FILTER, + AV1E_SET_ENABLE_INTRA_EDGE_FILTER, + AV1E_SET_ENABLE_ORDER_HINT, + AV1E_SET_ENABLE_TX64, + AV1E_SET_TX_SIZE_SEARCH_METHOD, + AV1E_SET_ENABLE_FLIP_IDTX, + AV1E_SET_ENABLE_DIST_WTD_COMP, + AV1E_SET_ENABLE_MASKED_COMP, + AV1E_SET_ENABLE_ONESIDED_COMP, + AV1E_SET_ENABLE_INTERINTRA_COMP, + AV1E_SET_ENABLE_SMOOTH_INTERINTRA, + AV1E_SET_ENABLE_DIFF_WTD_COMP, + AV1E_SET_ENABLE_INTERINTER_WEDGE, + AV1E_SET_ENABLE_INTERINTRA_WEDGE, + AV1E_SET_ENABLE_GLOBAL_MOTION, + AV1E_SET_ENABLE_WARPED_MOTION, + AV1E_SET_ENABLE_FILTER_INTRA, + AV1E_SET_ENABLE_SMOOTH_INTRA, + AV1E_SET_ENABLE_PAETH_INTRA, + AV1E_SET_ENABLE_CFL_INTRA, + AV1E_SET_ENABLE_OBMC, + AV1E_SET_ENABLE_PALETTE, + AV1E_SET_ENABLE_INTRABC, + AV1E_SET_ENABLE_ANGLE_DELTA, AV1E_SET_DISABLE_TRELLIS_QUANT, AV1E_SET_ENABLE_QM, AV1E_SET_QM_MIN, AV1E_SET_QM_MAX, + AV1E_SET_REDUCED_TX_TYPE_SET, + AV1E_SET_INTRA_DCT_ONLY, + AV1E_SET_INTER_DCT_ONLY, + AV1E_SET_INTRA_DEFAULT_TX_ONLY, + AV1E_SET_QUANT_B_ADAPT, + AV1E_SET_COEFF_COST_UPD_FREQ, + AV1E_SET_MODE_COST_UPD_FREQ, #if CONFIG_DIST_8X8 AV1E_SET_ENABLE_DIST_8X8, #endif @@ -717,6 +948,7 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED, AV1E_SET_CHROMA_SAMPLE_POSITION, AV1E_SET_MIN_GF_INTERVAL, AV1E_SET_MAX_GF_INTERVAL, + AV1E_SET_GF_MAX_PYRAMID_HEIGHT, AV1E_SET_SUPERBLOCK_SIZE, AV1E_SET_NUM_TG, AV1E_SET_MTU, @@ -726,12 +958,12 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED, #if CONFIG_DENOISE AV1E_SET_DENOISE_NOISE_LEVEL, AV1E_SET_DENOISE_BLOCK_SIZE, -#endif +#endif // CONFIG_DENOISE + AV1E_SET_MAX_REFERENCE_FRAMES, + AV1E_SET_REDUCED_REFERENCE_SET, AV1E_SET_ENABLE_REF_FRAME_MVS, - AV1E_SET_ENABLE_DF, - AV1E_SET_ENABLE_ORDER_HINT, - AV1E_SET_ENABLE_JNT_COMP, - AV1E_SET_ENABLE_SUPERRES, + AV1E_SET_TARGET_SEQ_LEVEL_IDX, + AV1E_SET_TIER_MASK, 0 }; #endif // CONFIG_AV1_ENCODER @@ -798,9 +1030,6 @@ struct stream_config { struct 
aom_codec_enc_cfg cfg; const char *out_fn; const char *stats_fn; -#if CONFIG_FP_MB_STATS - const char *fpmb_stats_fn; -#endif stereo_format_t stereo_fmt; int arg_ctrls[ARG_CTRL_CNT_MAX][2]; int arg_ctrl_cnt; @@ -828,9 +1057,6 @@ struct stream_state { uint64_t cx_time; size_t nbytes; stats_io_t stats; -#if CONFIG_FP_MB_STATS - stats_io_t fpmb_stats; -#endif struct aom_image *img; aom_codec_ctx_t decoder; int mismatch_seen; @@ -916,7 +1142,9 @@ static void parse_global_config(struct AvxEncoderConfig *global, int argc, } else if (arg_match(&arg, &usage, argi)) global->usage = arg_parse_uint(&arg); else if (arg_match(&arg, &good_dl, argi)) - warn("Deprecated --good option! Ignoring\n"); + global->usage = AOM_USAGE_GOOD_QUALITY; // Good quality usage + else if (arg_match(&arg, &rt_dl, argi)) + global->usage = AOM_USAGE_REALTIME; // Real-time usage else if (arg_match(&arg, &use_yv12, argi)) global->color_type = YV12; else if (arg_match(&arg, &use_i420, argi)) @@ -969,11 +1197,19 @@ static void parse_global_config(struct AvxEncoderConfig *global, int argc, // Make default AV1 passes = 2 until there is a better quality 1-pass // encoder if (global->codec != NULL && global->codec->name != NULL) - global->passes = (strcmp(global->codec->name, "av1") == 0) ? 2 : 1; + global->passes = (strcmp(global->codec->name, "av1") == 0 && + global->usage != AOM_USAGE_REALTIME) + ? 2 + : 1; #else global->passes = 1; #endif } + + if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) { + warn("Enforcing one-pass encoding in realtime mode\n"); + global->passes = 1; + } } static void open_input_file(struct AvxInputContext *input, @@ -1090,6 +1326,17 @@ static void set_config_arg_ctrls(struct stream_config *config, int key, return; } + // For target level, the settings should accumulate rather than overwrite, + // so we simply append it. 
+ if (key == AV1E_SET_TARGET_SEQ_LEVEL_IDX) { + j = config->arg_ctrl_cnt; + assert(j < (int)ARG_CTRL_CNT_MAX); + config->arg_ctrls[j][0] = key; + config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg); + ++config->arg_ctrl_cnt; + return; + } + /* Point either to the next free element or the first instance of this * control. */ @@ -1159,10 +1406,6 @@ static int parse_stream_params(struct AvxEncoderConfig *global, } } else if (arg_match(&arg, &fpf_name, argi)) { config->stats_fn = arg.val; -#if CONFIG_FP_MB_STATS - } else if (arg_match(&arg, &fpmbf_name, argi)) { - config->fpmb_stats_fn = arg.val; -#endif } else if (arg_match(&arg, &use_webm, argi)) { #if CONFIG_WEBM_IO config->write_webm = 1; @@ -1207,8 +1450,15 @@ static int parse_stream_params(struct AvxEncoderConfig *global, config->cfg.g_error_resilient = arg_parse_uint(&arg); } else if (arg_match(&arg, &lag_in_frames, argi)) { config->cfg.g_lag_in_frames = arg_parse_uint(&arg); + if (global->usage == AOM_USAGE_REALTIME && + config->cfg.rc_end_usage == AOM_CBR && + config->cfg.g_lag_in_frames != 0) { + warn("non-zero %s option ignored in realtime CBR mode.\n", arg.name); + config->cfg.g_lag_in_frames = 0; + } } else if (arg_match(&arg, &large_scale_tile, argi)) { config->cfg.large_scale_tile = arg_parse_uint(&arg); + if (config->cfg.large_scale_tile) global->codec = get_aom_lst_encoder(); } else if (arg_match(&arg, &monochrome, argi)) { config->cfg.monochrome = 1; } else if (arg_match(&arg, &full_still_picture_hdr, argi)) { @@ -1349,17 +1599,6 @@ static void validate_stream_config(const struct stream_state *stream, fatal("Stream %d: duplicate stats file (from stream %d)", streami->index, stream->index); } - -#if CONFIG_FP_MB_STATS - /* Check for two streams sharing a mb stats file. 
*/ - if (streami != stream) { - const char *a = stream->config.fpmb_stats_fn; - const char *b = streami->config.fpmb_stats_fn; - if (a && b && !strcmp(a, b)) - fatal("Stream %d: duplicate mb stats file (from stream %d)", - streami->index, stream->index); - } -#endif } } @@ -1524,26 +1763,11 @@ static void setup_pass(struct stream_state *stream, fatal("Failed to open statistics store"); } -#if CONFIG_FP_MB_STATS - if (stream->config.fpmb_stats_fn) { - if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn, - pass)) - fatal("Failed to open mb statistics store"); - } else { - if (!stats_open_mem(&stream->fpmb_stats, pass)) - fatal("Failed to open mb statistics store"); - } -#endif - stream->config.cfg.g_pass = global->passes == 2 ? pass ? AOM_RC_LAST_PASS : AOM_RC_FIRST_PASS : AOM_RC_ONE_PASS; if (pass) { stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats); -#if CONFIG_FP_MB_STATS - stream->config.cfg.rc_firstpass_mb_stats_in = - stats_get(&stream->fpmb_stats); -#endif } stream->cx_time = 0; @@ -1772,13 +1996,6 @@ static void get_cx_data(struct stream_state *stream, pkt->data.twopass_stats.sz); stream->nbytes += pkt->data.raw.sz; break; -#if CONFIG_FP_MB_STATS - case AOM_CODEC_FPMB_STATS_PKT: - stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf, - pkt->data.firstpass_mb_stats.sz); - stream->nbytes += pkt->data.raw.sz; - break; -#endif case AOM_CODEC_PSNR_PKT: if (global->show_psnr) { @@ -1966,6 +2183,10 @@ int main(int argc, const char **argv_) { FOREACH_STREAM(stream, streams) { check_encoder_config(global.disable_warning_prompt, &global, &stream->config.cfg); + + // If large_scale_tile = 1, only support to output to ivf format. 
+ if (stream->config.cfg.large_scale_tile && !stream->config.write_ivf) + die("only support ivf output format while large-scale-tile=1\n"); } /* Handle non-option arguments */ @@ -2371,12 +2592,6 @@ int main(int argc, const char **argv_) { stats_close(&stream->stats, global.passes - 1); } -#if CONFIG_FP_MB_STATS - FOREACH_STREAM(stream, streams) { - stats_close(&stream->fpmb_stats, global.passes - 1); - } -#endif - if (global.pass) break; } diff --git a/libaom/av1/av1.cmake b/libaom/av1/av1.cmake index 8c92615..fb9678a 100644 --- a/libaom/av1/av1.cmake +++ b/libaom/av1/av1.cmake @@ -137,6 +137,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/encodemb.h" "${AOM_ROOT}/av1/encoder/encodemv.c" "${AOM_ROOT}/av1/encoder/encodemv.h" + "${AOM_ROOT}/av1/encoder/encode_strategy.c" + "${AOM_ROOT}/av1/encoder/encode_strategy.h" "${AOM_ROOT}/av1/encoder/encoder.c" "${AOM_ROOT}/av1/encoder/encoder.h" "${AOM_ROOT}/av1/encoder/encodetxb.c" @@ -149,6 +151,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/firstpass.h" "${AOM_ROOT}/av1/encoder/global_motion.c" "${AOM_ROOT}/av1/encoder/global_motion.h" + "${AOM_ROOT}/av1/encoder/gop_structure.c" + "${AOM_ROOT}/av1/encoder/gop_structure.h" "${AOM_ROOT}/av1/encoder/grain_test_vectors.h" "${AOM_ROOT}/av1/encoder/hash.c" "${AOM_ROOT}/av1/encoder/hash.h" @@ -156,6 +160,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/hash_motion.h" "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c" "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h" + "${AOM_ROOT}/av1/encoder/level.c" + "${AOM_ROOT}/av1/encoder/level.h" "${AOM_ROOT}/av1/encoder/lookahead.c" "${AOM_ROOT}/av1/encoder/lookahead.h" "${AOM_ROOT}/av1/encoder/mbgraph.c" @@ -166,6 +172,10 @@ list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/ml.h" "${AOM_ROOT}/av1/encoder/palette.c" "${AOM_ROOT}/av1/encoder/palette.h" + "${AOM_ROOT}/av1/encoder/partition_strategy.h" + "${AOM_ROOT}/av1/encoder/partition_strategy.c" + 
"${AOM_ROOT}/av1/encoder/pass2_strategy.h" + "${AOM_ROOT}/av1/encoder/pass2_strategy.c" "${AOM_ROOT}/av1/encoder/pickcdef.c" "${AOM_ROOT}/av1/encoder/picklpf.c" "${AOM_ROOT}/av1/encoder/picklpf.h" @@ -189,7 +199,11 @@ list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/temporal_filter.h" "${AOM_ROOT}/av1/encoder/tokenize.c" "${AOM_ROOT}/av1/encoder/tokenize.h" + "${AOM_ROOT}/av1/encoder/tpl_model.c" + "${AOM_ROOT}/av1/encoder/tpl_model.h" "${AOM_ROOT}/av1/encoder/wedge_utils.c" + "${AOM_ROOT}/av1/encoder/var_based_part.c" + "${AOM_ROOT}/av1/encoder/var_based_part.h" "${AOM_ROOT}/third_party/fastfeat/fast.c" "${AOM_ROOT}/third_party/fastfeat/fast.h" "${AOM_ROOT}/third_party/fastfeat/fast_9.c" @@ -253,8 +267,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_AVX2 "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c") list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm" - "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm" - "${AOM_ROOT}/av1/encoder/x86/temporal_filter_apply_sse2.asm") + "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm") list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c" @@ -277,14 +290,20 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1 "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c" "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c" "${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_constants.h" + "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse4.c" "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c") list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2 "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c" "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/corner_match_avx2.c" "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c" "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h" "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c" + 
"${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_avx2.c" "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c" "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c" "${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c" @@ -340,15 +359,7 @@ endif() function(setup_av1_targets) add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES}) list(APPEND AOM_LIB_TARGETS aom_av1_common) - - create_dummy_source_file("aom_av1" "c" "dummy_source_file") - add_library(aom_av1 OBJECT "${dummy_source_file}") target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_common>) - list(APPEND AOM_LIB_TARGETS aom_av1) - - # Not all generators support libraries consisting only of object files. Add a - # dummy source file to the aom_av1 target. - add_dummy_source_file_to_target("aom_av1" "c") if(CONFIG_AV1_DECODER) add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES}) @@ -446,13 +457,13 @@ function(setup_av1_targets) if(HAVE_NEON) if(AOM_AV1_COMMON_INTRIN_NEON) - add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon" + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_NEON" "aom") endif() if(AOM_AV1_ENCODER_INTRIN_NEON) - add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon" + add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon" "aom_av1_encoder" "AOM_AV1_ENCODER_INTRIN_NEON" "aom") endif() @@ -470,13 +481,7 @@ function(setup_av1_targets) "AOM_AV1_ENCODER_INTRIN_MSA" "aom") endif() - target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>) - target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>) - # Pass the new lib targets up to the parent scope instance of # $AOM_LIB_TARGETS. 
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE) endfunction() - -function(setup_av1_test_targets) -endfunction() diff --git a/libaom/av1/av1_cx_iface.c b/libaom/av1/av1_cx_iface.c index 43a6028..e8cd508 100644 --- a/libaom/av1/av1_cx_iface.c +++ b/libaom/av1/av1_cx_iface.c @@ -26,10 +26,6 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/firstpass.h" -#if CONFIG_REDUCED_ENCODER_BORDER -#include "common/tools_common.h" -#endif // CONFIG_REDUCED_ENCODER_BORDER - #define MAG_SIZE (4) #define MAX_NUM_ENHANCEMENT_LAYERS 3 @@ -48,6 +44,7 @@ struct av1_extracfg { unsigned int arnr_strength; unsigned int min_gf_interval; unsigned int max_gf_interval; + unsigned int gf_max_pyr_height; aom_tune_metric tuning; unsigned int cq_level; // constrained quality level unsigned int rc_max_intra_bitrate_pct; @@ -56,6 +53,7 @@ struct av1_extracfg { unsigned int lossless; unsigned int enable_cdef; unsigned int enable_restoration; + unsigned int enable_obmc; unsigned int disable_trellis_quant; unsigned int enable_qm; unsigned int qm_y; @@ -71,7 +69,7 @@ struct av1_extracfg { aom_timing_info_type_t timing_info_type; unsigned int frame_parallel_decoding_mode; - int use_dual_filter; + int enable_dual_filter; AQ_MODE aq_mode; DELTAQ_MODE deltaq_mode; unsigned int frame_periodic_boost; @@ -93,13 +91,39 @@ struct av1_extracfg { const char *film_grain_table_filename; unsigned int motion_vector_unit_test; unsigned int cdf_update_mode; - int enable_order_hint; - int enable_jnt_comp; - int enable_ref_frame_mvs; // sequence level - int allow_ref_frame_mvs; // frame level - int enable_warped_motion; // sequence level - int allow_warped_motion; // frame level + int enable_rect_partitions; // enable rectangular partitions for sequence + int enable_ab_partitions; // enable AB partitions for sequence + int enable_1to4_partitions; // enable 1:4 and 4:1 partitions for sequence + int min_partition_size; // min partition size [4,8,16,32,64,128] + int max_partition_size; // max partition size 
[4,8,16,32,64,128] + int enable_intra_edge_filter; // enable intra-edge filter for sequence + int enable_order_hint; // enable order hint for sequence + int enable_tx64; // enable 64-pt transform usage for sequence + int tx_size_search_method; // set transform block size search method + int enable_flip_idtx; // enable flip and identity transform types + int enable_dist_wtd_comp; // enable dist wtd compound for sequence + int max_reference_frames; // maximum number of references per frame + int enable_reduced_reference_set; // enable reduced set of references + int enable_ref_frame_mvs; // sequence level + int allow_ref_frame_mvs; // frame level + int enable_masked_comp; // enable masked compound for sequence + int enable_onesided_comp; // enable one sided compound for sequence + int enable_interintra_comp; // enable interintra compound for sequence + int enable_smooth_interintra; // enable smooth interintra mode usage + int enable_diff_wtd_comp; // enable diff-wtd compound usage + int enable_interinter_wedge; // enable interinter-wedge compound usage + int enable_interintra_wedge; // enable interintra-wedge compound usage + int enable_global_motion; // enable global motion usage for sequence + int enable_warped_motion; // sequence level + int allow_warped_motion; // frame level + int enable_filter_intra; // enable filter intra for sequence + int enable_smooth_intra; // enable smooth intra modes for sequence + int enable_paeth_intra; // enable Paeth intra mode for sequence + int enable_cfl_intra; // enable CFL uv intra mode for sequence int enable_superres; + int enable_palette; + int enable_intrabc; + int enable_angle_delta; #if CONFIG_DENOISE float noise_level; int noise_block_size; @@ -107,6 +131,17 @@ struct av1_extracfg { unsigned int chroma_subsampling_x; unsigned int chroma_subsampling_y; + int reduced_tx_type_set; + int use_intra_dct_only; + int use_inter_dct_only; + int use_intra_default_tx_only; + int quant_b_adapt; + AV1_LEVEL 
target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; + // Bit mask to specify which tier each of the 32 possible operating points + // conforms to. + unsigned int tier_mask; + COST_UPDATE_TYPE coeff_cost_upd_freq; + COST_UPDATE_TYPE mode_cost_upd_freq; }; static struct av1_extracfg default_extra_cfg = { @@ -116,7 +151,7 @@ static struct av1_extracfg default_extra_cfg = { 0, // noise_sensitivity CONFIG_SHARP_SETTINGS, // sharpness 0, // static_thresh - 0, // row_mt + 1, // row_mt 0, // tile_columns 0, // tile_rows 0, // enable_tpl_model @@ -124,6 +159,7 @@ static struct av1_extracfg default_extra_cfg = { 5, // arnr_strength 0, // min_gf_interval; 0 -> default decision 0, // max_gf_interval; 0 -> default decision + 4, // gf_max_pyr_height AOM_TUNE_PSNR, // tuning 10, // cq_level 0, // rc_max_intra_bitrate_pct @@ -132,6 +168,7 @@ static struct av1_extracfg default_extra_cfg = { 0, // lossless !CONFIG_SHARP_SETTINGS, // enable_cdef 1, // enable_restoration + 1, // enable_obmc 0, // disable_trellis_quant 0, // enable_qm DEFAULT_QM_Y, // qm_y @@ -145,7 +182,7 @@ static struct av1_extracfg default_extra_cfg = { 1, // max number of tile groups 0, // mtu_size AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream - 1, // frame_parallel_decoding_mode + 0, // frame_parallel_decoding_mode 1, // enable dual filter NO_AQ, // aq_mode NO_DELTA_Q, // deltaq_mode @@ -167,19 +204,57 @@ static struct av1_extracfg default_extra_cfg = { 0, // film_grain_table_filename 0, // motion_vector_unit_test 1, // CDF update mode + 1, // enable rectangular partitions + 1, // enable ab shape partitions + 1, // enable 1:4 and 4:1 partitions + 4, // min_partition_size + 128, // max_partition_size + 1, // enable intra edge filter 1, // frame order hint - 1, // jnt_comp + 1, // enable 64-pt transform usage + 0, // transform block size search method + 1, // enable flip and identity transform + 1, // dist-wtd compound + 7, // max_reference_frames + 0, // enable_reduced_reference_set 1, // 
enable_ref_frame_mvs sequence level 1, // allow ref_frame_mvs frame level + 1, // enable masked compound at sequence level + 1, // enable one sided compound at sequence level + 1, // enable interintra compound at sequence level + 1, // enable smooth interintra mode + 1, // enable difference-weighted compound + 1, // enable interinter wedge compound + 1, // enable interintra wedge compound + 1, // enable_global_motion usage 1, // enable_warped_motion at sequence level 1, // allow_warped_motion at frame level + 1, // enable filter intra at sequence level + 1, // enable smooth intra modes usage for sequence + 1, // enable Paeth intra mode usage for sequence + 1, // enable CFL uv intra mode usage for sequence 1, // superres + 1, // enable palette + !CONFIG_SHARP_SETTINGS, // enable intrabc + 1, // enable angle delta #if CONFIG_DENOISE 0, // noise_level 32, // noise_block_size #endif 0, // chroma_subsampling_x 0, // chroma_subsampling_y + 0, // reduced_tx_type_set + 0, // use_intra_dct_only + 0, // use_inter_dct_only + 0, // use_intra_default_tx_only + 0, // quant_b_adapt + { + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, + }, // target_seq_level_idx + 0, // tier_mask + COST_UPD_SB, // coeff_cost_upd_freq + COST_UPD_SB, // mode_cost_upd_freq }; struct aom_codec_alg_priv { @@ -251,6 +326,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1); RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTAQ_MODE_COUNT - 1); RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1); + RANGE_CHECK_HI(cfg, g_usage, 1); RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS); RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS); RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q); @@ -266,6 +342,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, max_gf_interval, MAX(2, extra_cfg->min_gf_interval), (MAX_LAG_BUFFERS - 
1)); } + RANGE_CHECK_HI(extra_cfg, gf_max_pyr_height, 4); RANGE_CHECK_HI(cfg, rc_resize_mode, RESIZE_MODES - 1); RANGE_CHECK(cfg, rc_resize_denominator, SCALE_NUMERATOR, @@ -382,9 +459,26 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx, #endif } + RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7); + RANGE_CHECK(extra_cfg, enable_reduced_reference_set, 0, 1); RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1); RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1); + RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3); + RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 2); + RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 2); + + RANGE_CHECK(extra_cfg, min_partition_size, 4, 128); + RANGE_CHECK(extra_cfg, max_partition_size, 4, 128); + RANGE_CHECK_HI(extra_cfg, min_partition_size, extra_cfg->max_partition_size); + + RANGE_CHECK(extra_cfg, tx_size_search_method, 0, 2); + + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + if (!is_valid_seq_level_idx(extra_cfg->target_seq_level_idx[i])) + ERROR("Target sequence level index is invalid"); + } + return AOM_CODEC_OK; } @@ -452,6 +546,7 @@ static aom_codec_err_t set_encoder_config( oxcf->profile = cfg->g_profile; oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled; oxcf->max_threads = (int)cfg->g_threads; + oxcf->mode = (cfg->g_usage == 1) ? 
REALTIME : GOOD; oxcf->width = cfg->g_w; oxcf->height = cfg->g_h; oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width; @@ -494,7 +589,6 @@ static aom_codec_err_t set_encoder_config( oxcf->init_framerate = 30; oxcf->timing_info_present = 0; } - oxcf->mode = GOOD; oxcf->cfg = &cfg->cfg; switch (cfg->g_pass) { @@ -522,6 +616,10 @@ static aom_codec_err_t set_encoder_config( oxcf->enable_cdef = extra_cfg->enable_cdef; oxcf->enable_restoration = extra_cfg->enable_restoration; + oxcf->enable_obmc = extra_cfg->enable_obmc; + oxcf->enable_palette = extra_cfg->enable_palette; + oxcf->enable_intrabc = extra_cfg->enable_intrabc; + oxcf->enable_angle_delta = extra_cfg->enable_angle_delta; oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant; oxcf->using_qm = extra_cfg->enable_qm; oxcf->qm_y = extra_cfg->qm_y; @@ -529,6 +627,13 @@ static aom_codec_err_t set_encoder_config( oxcf->qm_v = extra_cfg->qm_v; oxcf->qm_minlevel = extra_cfg->qm_min; oxcf->qm_maxlevel = extra_cfg->qm_max; + oxcf->reduced_tx_type_set = extra_cfg->reduced_tx_type_set; + oxcf->use_intra_dct_only = extra_cfg->use_intra_dct_only; + oxcf->use_inter_dct_only = extra_cfg->use_inter_dct_only; + oxcf->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only; + oxcf->quant_b_adapt = extra_cfg->quant_b_adapt; + oxcf->coeff_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq; + oxcf->mode_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq; #if CONFIG_DIST_8X8 oxcf->using_dist_8x8 = extra_cfg->enable_dist_8x8; if (extra_cfg->tuning == AOM_TUNE_CDEF_DIST || @@ -539,7 +644,6 @@ static aom_codec_err_t set_encoder_config( // In large-scale tile encoding mode, num_tile_groups is always 1. 
if (cfg->large_scale_tile) oxcf->num_tile_groups = 1; oxcf->mtu = extra_cfg->mtu_size; - oxcf->enable_tpl_model = extra_cfg->enable_tpl_model; // FIXME(debargha): Should this be: // oxcf->allow_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs & @@ -579,6 +683,9 @@ static aom_codec_err_t set_encoder_config( } } + oxcf->enable_tpl_model = + extra_cfg->enable_tpl_model && (oxcf->superres_mode == SUPERRES_NONE); + oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz; oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz; oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz; @@ -604,10 +711,6 @@ static aom_codec_err_t set_encoder_config( oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in; -#if CONFIG_FP_MB_STATS - oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in; -#endif - oxcf->color_primaries = extra_cfg->color_primaries; oxcf->transfer_characteristics = extra_cfg->transfer_characteristics; oxcf->matrix_coefficients = extra_cfg->matrix_coefficients; @@ -623,6 +726,7 @@ static aom_codec_err_t set_encoder_config( oxcf->arnr_strength = extra_cfg->arnr_strength; oxcf->min_gf_interval = extra_cfg->min_gf_interval; oxcf->max_gf_interval = extra_cfg->max_gf_interval; + oxcf->gf_max_pyr_height = extra_cfg->gf_max_pyr_height; oxcf->tuning = extra_cfg->tuning; oxcf->content = extra_cfg->content; @@ -659,16 +763,43 @@ static aom_codec_err_t set_encoder_config( oxcf->monochrome = cfg->monochrome; oxcf->full_still_picture_hdr = cfg->full_still_picture_hdr; - oxcf->enable_dual_filter = extra_cfg->use_dual_filter; + oxcf->enable_dual_filter = extra_cfg->enable_dual_filter; + oxcf->enable_rect_partitions = extra_cfg->enable_rect_partitions; + oxcf->enable_ab_partitions = extra_cfg->enable_ab_partitions; + oxcf->enable_1to4_partitions = extra_cfg->enable_1to4_partitions; + oxcf->min_partition_size = extra_cfg->min_partition_size; + oxcf->max_partition_size = extra_cfg->max_partition_size; + oxcf->enable_intra_edge_filter = 
extra_cfg->enable_intra_edge_filter; + oxcf->enable_tx64 = extra_cfg->enable_tx64; + oxcf->tx_size_search_method = extra_cfg->tx_size_search_method; + oxcf->enable_flip_idtx = extra_cfg->enable_flip_idtx; oxcf->enable_order_hint = extra_cfg->enable_order_hint; - oxcf->enable_jnt_comp = - extra_cfg->enable_jnt_comp & extra_cfg->enable_order_hint; + oxcf->enable_dist_wtd_comp = + extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint; + oxcf->max_reference_frames = extra_cfg->max_reference_frames; + oxcf->enable_reduced_reference_set = extra_cfg->enable_reduced_reference_set; + oxcf->enable_masked_comp = extra_cfg->enable_masked_comp; + oxcf->enable_onesided_comp = extra_cfg->enable_onesided_comp; + oxcf->enable_diff_wtd_comp = + extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp; + oxcf->enable_interinter_wedge = + extra_cfg->enable_masked_comp & extra_cfg->enable_interinter_wedge; + oxcf->enable_interintra_comp = extra_cfg->enable_interintra_comp; + oxcf->enable_smooth_interintra = + extra_cfg->enable_interintra_comp && extra_cfg->enable_smooth_interintra; + oxcf->enable_interintra_wedge = + extra_cfg->enable_interintra_comp & extra_cfg->enable_interintra_wedge; oxcf->enable_ref_frame_mvs = extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint; + oxcf->enable_global_motion = extra_cfg->enable_global_motion; oxcf->enable_warped_motion = extra_cfg->enable_warped_motion; oxcf->allow_warped_motion = extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion; + oxcf->enable_filter_intra = extra_cfg->enable_filter_intra; + oxcf->enable_smooth_intra = extra_cfg->enable_smooth_intra; + oxcf->enable_paeth_intra = extra_cfg->enable_paeth_intra; + oxcf->enable_cfl_intra = extra_cfg->enable_cfl_intra; oxcf->enable_superres = (oxcf->superres_mode != SUPERRES_NONE) && extra_cfg->enable_superres; @@ -710,23 +841,14 @@ static aom_codec_err_t set_encoder_config( oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost; 
oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; -#if CONFIG_REDUCED_ENCODER_BORDER - if (oxcf->superres_mode != SUPERRES_NONE || - oxcf->resize_mode != RESIZE_NONE) { - warn( - "Superres / resize cannot be used with CONFIG_REDUCED_ENCODER_BORDER. " - "Disabling superres/resize.\n"); - // return AOM_CODEC_INVALID_PARAM; - disable_superres(oxcf); - oxcf->resize_mode = RESIZE_NONE; - oxcf->resize_scale_denominator = SCALE_NUMERATOR; - oxcf->resize_kf_scale_denominator = SCALE_NUMERATOR; - } -#endif // CONFIG_REDUCED_ENCODER_BORDER - oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x; oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y; - + oxcf->border_in_pixels = (oxcf->resize_mode || oxcf->superres_mode) + ? AOM_BORDER_IN_PIXELS + : AOM_ENC_NO_SCALE_BORDER; + memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx, + sizeof(oxcf->target_seq_level_idx)); + oxcf->tier_mask = extra_cfg->tier_mask; return AOM_CODEC_OK; } @@ -939,6 +1061,13 @@ static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_obmc = CAST(AV1E_SET_ENABLE_OBMC, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_disable_trellis_quant(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1007,10 +1136,55 @@ static aom_codec_err_t ctrl_set_timing_info_type(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } -static aom_codec_err_t ctrl_set_enable_df(aom_codec_alg_priv_t *ctx, - va_list args) { +static aom_codec_err_t ctrl_set_enable_dual_filter(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_dual_filter = CAST(AV1E_SET_ENABLE_DUAL_FILTER, args); + return 
update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_rect_partitions( + aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.use_dual_filter = CAST(AV1E_SET_ENABLE_DF, args); + extra_cfg.enable_rect_partitions = + CAST(AV1E_SET_ENABLE_RECT_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_ab_partitions(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_ab_partitions = CAST(AV1E_SET_ENABLE_AB_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_1to4_partitions( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_1to4_partitions = + CAST(AV1E_SET_ENABLE_1TO4_PARTITIONS, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_min_partition_size(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.min_partition_size = CAST(AV1E_SET_MIN_PARTITION_SIZE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_max_partition_size(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.max_partition_size = CAST(AV1E_SET_MAX_PARTITION_SIZE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_intra_edge_filter( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_intra_edge_filter = + CAST(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, args); return update_extra_cfg(ctx, &extra_cfg); } @@ -1021,10 +1195,46 @@ static aom_codec_err_t ctrl_set_enable_order_hint(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } -static aom_codec_err_t ctrl_set_enable_jnt_comp(aom_codec_alg_priv_t *ctx, - 
va_list args) { +static aom_codec_err_t ctrl_set_enable_tx64(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tx64 = CAST(AV1E_SET_ENABLE_TX64, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tx_size_search_method(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tx_size_search_method = CAST(AV1E_SET_TX_SIZE_SEARCH_METHOD, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_flip_idtx(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_flip_idtx = CAST(AV1E_SET_ENABLE_FLIP_IDTX, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_dist_wtd_comp(aom_codec_alg_priv_t *ctx, + va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; - extra_cfg.enable_jnt_comp = CAST(AV1E_SET_ENABLE_JNT_COMP, args); + extra_cfg.enable_dist_wtd_comp = CAST(AV1E_SET_ENABLE_DIST_WTD_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_max_reference_frames(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.max_reference_frames = CAST(AV1E_SET_MAX_REFERENCE_FRAMES, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_reduced_reference_set( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_reduced_reference_set = + CAST(AV1E_SET_REDUCED_REFERENCE_SET, args); return update_extra_cfg(ctx, &extra_cfg); } @@ -1042,6 +1252,66 @@ static aom_codec_err_t ctrl_set_allow_ref_frame_mvs(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_enable_masked_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = 
ctx->extra_cfg; + extra_cfg.enable_masked_comp = CAST(AV1E_SET_ENABLE_MASKED_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_onesided_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_onesided_comp = CAST(AV1E_SET_ENABLE_ONESIDED_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interintra_comp( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interintra_comp = + CAST(AV1E_SET_ENABLE_INTERINTRA_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_smooth_interintra( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_smooth_interintra = + CAST(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_diff_wtd_comp(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_diff_wtd_comp = CAST(AV1E_SET_ENABLE_DIFF_WTD_COMP, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interinter_wedge( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interinter_wedge = + CAST(AV1E_SET_ENABLE_INTERINTER_WEDGE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_interintra_wedge( + aom_codec_alg_priv_t *ctx, va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_interintra_wedge = + CAST(AV1E_SET_ENABLE_INTERINTRA_WEDGE, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_global_motion(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + 
extra_cfg.enable_global_motion = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1056,6 +1326,34 @@ static aom_codec_err_t ctrl_set_allow_warped_motion(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_enable_filter_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_filter_intra = CAST(AV1E_SET_ENABLE_FILTER_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_smooth_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_smooth_intra = CAST(AV1E_SET_ENABLE_SMOOTH_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_paeth_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_paeth_intra = CAST(AV1E_SET_ENABLE_PAETH_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_cfl_intra(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_cfl_intra = CAST(AV1E_SET_ENABLE_CFL_INTRA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1063,6 +1361,27 @@ static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_enable_palette(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_palette = CAST(AV1E_SET_ENABLE_PALETTE, args); + return 
update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_intrabc(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_intrabc = CAST(AV1E_SET_ENABLE_INTRABC, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_enable_angle_delta(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_angle_delta = CAST(AV1E_SET_ENABLE_ANGLE_DELTA, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_error_resilient_mode(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1099,6 +1418,56 @@ static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_reduced_tx_type_set(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.reduced_tx_type_set = CAST(AV1E_SET_REDUCED_TX_TYPE_SET, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_intra_dct_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_intra_dct_only = CAST(AV1E_SET_INTRA_DCT_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_inter_dct_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_inter_dct_only = CAST(AV1E_SET_INTER_DCT_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_intra_default_tx_only(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.use_intra_default_tx_only = + CAST(AV1E_SET_INTRA_DEFAULT_TX_ONLY, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_quant_b_adapt(aom_codec_alg_priv_t 
*ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.quant_b_adapt = CAST(AV1E_SET_QUANT_B_ADAPT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_coeff_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.coeff_cost_upd_freq = CAST(AV1E_SET_COEFF_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_mode_cost_upd_freq(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.mode_cost_upd_freq = CAST(AV1E_SET_MODE_COST_UPD_FREQ, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_film_grain_test_vector( aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1152,6 +1521,13 @@ static aom_codec_err_t ctrl_set_max_gf_interval(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_gf_max_pyr_height(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.gf_max_pyr_height = CAST(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t ctrl_set_frame_periodic_boost(aom_codec_alg_priv_t *ctx, va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; @@ -1167,6 +1543,26 @@ static aom_codec_err_t ctrl_enable_motion_vector_unit_test( return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_set_target_seq_level_idx(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + const int val = CAST(AV1E_SET_TARGET_SEQ_LEVEL_IDX, args); + const int level = val % 100; + const int operating_point_idx = val / 100; + if (operating_point_idx >= 0 && + operating_point_idx < MAX_NUM_OPERATING_POINTS) { + extra_cfg.target_seq_level_idx[operating_point_idx] = 
(AV1_LEVEL)level; + } + return update_extra_cfg(ctx, &extra_cfg); +} + +static aom_codec_err_t ctrl_set_tier_mask(aom_codec_alg_priv_t *ctx, + va_list args) { + struct av1_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.tier_mask = CAST(AV1E_SET_TIER_MASK, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx, aom_codec_priv_enc_mr_cfg_t *data) { aom_codec_err_t res = AOM_CODEC_OK; @@ -1269,8 +1665,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, } } } - - if (ctx->oxcf.mode != GOOD) { + if (ctx->oxcf.mode != GOOD && ctx->oxcf.mode != REALTIME) { ctx->oxcf.mode = GOOD; av1_change_config(ctx->cpi, &ctx->oxcf); } @@ -1328,6 +1723,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, unsigned char *cx_data = ctx->cx_data; size_t cx_data_sz = ctx->cx_data_sz; + assert(!(cx_data == NULL && cx_data_sz != 0)); + /* Any pending invisible frames? */ if (ctx->pending_cx_data) { memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz); @@ -1355,12 +1752,6 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, -1 != av1_get_compressed_data(cpi, &lib_flags, &frame_size, cx_data, &dst_time_stamp, &dst_end_time_stamp, !img, timebase)) { - if (cpi->common.seq_params.frame_id_numbers_present_flag) { - if (cpi->common.invalid_delta_frame_id_minus_1) { - aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, - "Invalid delta_frame_id_minus_1"); - } - } cpi->seq_params_locked = 1; if (frame_size) { if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; @@ -1380,8 +1771,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx, frame_size); } const uint32_t obu_header_offset = 0; - obu_header_size = write_obu_header( - OBU_TEMPORAL_DELIMITER, 0, + obu_header_size = av1_write_obu_header( + cpi, OBU_TEMPORAL_DELIMITER, 0, (uint8_t *)(ctx->pending_cx_data + obu_header_offset)); // OBUs are preceded/succeeded by an unsigned leb128 coded integer. 
@@ -1742,6 +2133,13 @@ static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx, + va_list args) { + int *const arg = va_arg(args, int *); + if (arg == NULL) return AOM_CODEC_INVALID_PARAM; + return av1_get_seq_level_idx(ctx->cpi, arg); +} + static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1_COPY_REFERENCE, ctrl_copy_reference }, { AOME_USE_REFERENCE, ctrl_use_reference }, @@ -1773,6 +2171,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_LOSSLESS, ctrl_set_lossless }, { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef }, { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration }, + { AV1E_SET_ENABLE_OBMC, ctrl_set_enable_obmc }, { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant }, { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm }, { AV1E_SET_QM_Y, ctrl_set_qm_y }, @@ -1789,15 +2188,48 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode }, { AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode }, { AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode }, - { AV1E_SET_ENABLE_DF, ctrl_set_enable_df }, + { AV1E_SET_ENABLE_RECT_PARTITIONS, ctrl_set_enable_rect_partitions }, + { AV1E_SET_ENABLE_AB_PARTITIONS, ctrl_set_enable_ab_partitions }, + { AV1E_SET_ENABLE_1TO4_PARTITIONS, ctrl_set_enable_1to4_partitions }, + { AV1E_SET_MIN_PARTITION_SIZE, ctrl_set_min_partition_size }, + { AV1E_SET_MAX_PARTITION_SIZE, ctrl_set_max_partition_size }, + { AV1E_SET_ENABLE_DUAL_FILTER, ctrl_set_enable_dual_filter }, + { AV1E_SET_ENABLE_INTRA_EDGE_FILTER, ctrl_set_enable_intra_edge_filter }, { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint }, - { AV1E_SET_ENABLE_JNT_COMP, ctrl_set_enable_jnt_comp }, + { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 }, + { AV1E_SET_TX_SIZE_SEARCH_METHOD, ctrl_set_tx_size_search_method }, + { 
AV1E_SET_ENABLE_FLIP_IDTX, ctrl_set_enable_flip_idtx }, + { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp }, + { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames }, + { AV1E_SET_REDUCED_REFERENCE_SET, ctrl_set_enable_reduced_reference_set }, { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs }, { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs }, + { AV1E_SET_ENABLE_MASKED_COMP, ctrl_set_enable_masked_comp }, + { AV1E_SET_ENABLE_ONESIDED_COMP, ctrl_set_enable_onesided_comp }, + { AV1E_SET_ENABLE_INTERINTRA_COMP, ctrl_set_enable_interintra_comp }, + { AV1E_SET_ENABLE_SMOOTH_INTERINTRA, ctrl_set_enable_smooth_interintra }, + { AV1E_SET_ENABLE_DIFF_WTD_COMP, ctrl_set_enable_diff_wtd_comp }, + { AV1E_SET_ENABLE_INTERINTER_WEDGE, ctrl_set_enable_interinter_wedge }, + { AV1E_SET_ENABLE_INTERINTRA_WEDGE, ctrl_set_enable_interintra_wedge }, + { AV1E_SET_ENABLE_GLOBAL_MOTION, ctrl_set_enable_global_motion }, { AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion }, { AV1E_SET_ALLOW_WARPED_MOTION, ctrl_set_allow_warped_motion }, + { AV1E_SET_ENABLE_FILTER_INTRA, ctrl_set_enable_filter_intra }, + { AV1E_SET_ENABLE_SMOOTH_INTRA, ctrl_set_enable_smooth_intra }, + { AV1E_SET_ENABLE_PAETH_INTRA, ctrl_set_enable_paeth_intra }, + { AV1E_SET_ENABLE_CFL_INTRA, ctrl_set_enable_cfl_intra }, { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres }, + { AV1E_SET_ENABLE_PALETTE, ctrl_set_enable_palette }, + { AV1E_SET_ENABLE_INTRABC, ctrl_set_enable_intrabc }, + { AV1E_SET_ENABLE_ANGLE_DELTA, ctrl_set_enable_angle_delta }, { AV1E_SET_AQ_MODE, ctrl_set_aq_mode }, + { AV1E_SET_REDUCED_TX_TYPE_SET, ctrl_set_reduced_tx_type_set }, + { AV1E_SET_INTRA_DCT_ONLY, ctrl_set_intra_dct_only }, + { AV1E_SET_INTER_DCT_ONLY, ctrl_set_inter_dct_only }, + { AV1E_SET_INTRA_DEFAULT_TX_ONLY, ctrl_set_intra_default_tx_only }, + { AV1E_SET_QUANT_B_ADAPT, ctrl_set_quant_b_adapt }, + { AV1E_SET_COEFF_COST_UPD_FREQ, ctrl_set_coeff_cost_upd_freq }, + { 
AV1E_SET_MODE_COST_UPD_FREQ, ctrl_set_mode_cost_upd_freq }, { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode }, { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost }, { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content }, @@ -1810,6 +2242,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity }, { AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval }, { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval }, + { AV1E_SET_GF_MAX_PYRAMID_HEIGHT, ctrl_set_gf_max_pyr_height }, { AV1E_SET_RENDER_SIZE, ctrl_set_render_size }, { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size }, { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding }, @@ -1820,6 +2253,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size }, #endif // CONFIG_FILM_GRAIN { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, + { AV1E_SET_TARGET_SEQ_LEVEL_IDX, ctrl_set_target_seq_level_idx }, + { AV1E_SET_TIER_MASK, ctrl_set_tier_mask }, // Getters { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer }, @@ -1830,6 +2265,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image }, { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x }, { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y }, + { AV1E_GET_SEQ_LEVEL_IDX, ctrl_get_seq_level_idx }, { -1, NULL }, }; @@ -1837,7 +2273,7 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0, { // NOLINT - 0, // g_usage + 0, // g_usage - non-realtime usage 0, // g_threads 0, // g_profile @@ -1862,11 +2298,11 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { SCALE_NUMERATOR, // rc_resize_denominator SCALE_NUMERATOR, // rc_resize_kf_denominator - 0, // rc_superres_mode + SUPERRES_NONE, // rc_superres_mode SCALE_NUMERATOR, // rc_superres_denominator SCALE_NUMERATOR, // rc_superres_kf_denominator 63, // rc_superres_qthresh - 
63, // rc_superres_kf_qthresh + 32, // rc_superres_kf_qthresh AOM_VBR, // rc_end_usage { NULL, 0 }, // rc_twopass_stats_in @@ -1902,6 +2338,74 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0 }, // tile_heights { 1 }, // config file } }, + { 1, + { + // NOLINT + 1, // g_usage - real-time usage + 0, // g_threads + 0, // g_profile + + 320, // g_width + 240, // g_height + 0, // g_limit + 0, // g_forced_max_frame_width + 0, // g_forced_max_frame_height + AOM_BITS_8, // g_bit_depth + 8, // g_input_bit_depth + + { 1, 30 }, // g_timebase + + 0, // g_error_resilient + + AOM_RC_ONE_PASS, // g_pass + + 1, // g_lag_in_frames + + 0, // rc_dropframe_thresh + RESIZE_NONE, // rc_resize_mode + SCALE_NUMERATOR, // rc_resize_denominator + SCALE_NUMERATOR, // rc_resize_kf_denominator + + 0, // rc_superres_mode + SCALE_NUMERATOR, // rc_superres_denominator + SCALE_NUMERATOR, // rc_superres_kf_denominator + 63, // rc_superres_qthresh + 32, // rc_superres_kf_qthresh + + AOM_CBR, // rc_end_usage + { NULL, 0 }, // rc_twopass_stats_in + { NULL, 0 }, // rc_firstpass_mb_stats_in + 256, // rc_target_bandwidth + 0, // rc_min_quantizer + 63, // rc_max_quantizer + 25, // rc_undershoot_pct + 25, // rc_overshoot_pct + + 6000, // rc_max_buffer_size + 4000, // rc_buffer_initial_size + 5000, // rc_buffer_optimal_size + + 50, // rc_two_pass_vbrbias + 0, // rc_two_pass_vbrmin_section + 2000, // rc_two_pass_vbrmax_section + + // keyframing settings (kf) + 0, // fwd_kf_enabled + AOM_KF_AUTO, // g_kfmode + 0, // kf_min_dist + 9999, // kf_max_dist + 0, // sframe_dist + 1, // sframe_mode + 0, // large_scale_tile + 0, // monochrome + 0, // full_still_picture_hdr + 0, // save_as_annexb + 0, // tile_width_count + 0, // tile_height_count + { 0 }, // tile_widths + { 0 }, // tile_heights + { 1 }, // config file + } }, }; #ifndef VERSION_STRING @@ -1925,7 +2429,7 @@ CODEC_INTERFACE(aom_codec_av1_cx) = { }, { // NOLINT - 1, // 1 cfg map + 2, // 2 cfg map encoder_usage_cfg_map, // 
aom_codec_enc_cfg_map_t encoder_encode, // aom_codec_encode_fn_t encoder_get_cxdata, // aom_codec_get_cx_data_fn_t diff --git a/libaom/av1/av1_dx_iface.c b/libaom/av1/av1_dx_iface.c index 08da650..ca872d7 100644 --- a/libaom/av1/av1_dx_iface.c +++ b/libaom/av1/av1_dx_iface.c @@ -44,7 +44,7 @@ struct aom_codec_alg_priv { int img_avail; int flushed; int invert_tile_order; - int last_show_frame; // Index of last output frame. + RefCntBuffer *last_show_frame; // Last output frame buffer int byte_alignment; int skip_loop_filter; int skip_film_grain; @@ -154,6 +154,49 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) { return AOM_CODEC_OK; } +static aom_codec_err_t parse_timing_info(struct aom_read_bit_buffer *rb) { + const uint32_t num_units_in_display_tick = + aom_rb_read_unsigned_literal(rb, 32); + const uint32_t time_scale = aom_rb_read_unsigned_literal(rb, 32); + if (num_units_in_display_tick == 0 || time_scale == 0) + return AOM_CODEC_UNSUP_BITSTREAM; + const uint8_t equal_picture_interval = aom_rb_read_bit(rb); + if (equal_picture_interval) { + const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb); + if (num_ticks_per_picture_minus_1 == UINT32_MAX) { + // num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1. 
+ return AOM_CODEC_UNSUP_BITSTREAM; + } + } + return AOM_CODEC_OK; +} + +static aom_codec_err_t parse_decoder_model_info( + struct aom_read_bit_buffer *rb, int *buffer_delay_length_minus_1) { + *buffer_delay_length_minus_1 = aom_rb_read_literal(rb, 5); + const uint32_t num_units_in_decoding_tick = + aom_rb_read_unsigned_literal(rb, 32); + const uint8_t buffer_removal_time_length_minus_1 = aom_rb_read_literal(rb, 5); + const uint8_t frame_presentation_time_length_minus_1 = + aom_rb_read_literal(rb, 5); + (void)num_units_in_decoding_tick; + (void)buffer_removal_time_length_minus_1; + (void)frame_presentation_time_length_minus_1; + return AOM_CODEC_OK; +} + +static aom_codec_err_t parse_op_parameters_info( + struct aom_read_bit_buffer *rb, int buffer_delay_length_minus_1) { + const int n = buffer_delay_length_minus_1 + 1; + const uint32_t decoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n); + const uint32_t encoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n); + const uint8_t low_delay_mode_flag = aom_rb_read_bit(rb); + (void)decoder_buffer_delay; + (void)encoder_buffer_delay; + (void)low_delay_mode_flag; + return AOM_CODEC_OK; +} + // Parses the operating points (including operating_point_idc, seq_level_idx, // and seq_tier) and then sets si->number_spatial_layers and // si->number_temporal_layers based on operating_point_idc[0]. 
@@ -161,10 +204,23 @@ static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb, int is_reduced_header, aom_codec_stream_info_t *si) { int operating_point_idc0 = 0; - if (is_reduced_header) { aom_rb_read_literal(rb, LEVEL_BITS); // level } else { + uint8_t decoder_model_info_present_flag = 0; + int buffer_delay_length_minus_1 = 0; + aom_codec_err_t status; + const uint8_t timing_info_present_flag = aom_rb_read_bit(rb); + if (timing_info_present_flag) { + if ((status = parse_timing_info(rb)) != AOM_CODEC_OK) return status; + decoder_model_info_present_flag = aom_rb_read_bit(rb); + if (decoder_model_info_present_flag) { + if ((status = parse_decoder_model_info( + rb, &buffer_delay_length_minus_1)) != AOM_CODEC_OK) + return status; + } + } + const uint8_t initial_display_delay_present_flag = aom_rb_read_bit(rb); const uint8_t operating_points_cnt_minus_1 = aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS); for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) { @@ -173,6 +229,20 @@ static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb, if (i == 0) operating_point_idc0 = operating_point_idc; int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); // level if (seq_level_idx > 7) aom_rb_read_bit(rb); // tier + if (decoder_model_info_present_flag) { + const uint8_t decoder_model_present_for_this_op = aom_rb_read_bit(rb); + if (decoder_model_present_for_this_op) { + if ((status = parse_op_parameters_info( + rb, buffer_delay_length_minus_1)) != AOM_CODEC_OK) + return status; + } + } + if (initial_display_delay_present_flag) { + const uint8_t initial_display_delay_present_for_this_op = + aom_rb_read_bit(rb); + if (initial_display_delay_present_for_this_op) + aom_rb_read_literal(rb, 4); // initial_display_delay_minus_1 + } } } @@ -203,7 +273,7 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data, memset(&obu_header, 0, sizeof(obu_header)); size_t payload_size = 0; size_t bytes_read = 0; - int 
reduced_still_picture_hdr = 0; + uint8_t reduced_still_picture_hdr = 0; aom_codec_err_t status = aom_read_obu_header_and_size( data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read); if (status != AOM_CODEC_OK) return status; @@ -232,7 +302,7 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data, struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL }; av1_read_profile(&rb); // profile - const int still_picture = aom_rb_read_bit(&rb); + const uint8_t still_picture = aom_rb_read_bit(&rb); reduced_still_picture_hdr = aom_rb_read_bit(&rb); if (!still_picture && reduced_still_picture_hdr) { @@ -317,7 +387,6 @@ static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) { AV1_COMMON *const cm = &frame_worker_data->pbi->common; BufferPool *const pool = cm->buffer_pool; - cm->new_fb_idx = INVALID_IDX; cm->cur_frame = NULL; cm->byte_alignment = ctx->byte_alignment; cm->skip_loop_filter = ctx->skip_loop_filter; @@ -357,7 +426,6 @@ static int frame_worker_hook(void *arg1, void *arg2) { if (result != 0) { // Check decode result in serial decode. - frame_worker_data->pbi->common.cur_frame->buf.corrupted = 1; frame_worker_data->pbi->need_resync = 1; } return !result; @@ -367,7 +435,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) { int i; const AVxWorkerInterface *const winterface = aom_get_worker_interface(); - ctx->last_show_frame = -1; + ctx->last_show_frame = NULL; ctx->next_output_worker_id = 0; ctx->need_resync = 1; ctx->num_frame_workers = 1; @@ -449,8 +517,7 @@ static INLINE void check_resync(aom_codec_alg_priv_t *const ctx, const AV1Decoder *const pbi) { // Clear resync flag if worker got a key frame or intra only frame. 
if (ctx->need_resync == 1 && pbi->need_resync == 0 && - (pbi->common.current_frame.intra_only || - pbi->common.current_frame.frame_type == KEY_FRAME)) + frame_is_intra_only(&pbi->common)) ctx->need_resync = 0; } @@ -529,7 +596,7 @@ static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx, data2->idx = -1; for (int i = 0; i < REF_FRAMES; ++i) - if (cm->ref_frame_map[i] == cm->new_fb_idx) data2->idx = i; + if (cm->ref_frame_map[i] == cm->cur_frame) data2->idx = i; data2->buf = data; data2->show_existing = cm->show_existing_frame; return res; @@ -551,7 +618,6 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, // arguments are invalid. if (ctx->frame_workers) { BufferPool *const pool = ctx->buffer_pool; - RefCntBuffer *const frame_bufs = pool->frame_bufs; lock_buffer_pool(pool); for (int i = 0; i < ctx->num_frame_workers; ++i) { AVxWorker *const worker = &ctx->frame_workers[i]; @@ -559,7 +625,7 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx, (FrameWorkerData *)worker->data1; struct AV1Decoder *pbi = frame_worker_data->pbi; for (size_t j = 0; j < pbi->num_output_frames; j++) { - decrease_ref_count(pbi->output_frame_index[j], frame_bufs, pool); + decrease_ref_count(pbi->output_frames[j], pool); } pbi->num_output_frames = 0; } @@ -696,7 +762,6 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, (FrameWorkerData *)worker->data1; AV1Decoder *const pbi = frame_worker_data->pbi; AV1_COMMON *const cm = &pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; ctx->next_output_worker_id = (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; // Wait for the frame from worker thread. 
@@ -709,8 +774,8 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, aom_film_grain_t *grain_params; if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, &grain_params) == 0) { - const int buf_idx = pbi->output_frame_index[*index]; - ctx->last_show_frame = buf_idx; + RefCntBuffer *const output_frame_buf = pbi->output_frames[*index]; + ctx->last_show_frame = output_frame_buf; if (ctx->need_resync) return NULL; yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv); @@ -725,8 +790,10 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, const int num_planes = av1_num_planes(cm); if (pbi->ext_tile_debug && cm->single_tile_decoding && pbi->dec_tile_row >= 0) { + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); const int tile_row = AOMMIN(pbi->dec_tile_row, cm->tile_rows - 1); - const int mi_row = tile_row * cm->tile_height; + const int mi_row = tile_row * tile_height; const int ssy = ctx->img.y_chroma_shift; int plane; ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0]; @@ -736,14 +803,15 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane]; } } - ctx->img.d_h = - AOMMIN(cm->tile_height, cm->mi_rows - mi_row) * MI_SIZE; + ctx->img.d_h = AOMMIN(tile_height, cm->mi_rows - mi_row) * MI_SIZE; } if (pbi->ext_tile_debug && cm->single_tile_decoding && pbi->dec_tile_col >= 0) { + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1); - const int mi_col = tile_col * cm->tile_width; + const int mi_col = tile_col * tile_width; const int ssx = ctx->img.x_chroma_shift; const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
1 : 0; @@ -755,11 +823,10 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx, mi_col * (MI_SIZE >> ssx) * (1 + is_hbd); } } - ctx->img.d_w = - AOMMIN(cm->tile_width, cm->mi_cols - mi_col) * MI_SIZE; + ctx->img.d_w = AOMMIN(tile_width, cm->mi_cols - mi_col) * MI_SIZE; } - ctx->img.fb_priv = frame_bufs[buf_idx].raw_frame_buffer.priv; + ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv; img = &ctx->img; img->temporal_id = cm->temporal_layer_id; img->spatial_id = cm->spatial_layer_id; @@ -911,7 +978,8 @@ static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx, AVxWorker *const worker = ctx->frame_workers; FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - *update_info = frame_worker_data->pbi->refresh_frame_flags; + *update_info = + frame_worker_data->pbi->common.current_frame.refresh_frame_flags; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; @@ -940,11 +1008,10 @@ static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx, FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; AV1Decoder *const pbi = frame_worker_data->pbi; - RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs; if (pbi->seen_frame_header && pbi->num_output_frames == 0) return AOM_CODEC_ERROR; - if (ctx->last_show_frame >= 0) - *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; + if (ctx->last_show_frame != NULL) + *corrupted = ctx->last_show_frame->buf.corrupted; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; @@ -1124,8 +1191,9 @@ static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx, FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; - *tile_size = - ((cm->tile_width * MI_SIZE) << 16) + cm->tile_height * MI_SIZE; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + *tile_size = ((tile_width * MI_SIZE) << 16) + 
tile_height * MI_SIZE; return AOM_CODEC_OK; } else { return AOM_CODEC_ERROR; diff --git a/libaom/av1/av1_iface_common.h b/libaom/av1/av1_iface_common.h index 713d8c3..5568c89 100644 --- a/libaom/av1/av1_iface_common.h +++ b/libaom/av1/av1_iface_common.h @@ -124,7 +124,12 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img, } else { yv12->flags = 0; } - yv12->border = (yv12->y_stride - img->w) / 2; + + // Note(yunqing): if img is allocated the same as the frame buffer, y_stride + // is 32-byte aligned. Also, handle the cases while allocating img without a + // border or stride_align is less than 32. + int border = (yv12->y_stride - (int)((img->w + 31) & ~31)) / 2; + yv12->border = (border < 0) ? 0 : border; yv12->subsampling_x = img->x_chroma_shift; yv12->subsampling_y = img->y_chroma_shift; return AOM_CODEC_OK; diff --git a/libaom/av1/common/alloccommon.c b/libaom/av1/common/alloccommon.c index 39b6b73..1c8528a 100644 --- a/libaom/av1/common/alloccommon.c +++ b/libaom/av1/common/alloccommon.c @@ -139,7 +139,7 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) { // Now we need to allocate enough space to store the line buffers for the // stripes const int frame_w = cm->superres_upscaled_width; - const int use_highbd = cm->seq_params.use_highbitdepth ? 
1 : 0; + const int use_highbd = cm->seq_params.use_highbitdepth; for (int p = 0; p < num_planes; ++p) { const int is_uv = p > 0; diff --git a/libaom/av1/common/arm/av1_txfm_neon.c b/libaom/av1/common/arm/av1_txfm_neon.c index de3c547..7e3a05a 100644 --- a/libaom/av1/common/arm/av1_txfm_neon.c +++ b/libaom/av1/common/arm/av1_txfm_neon.c @@ -12,6 +12,8 @@ #include <arm_neon.h> #include <assert.h> +#include "config/av1_rtcd.h" + #include "aom_ports/mem.h" #include "av1/common/arm/mem_neon.h" diff --git a/libaom/av1/common/arm/jnt_convolve_neon.c b/libaom/av1/common/arm/jnt_convolve_neon.c index e5674ef..379ff98 100644 --- a/libaom/av1/common/arm/jnt_convolve_neon.c +++ b/libaom/av1/common/arm/jnt_convolve_neon.c @@ -23,19 +23,17 @@ #include "av1/common/arm/transpose_neon.h" #if !defined(__aarch64__) -static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0, - const uint16_t fwd_offset, - const uint16_t bck_offset, - const int16x4_t sub_const_vec, - const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0) { +static INLINE void compute_avg_4x1( + uint16x4_t res0, uint16x4_t d0, const uint16_t fwd_offset, + const uint16_t bck_offset, const int16x4_t sub_const_vec, + const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) { int16x4_t tmp0; uint16x4_t tmp_u0; uint32x4_t sum0; int32x4_t dst0; int16x8_t tmp4; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits)); sum0 = vmull_n_u16(res0, fwd_offset); @@ -65,12 +63,10 @@ static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0, } } -static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0, - const uint16_t fwd_offset, - const uint16_t bck_offset, - const int16x4_t sub_const, - const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0) { +static INLINE void compute_avg_8x1( + uint16x8_t res0, uint16x8_t d0, const uint16_t fwd_offset, + const uint16_t bck_offset, const int16x4_t 
sub_const, + const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) { int16x4_t tmp0, tmp2; int16x8_t f0; uint32x4_t sum0, sum2; @@ -78,7 +74,7 @@ static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0, uint16x8_t tmp_u0; - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t sub_const_vec = vmovl_s16(sub_const); const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits); @@ -123,7 +119,7 @@ static INLINE void compute_avg_4x4( uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t sub_const_vec, const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) { + const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1) { int16x4_t tmp0, tmp1, tmp2, tmp3; uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; uint32x4_t sum0, sum1, sum2, sum3; @@ -132,7 +128,7 @@ static INLINE void compute_avg_4x4( int16x8_t tmp4, tmp5; const int16x8_t zero = vdupq_n_s16(0); - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits)); const int32x4_t const_vec = vmovl_s16(sub_const_vec); @@ -203,8 +199,8 @@ static INLINE void compute_avg_8x4( uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3, const uint16_t fwd_offset, const uint16_t bck_offset, const int16x4_t sub_const, const int16_t round_bits, - const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2, - uint8x8_t *t3) { + const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1, + uint8x8_t *t2, uint8x8_t *t3) { int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; int16x8_t f0, f1, f2, f3; uint32x4_t sum0, sum1, sum2, sum3; @@ -214,7 +210,7 @@ static INLINE void compute_avg_8x4( uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3; const int16x8_t zero = vdupq_n_s16(0); - if (use_jnt_comp_avg) { + if (use_dist_wtd_comp_avg) { const int32x4_t sub_const_vec = vmovl_s16(sub_const); const int32x4_t 
round_bits_vec = vdupq_n_s32(-(int32_t)round_bits); @@ -319,7 +315,7 @@ static INLINE void compute_avg_8x4( } } -static INLINE void jnt_convolve_2d_horiz_neon( +static INLINE void dist_wtd_convolve_2d_horiz_neon( const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride, int16_t *x_filter_tmp, const int im_h, int w, const int round_0) { const int bd = 8; @@ -563,7 +559,7 @@ static INLINE void jnt_convolve_2d_horiz_neon( } } -static INLINE void jnt_convolve_2d_vert_neon( +static INLINE void dist_wtd_convolve_2d_vert_neon( int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride, ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) { uint8_t *dst_u8_ptr, *d_u8; @@ -587,7 +583,7 @@ static INLINE void jnt_convolve_2d_vert_neon( const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7; uint16x4_t res4, d0; @@ -652,8 +648,8 @@ static INLINE void jnt_convolve_2d_vert_neon( d += (dst_stride << 2); compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset, - bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg, - &t0, &t1); + bck_offset, sub_const_vec, round_bits, + use_dist_wtd_comp_avg, &t0, &t1); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -691,7 +687,7 @@ static INLINE void jnt_convolve_2d_vert_neon( d += (dst_stride); compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec, - round_bits, use_jnt_comp_avg, &t0); + round_bits, use_dist_wtd_comp_avg, &t0); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -717,12 +713,12 @@ static INLINE void jnt_convolve_2d_vert_neon( } while (w > 0); } -void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t 
*dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { assert(!(w % 4)); assert(!(h % 4)); @@ -748,19 +744,18 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8, filter_x_coef = vshrq_n_s16(filter_x_coef, 1); vst1q_s16(&x_filter_tmp[0], filter_x_coef); - jnt_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, - x_filter_tmp, im_h, w, round_0); + dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride, + x_filter_tmp, im_h, w, round_0); - jnt_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, conv_params, - y_filter, h, w); + dist_wtd_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, + conv_params, y_filter, h, w); } -void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, - uint8_t *dst8, int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_copy_neon( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params) { uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2, tmp_shift3; uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3; @@ -811,7 +806,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, 
compute_avg_8x4(tmp_q0, tmp_q1, tmp_q2, tmp_q3, res_q0, res_q1, res_q2, res_q3, conv_params->fwd_offset, conv_params->bck_offset, sub_const_vec, bits, - conv_params->use_jnt_comp_avg, &tmp_shift0, + conv_params->use_dist_wtd_comp_avg, &tmp_shift0, &tmp_shift1, &tmp_shift2, &tmp_shift3); vst1_u8(dst8_1 + (0 * dst8_stride), tmp_shift0); @@ -854,7 +849,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, compute_avg_4x4(tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7, conv_params->fwd_offset, conv_params->bck_offset, - sub_const_vec, bits, conv_params->use_jnt_comp_avg, + sub_const_vec, bits, conv_params->use_dist_wtd_comp_avg, &tmp_shift0, &tmp_shift1); vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 0); @@ -881,12 +876,12 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, } } -void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { assert(!(w % 4)); assert(!(h % 4)); @@ -902,7 +897,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; (void)filter_params_y; (void)subpel_y_q4; @@ -1031,8 +1026,8 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, 
uint8_t *dst8, compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0), vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), vreinterpret_u16_s16(d3), fwd_offset, bck_offset, - round_offset_vec, round_bits, use_jnt_comp_avg, &t0, - &t1); + round_offset_vec, round_bits, use_dist_wtd_comp_avg, + &t0, &t1); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); // 00 01 02 03 @@ -1103,7 +1098,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, bck_offset, round_offset_vec, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); // 00 01 02 03 @@ -1231,11 +1226,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res0), - vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), - vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), + vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1243,11 +1239,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res4), - vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), - vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + 
compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), + vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1319,7 +1316,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, bck_offset, round_offset64, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_u8(d_u8, t0); d_u8 += (dst8_stride); @@ -1342,12 +1339,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8, } } -void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { assert(!(w % 4)); assert(!(h % 4)); @@ -1363,7 +1360,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const uint16_t fwd_offset = conv_params->fwd_offset; const uint16_t bck_offset = conv_params->bck_offset; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int shift_value = (conv_params->round_1 - 1 - bits); (void)filter_params_x; @@ -1489,8 +1486,8 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_4x4(res4, res5, res6, res7, 
vreinterpret_u16_s16(d0), vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2), vreinterpret_u16_s16(d3), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, - &t1); + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -1535,7 +1532,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset, bck_offset, round_offset64, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); d_u8 += dst8_stride; @@ -1654,11 +1651,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res0), - vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2), - vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0), + vreinterpretq_u16_s16(res1), + vreinterpretq_u16_s16(res2), + vreinterpretq_u16_s16(res3), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1666,11 +1664,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); d_tmp += (dst_stride << 2); - compute_avg_8x4( - res8, res9, res10, res11, vreinterpretq_u16_s16(res4), - vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6), - vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, - round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3); + compute_avg_8x4(res8, res9, res10, res11, 
vreinterpretq_u16_s16(res4), + vreinterpretq_u16_s16(res5), + vreinterpretq_u16_s16(res6), + vreinterpretq_u16_s16(res7), fwd_offset, bck_offset, + round_offset64, round_bits, use_dist_wtd_comp_avg, + &t0, &t1, &t2, &t3); store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3); d_u8 += (dst8_stride << 2); @@ -1718,7 +1717,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8, compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset, bck_offset, round_offset64, round_bits, - use_jnt_comp_avg, &t0); + use_dist_wtd_comp_avg, &t0); vst1_u8(d_u8, t0); d_u8 += (dst8_stride); diff --git a/libaom/av1/common/arm/warp_plane_neon.c b/libaom/av1/common/arm/warp_plane_neon.c index 7f02d42..1062cc3 100644 --- a/libaom/av1/common/arm/warp_plane_neon.c +++ b/libaom/av1/common/arm/warp_plane_neon.c @@ -640,7 +640,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, uint16x4_t tmp16_lo = vld1_u16(p); int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo)); int16x4_t tmp16_low; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_lo = vmulq_s32(res_lo, bwd); tmp32_lo = vmulq_s32(tmp32_lo, fwd); tmp32_lo = vaddq_s32(tmp32_lo, res_lo); @@ -671,7 +671,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width, uint16x4_t tmp16_hi = vld1_u16(p4); int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi)); int16x4_t tmp16_high; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_hi = vmulq_s32(res_hi, bwd); tmp32_hi = vmulq_s32(tmp32_hi, fwd); tmp32_hi = vaddq_s32(tmp32_hi, res_hi); diff --git a/libaom/av1/common/av1_inv_txfm2d.c b/libaom/av1/common/av1_inv_txfm2d.c index 4f2d57b..fc9c8d2 100644 --- a/libaom/av1/common/av1_inv_txfm2d.c +++ b/libaom/av1/common/av1_inv_txfm2d.c @@ -228,7 +228,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, (void)real_range_row; if (cfg->txfm_type_row == 
TXFM_TYPE_ADST4 && i == 1) { // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 - // so opt_range_col >= real_range_col will not hold + // so opt_range_row >= real_range_row will not hold stage_range_row[i] = opt_range_row; } else { assert(opt_range_row >= real_range_row); @@ -241,7 +241,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row, cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1; (void)real_range_col; if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) { - // the adst4 may use 1 extra bit on top of opt_range_row at stage 1 + // the adst4 may use 1 extra bit on top of opt_range_col at stage 1 // so opt_range_col >= real_range_col will not hold stage_range_col[i] = opt_range_col; } else { diff --git a/libaom/av1/common/av1_loopfilter.c b/libaom/av1/common/av1_loopfilter.c index c5a86fb..0aa1f9b 100644 --- a/libaom/av1/common/av1_loopfilter.c +++ b/libaom/av1/common/av1_loopfilter.c @@ -32,7 +32,7 @@ static const int delta_lf_id_lut[MAX_MB_PLANE][2] = { { 0, 1 }, { 2, 2 }, { 3, 3 } }; -typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR; +enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR); static const int mode_lf_lut[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES @@ -1426,9 +1426,9 @@ static void highbd_filter_selectively_horiz( lfi->hev_thr, lfin->mblim, lfin->lim, lfin->hev_thr, bd); } else { - aom_highbd_lpf_horizontal_14_dual_c(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, lfin->mblim, - lfin->lim, lfin->hev_thr, bd); + aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, lfin->mblim, + lfin->lim, lfin->hev_thr, bd); } count = 2; } else { diff --git a/libaom/av1/common/av1_rtcd_defs.pl b/libaom/av1/common/av1_rtcd_defs.pl index 7049f16..aca5ec7 100755..100644 --- a/libaom/av1/common/av1_rtcd_defs.pl +++ b/libaom/av1/common/av1_rtcd_defs.pl @@ -81,8 +81,11 @@ specialize qw/av1_highbd_wiener_convolve_add_src 
avx2/; # directional intra predictor functions add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy"; +specialize qw/av1_dr_prediction_z1 avx2/; add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy"; +specialize qw/av1_dr_prediction_z2 avx2/; add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy"; +specialize qw/av1_dr_prediction_z3 avx2/; # FILTER_INTRA predictor functions add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode"; @@ -108,31 +111,19 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64"; #inv txfm add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_inv_txfm_add ssse3 avx2 neon/; +# TODO(http://crbug.com/aomedia/2350): avx2 is disabled due to test vector +# mismatches. +specialize qw/av1_inv_txfm_add ssse3 neon/; add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/; +# TODO(http://crbug.com/aomedia/2350): avx2 is disabled due to test vector +# mismatches. 
+specialize qw/av1_highbd_inv_txfm_add sse4_1/; add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/; add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1 avx2/; -add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1 avx2/; -add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1 avx2/; -add_proto qw/void av1_highbd_inv_txfm_add_16x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1 avx2/; -add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/; -add_proto qw/void av1_highbd_inv_txfm_add_16x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_16x32 sse4_1 avx2/; -add_proto qw/void av1_highbd_inv_txfm_add_32x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_32x16 sse4_1 avx2/; -add_proto qw/void av1_highbd_inv_txfm_add_8x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_8x32 sse4_1 avx2/; -add_proto qw/void av1_highbd_inv_txfm_add_32x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; -specialize qw/av1_highbd_inv_txfm_add_32x8 sse4_1 avx2/; +specialize 
qw/av1_highbd_inv_txfm_add_8x8 sse4_1/; add_proto qw/void av1_highbd_inv_txfm_add_4x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1/; add_proto qw/void av1_highbd_inv_txfm_add_8x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param"; @@ -173,7 +164,9 @@ add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *out add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd"; specialize qw/av1_highbd_dr_prediction_z1 avx2/; add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd"; -#specialize qw/av1_highbd_dr_prediction_z2 avx2/; +# TODO(niva213@gmail.com): Re-enable avx2 after fixing valgrind issue +# https://crbug.com/aomedia/2316 +# specialize qw/av1_highbd_dr_prediction_z2 avx2/; add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd"; specialize qw/av1_highbd_dr_prediction_z3 avx2/; @@ -187,6 +180,10 @@ specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/; add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd"; specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/; +# Helper functions. +add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit"; +specialize "av1_round_shift_array", qw/sse4_1 neon/; + # # Encoder functions below this point. 
# @@ -221,9 +218,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; specialize qw/av1_fwd_txfm2d_8x4 sse4_1/; add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_8x16 sse4_1/; + specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_16x8 sse4_1/; + specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; specialize qw/av1_fwd_txfm2d_16x32 sse4_1/; add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; @@ -239,14 +236,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; specialize qw/av1_fwd_txfm2d_4x4 sse4_1/; add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_8x8 sse4_1/; + specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_16x16 sse4_1/; + specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_32x32 sse4_1/; + specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; - specialize qw/av1_fwd_txfm2d_64x64 sse4_1/; + specialize 
qw/av1_fwd_txfm2d_64x64 sse4_1 avx2/; add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; specialize qw/av1_fwd_txfm2d_32x64 sse4_1/; add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; @@ -263,17 +260,18 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv"; - add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; - specialize qw/av1_temporal_filter_apply sse2 msa/; + add_proto qw/void av1_apply_temporal_filter/, "const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred, int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1, int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred, int uv_buf_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count"; + specialize qw/av1_apply_temporal_filter sse4_1/; add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale"; # ENCODEMB INVOKE add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t 
*dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; - specialize qw/av1_highbd_block_error sse2/; + specialize qw/av1_highbd_block_error sse2 avx2/; - add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"; + add_proto qw/void av1_highbd_apply_temporal_filter/, "const uint8_t *yf, int y_stride, const uint8_t *yp, int y_buf_stride, const uint8_t *uf, const uint8_t *vf, int uv_stride, const uint8_t *up, const uint8_t *vp, int uv_buf_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count"; + specialize qw/av1_highbd_apply_temporal_filter sse4_1/; add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale"; specialize qw/av1_highbd_quantize_fp sse4_1 avx2/; @@ -347,7 +345,7 @@ specialize qw/av1_highbd_warp_affine sse4_1/; if (aom_config("CONFIG_AV1_ENCODER") eq "yes") { add_proto qw/double compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2"; - specialize qw/compute_cross_correlation sse4_1/; + specialize qw/compute_cross_correlation sse4_1 avx2/; } # LOOP_RESTORATION functions @@ -366,18 +364,18 @@ add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const 
InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; -add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const 
InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; +add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params"; add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int 
src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; -add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int 
subpel_y_q4, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; +add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd"; add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params"; add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd"; @@ -387,19 +385,19 @@ add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int sr specialize qw/av1_convolve_x_sr sse2 avx2 neon/; specialize qw/av1_convolve_y_sr sse2 avx2 neon/; specialize qw/av1_convolve_2d_scale sse4_1/; - specialize qw/av1_jnt_convolve_2d sse2 ssse3 avx2 neon/; - specialize qw/av1_jnt_convolve_2d_copy sse2 avx2 neon/; - specialize qw/av1_jnt_convolve_x sse2 avx2 neon/; - specialize qw/av1_jnt_convolve_y sse2 avx2 neon/; + specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon/; + specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/; + specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon/; + 
specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/; specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/; specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/; specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/; specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/; specialize qw/av1_highbd_convolve_2d_scale sse4_1/; - specialize qw/av1_highbd_jnt_convolve_2d sse4_1 avx2/; - specialize qw/av1_highbd_jnt_convolve_x sse4_1 avx2/; - specialize qw/av1_highbd_jnt_convolve_y sse4_1 avx2/; - specialize qw/av1_highbd_jnt_convolve_2d_copy sse4_1 avx2/; + specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2/; + specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2/; + specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2/; + specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2/; # INTRA_EDGE functions add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength"; diff --git a/libaom/av1/common/av1_txfm.c b/libaom/av1/common/av1_txfm.c index 4fbb756..ac43402 100644 --- a/libaom/av1/common/av1_txfm.c +++ b/libaom/av1/common/av1_txfm.c @@ -10,6 +10,7 @@ */ #include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/av1_txfm.h" diff --git a/libaom/av1/common/av1_txfm.h b/libaom/av1/common/av1_txfm.h index 59d64ca..20049b6 100644 --- a/libaom/av1/common/av1_txfm.h +++ b/libaom/av1/common/av1_txfm.h @@ -59,7 +59,9 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) { const int64_t min_value = -(1LL << (bit - 1)); if (value < min_value || value > max_value) { fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit); +#if !CONFIG_AV1_ENCODER assert(0); +#endif } #endif // CONFIG_COEFFICIENT_RANGE_CHECKING #if DO_RANGE_CHECK_CLAMP @@ -110,7 +112,7 @@ typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit, typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd); -typedef enum TXFM_TYPE { +enum { TXFM_TYPE_DCT4, 
TXFM_TYPE_DCT8, TXFM_TYPE_DCT16, @@ -125,7 +127,7 @@ typedef enum TXFM_TYPE { TXFM_TYPE_IDENTITY32, TXFM_TYPES, TXFM_TYPE_INVALID, -} TXFM_TYPE; +} UENUM1BYTE(TXFM_TYPE); typedef struct TXFM_2D_FLIP_CFG { TX_SIZE tx_size; diff --git a/libaom/av1/common/blockd.h b/libaom/av1/common/blockd.h index d6727b8..91ef3df 100644 --- a/libaom/av1/common/blockd.h +++ b/libaom/av1/common/blockd.h @@ -38,19 +38,19 @@ extern "C" { #define MAX_DIFFWTD_MASK_BITS 1 // DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS -typedef enum ATTRIBUTE_PACKED { +enum { DIFFWTD_38 = 0, DIFFWTD_38_INV, DIFFWTD_MASK_TYPES, -} DIFFWTD_MASK_TYPE; +} UENUM1BYTE(DIFFWTD_MASK_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { KEY_FRAME = 0, INTER_FRAME = 1, INTRA_ONLY_FRAME = 2, // replaces intra-only S_FRAME = 3, FRAME_TYPES, -} FRAME_TYPE; +} UENUM1BYTE(FRAME_TYPE); static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) { return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8; @@ -157,15 +157,15 @@ static INLINE int is_masked_compound_type(COMPOUND_TYPE type) { is a single probability table. 
*/ typedef struct { - // Number of base colors for Y (0) and UV (1) - uint8_t palette_size[2]; // Value of base colors for Y, U, and V uint16_t palette_colors[3 * PALETTE_MAX_SIZE]; + // Number of base colors for Y (0) and UV (1) + uint8_t palette_size[2]; } PALETTE_MODE_INFO; typedef struct { - uint8_t use_filter_intra; FILTER_INTRA_MODE filter_intra_mode; + uint8_t use_filter_intra; } FILTER_INTRA_MODE_INFO; static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = { @@ -190,11 +190,6 @@ typedef struct RD_STATS { int64_t ref_rdcost; int zero_rate; uint8_t invalid_rate; -#if CONFIG_ONE_PASS_SVM - int eob, eob_0, eob_1, eob_2, eob_3; - int64_t rd, rd_0, rd_1, rd_2, rd_3; - int64_t y_sse, sse_0, sse_1, sse_2, sse_3; -#endif #if CONFIG_RD_DEBUG int txb_coeff_cost[MAX_MB_PLANE]; int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE] @@ -205,10 +200,10 @@ typedef struct RD_STATS { // This struct is used to group function args that are commonly // sent together in functions related to interinter compound modes typedef struct { + uint8_t *seg_mask; int wedge_index; int wedge_sign; DIFFWTD_MASK_TYPE mask_type; - uint8_t *seg_mask; COMPOUND_TYPE type; } INTERINTER_COMPOUND_DATA; @@ -216,48 +211,18 @@ typedef struct { #define TXK_TYPE_BUF_LEN 64 // This structure now relates to 4x4 block regions. 
typedef struct MB_MODE_INFO { - // Common for both INTER and INTRA blocks - BLOCK_SIZE sb_type; - PREDICTION_MODE mode; - TX_SIZE tx_size; - uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN]; - int8_t skip; - int8_t skip_mode; - int8_t segment_id; - int8_t seg_id_predicted; // valid only when temporal_update is enabled - - // Only for INTRA blocks - UV_PREDICTION_MODE uv_mode; - PALETTE_MODE_INFO palette_mode_info; - uint8_t use_intrabc; - + WarpedMotionParams wm_params; + // interinter members + INTERINTER_COMPOUND_DATA interinter_comp; + FILTER_INTRA_MODE_INFO filter_intra_mode_info; + int_mv mv[2]; // Only for INTER blocks InterpFilters interp_filters; - MV_REFERENCE_FRAME ref_frame[2]; - - TX_TYPE txk_type[TXK_TYPE_BUF_LEN]; - - FILTER_INTRA_MODE_INFO filter_intra_mode_info; - - // The actual prediction angle is the base angle + (angle_delta * step). - int8_t angle_delta[PLANE_TYPES]; - - // interintra members - INTERINTRA_MODE interintra_mode; // TODO(debargha): Consolidate these flags - int use_wedge_interintra; int interintra_wedge_index; int interintra_wedge_sign; - // interinter members - INTERINTER_COMPOUND_DATA interinter_comp; - MOTION_MODE motion_mode; int overlappable_neighbors[2]; - int_mv mv[2]; - uint8_t ref_mv_idx; - PARTITION_TYPE partition; - /* deringing gain *per-superblock* */ - int8_t cdef_strength; int current_qindex; int delta_lf_from_base; int delta_lf[FRAME_LF_COUNT]; @@ -267,15 +232,43 @@ typedef struct MB_MODE_INFO { int mi_col; #endif int num_proj_ref; - WarpedMotionParams wm_params; // Index of the alpha Cb and alpha Cr combination int cfl_alpha_idx; // Joint sign of alpha Cb and alpha Cr int cfl_alpha_signs; - int compound_idx; + // Indicate if masked compound is used(1) or not(0). int comp_group_idx; + // If comp_group_idx=0, indicate if dist_wtd_comp(0) or avg_comp(1) is used. 
+ int compound_idx; +#if CONFIG_INSPECTION + int16_t tx_skip[TXK_TYPE_BUF_LEN]; +#endif + // Common for both INTER and INTRA blocks + BLOCK_SIZE sb_type; + PREDICTION_MODE mode; + // Only for INTRA blocks + UV_PREDICTION_MODE uv_mode; + // interintra members + INTERINTRA_MODE interintra_mode; + MOTION_MODE motion_mode; + PARTITION_TYPE partition; + TX_TYPE txk_type[TXK_TYPE_BUF_LEN]; + MV_REFERENCE_FRAME ref_frame[2]; + int8_t use_wedge_interintra; + int8_t skip; + int8_t skip_mode; + uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN]; + TX_SIZE tx_size; + int8_t segment_id; + int8_t seg_id_predicted; // valid only when temporal_update is enabled + uint8_t use_intrabc; + // The actual prediction angle is the base angle + (angle_delta * step). + int8_t angle_delta[PLANE_TYPES]; + /* deringing gain *per-superblock* */ + int8_t cdef_strength; + uint8_t ref_mv_idx; } MB_MODE_INFO; static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) { @@ -375,7 +368,7 @@ static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, } #endif -enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 }; +enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision); struct buf_2d { uint8_t *buf; @@ -431,14 +424,6 @@ typedef struct macroblockd_plane { #define BLOCK_OFFSET(x, i) \ ((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0]))) -struct RefCntBuffer; - -typedef struct RefBuffer { - int map_idx; // frame map idx - struct RefCntBuffer *buf; - struct scale_factors sf; -} RefBuffer; - typedef struct { DECLARE_ALIGNED(16, InterpKernel, vfilter); DECLARE_ALIGNED(16, InterpKernel, hfilter); @@ -494,11 +479,13 @@ typedef struct cfl_ctx { int is_chroma_reference; } CFL_CTX; -typedef struct jnt_comp_params { - int use_jnt_comp_avg; +typedef struct dist_wtd_comp_params { + int use_dist_wtd_comp_avg; int fwd_offset; int bck_offset; -} JNT_COMP_PARAMS; +} DIST_WTD_COMP_PARAMS; + +struct scale_factors; // Most/all of the pointers are mere pointers to 
actual arrays are allocated // elsewhere. This is mostly for coding convenience. @@ -526,8 +513,8 @@ typedef struct macroblockd { int mb_to_top_edge; int mb_to_bottom_edge; - /* pointers to reference frames */ - const RefBuffer *block_refs[2]; + /* pointers to reference frame scale factors */ + const struct scale_factors *block_ref_scale_factors[2]; /* pointer to current frame */ const YV12_BUFFER_CONFIG *cur_buf; @@ -596,7 +583,7 @@ typedef struct macroblockd { uint8_t *mc_buf[2]; CFL_CTX cfl; - JNT_COMP_PARAMS jcp_param; + DIST_WTD_COMP_PARAMS jcp_param; uint16_t cb_offset[MAX_MB_PLANE]; uint16_t txb_offset[MAX_MB_PLANE]; @@ -606,7 +593,7 @@ typedef struct macroblockd { uint8_t *tmp_obmc_bufs[2]; } MACROBLOCKD; -static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) { +static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) { return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0; } @@ -781,11 +768,13 @@ static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size, static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd, - TX_SIZE tx_size) { + TX_SIZE tx_size, + int is_screen_content_type) { const MB_MODE_INFO *const mbmi = xd->mi[0]; if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y || - xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32) + xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 || + is_screen_content_type) return DCT_DCT; return intra_mode_to_tx_type(mbmi, plane_type); @@ -1049,7 +1038,8 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd, if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION; assert(!has_second_ref(mbmi)); if (mbmi->num_proj_ref >= 1 && - (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) { + (allow_warped_motion && + !av1_is_scaled(xd->block_ref_scale_factors[0]))) { if (xd->cur_frame_force_integer_mv) { return OBMC_CAUSAL; } diff --git a/libaom/av1/common/cdef.c b/libaom/av1/common/cdef.c index 
556dede..63f9883 100644 --- a/libaom/av1/common/cdef.c +++ b/libaom/av1/common/cdef.c @@ -80,7 +80,6 @@ int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col, if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) { dlist[count].by = r >> r_shift; dlist[count].bx = c >> c_shift; - dlist[count].skip = 0; count++; } } diff --git a/libaom/av1/common/cdef_block.c b/libaom/av1/common/cdef_block.c index 845df37..dfd5882 100644 --- a/libaom/av1/common/cdef_block.c +++ b/libaom/av1/common/cdef_block.c @@ -232,8 +232,8 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in, } for (bi = 0; bi < cdef_count; bi++) { - int t = dlist[bi].skip ? 0 : pri_strength; - int s = dlist[bi].skip ? 0 : sec_strength; + int t = pri_strength; + int s = sec_strength; by = dlist[bi].by; bx = dlist[bi].bx; if (dst8) diff --git a/libaom/av1/common/cdef_block.h b/libaom/av1/common/cdef_block.h index 0e921e0..8321d48 100644 --- a/libaom/av1/common/cdef_block.h +++ b/libaom/av1/common/cdef_block.h @@ -38,7 +38,6 @@ DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]); typedef struct { uint8_t by; uint8_t bx; - uint8_t skip; } cdef_list; typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16, diff --git a/libaom/av1/common/cfl.c b/libaom/av1/common/cfl.c index 99410be..65e18e8 100644 --- a/libaom/av1/common/cfl.c +++ b/libaom/av1/common/cfl.c @@ -37,7 +37,7 @@ void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input, assert(pred_plane < CFL_PRED_PLANES); assert(width <= CFL_BUF_LINE); - if (get_bitdepth_data_path_index(xd)) { + if (is_cur_buf_hbd(xd)) { uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input); memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1); return; @@ -69,7 +69,7 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, assert(pred_plane < CFL_PRED_PLANES); assert(width <= CFL_BUF_LINE); assert(height <= CFL_BUF_LINE); - if 
(get_bitdepth_data_path_index(xd)) { + if (is_cur_buf_hbd(xd)) { uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride, width, height); @@ -196,7 +196,7 @@ void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride, cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1); assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <= CFL_BUF_SQUARE); - if (get_bitdepth_data_path_index(xd)) { + if (is_cur_buf_hbd(xd)) { uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst); get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3, xd->bd); @@ -388,8 +388,7 @@ void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size, assert(!((row & 1) && tx_size_high[tx_size] != 4)); sub8x8_adjust_offset(cfl, &row, &col); } - cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, - get_bitdepth_data_path_index(xd)); + cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd)); } void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) { @@ -405,5 +404,5 @@ void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) { const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size); tx_size = get_tx_size(width, height); cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size, - get_bitdepth_data_path_index(xd)); + is_cur_buf_hbd(xd)); } diff --git a/libaom/av1/common/convolve.c b/libaom/av1/common/convolve.c index 8ba3ed4..5a55ece 100644 --- a/libaom/av1/common/convolve.c +++ b/libaom/av1/common/convolve.c @@ -238,16 +238,16 @@ void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, (void)conv_params; for (int y = 0; y < h; ++y) { - memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); + memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); } } -void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t 
*dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]; @@ -290,7 +290,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->do_average) { int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -308,12 +308,12 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8, } } -void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; @@ -341,7 +341,7 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, if 
(conv_params->do_average) { int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -358,12 +358,12 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8, } } -void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, - int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, + uint8_t *dst8, int dst8_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; @@ -391,7 +391,7 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, if (conv_params->do_average) { int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -408,12 +408,11 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8, } } -void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, - uint8_t *dst8, int dst8_stride, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_copy_c( + const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w, + int h, const InterpFilterParams *filter_params_x, + const 
InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bits = @@ -434,7 +433,7 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, if (conv_params->do_average) { int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -511,7 +510,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -632,7 +631,7 @@ void av1_highbd_convolve_2d_copy_sr_c( (void)bd; for (int y = 0; y < h; ++y) { - memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); + memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0])); } } @@ -748,13 +747,11 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride, } } -void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, - uint16_t *dst16, int dst16_stride, int w, - int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { +void av1_highbd_dist_wtd_convolve_2d_c( + const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, + int w, int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { int x, y, k; int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * 
MAX_SB_SIZE]; CONV_BUF_TYPE *dst = conv_params->dst; @@ -799,7 +796,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1); if (conv_params->do_average) { int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -817,13 +814,11 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, } } -void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, - uint16_t *dst16, int dst16_stride, int w, - int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { +void av1_highbd_dist_wtd_convolve_x_c( + const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, + int w, int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_horiz = filter_params_x->taps / 2 - 1; @@ -851,7 +846,7 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, if (conv_params->do_average) { int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -868,13 +863,11 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, } } -void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, - uint16_t *dst16, int dst16_stride, int w, - int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int 
subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params, int bd) { +void av1_highbd_dist_wtd_convolve_y_c( + const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, + int w, int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params, int bd) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int fo_vert = filter_params_y->taps / 2 - 1; @@ -902,7 +895,7 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, if (conv_params->do_average) { int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -919,7 +912,7 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, } } -void av1_highbd_jnt_convolve_2d_copy_c( +void av1_highbd_dist_wtd_convolve_2d_copy_c( const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, @@ -943,7 +936,7 @@ void av1_highbd_jnt_convolve_2d_copy_c( res += round_offset; if (conv_params->do_average) { int32_t tmp = dst[y * dst_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -1019,7 +1012,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { diff --git 
a/libaom/av1/common/convolve.h b/libaom/av1/common/convolve.h index d0972db..e5479e6 100644 --- a/libaom/av1/common/convolve.h +++ b/libaom/av1/common/convolve.h @@ -26,7 +26,7 @@ typedef struct ConvolveParams { int round_1; int plane; int is_compound; - int use_jnt_comp_avg; + int use_dist_wtd_comp_avg; int fwd_offset; int bck_offset; } ConvolveParams; diff --git a/libaom/av1/common/debugmodes.c b/libaom/av1/common/debugmodes.c index 5242f19..b26c7dd 100644 --- a/libaom/av1/common/debugmodes.c +++ b/libaom/av1/common/debugmodes.c @@ -40,7 +40,7 @@ static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor, mi++; } fprintf(file, "\n"); - mi += MAX_MIB_SIZE; + mi += cm->mi_stride - cols; } fprintf(file, "\n"); } @@ -68,7 +68,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { mi++; } fprintf(mvs, "\n"); - mi += MAX_MIB_SIZE; + mi += cm->mi_stride - cols; } fprintf(mvs, "\n"); @@ -82,7 +82,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) { mi++; } fprintf(mvs, "\n"); - mi += MAX_MIB_SIZE; + mi += cm->mi_stride - cols; } fprintf(mvs, "\n"); diff --git a/libaom/av1/common/entropy.c b/libaom/av1/common/entropy.c index 4f95ef6..f63ac98 100644 --- a/libaom/av1/common/entropy.c +++ b/libaom/av1/common/entropy.c @@ -101,7 +101,7 @@ void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) { RESET_CDF_COUNTER(fc->refmv_cdf, 2); RESET_CDF_COUNTER(fc->drl_cdf, 2); RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES); - RESET_CDF_COUNTER(fc->compound_type_cdf, COMPOUND_TYPES - 1); + RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES); RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16); RESET_CDF_COUNTER(fc->interintra_cdf, 2); RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2); diff --git a/libaom/av1/common/entropy.h b/libaom/av1/common/entropy.h index 991692c..41218d3 100644 --- a/libaom/av1/common/entropy.h +++ b/libaom/av1/common/entropy.h @@ -54,12 +54,12 @@ extern "C" { #define 
BASE_CONTEXT_POSITION_NUM 12 -typedef enum TX_CLASS { +enum { TX_CLASS_2D = 0, TX_CLASS_HORIZ = 1, TX_CLASS_VERT = 2, TX_CLASSES = 3, -} TX_CLASS; +} UENUM1BYTE(TX_CLASS); #define DCT_MAX_VALUE 16384 #define DCT_MAX_VALUE_HIGH10 65536 diff --git a/libaom/av1/common/entropymode.c b/libaom/av1/common/entropymode.c index 51bbea7..90702ac 100644 --- a/libaom/av1/common/entropymode.c +++ b/libaom/av1/common/entropymode.c @@ -488,17 +488,17 @@ static const aom_cdf_prob { AOM_CDF2(16384) } }; -static const aom_cdf_prob - default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)] = { - { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, - { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, - { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, - { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, - { AOM_CDF2(16384) } - }; +static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE( + MASKED_COMPOUND_TYPES)] = { + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) }, + { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) }, + { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, + { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) }, + { AOM_CDF2(16384) } +}; static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)] = { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432, @@ -1072,9 +1072,9 @@ void av1_setup_frame_contexts(AV1_COMMON *cm) { // TODO(jack.haughton@argondesign.com): don't think this should be necessary, // but could do with fuller testing 
if (cm->large_scale_tile) { - for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - if (cm->current_frame.frame_refs[i].buf != NULL) - cm->current_frame.frame_refs[i].buf->frame_context = *cm->fc; + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + RefCntBuffer *const buf = get_ref_frame_buf(cm, i); + if (buf != NULL) buf->frame_context = *cm->fc; } for (int i = 0; i < FRAME_BUFFERS; ++i) cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc; @@ -1086,10 +1086,8 @@ void av1_setup_past_independence(AV1_COMMON *cm) { // Features disabled, 0, with delta coding (Default state). av1_clearall_segfeatures(&cm->seg); - cm->current_frame_seg_map = cm->cur_frame->seg_map; - - if (cm->current_frame_seg_map) - memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); + if (cm->cur_frame->seg_map) + memset(cm->cur_frame->seg_map, 0, (cm->mi_rows * cm->mi_cols)); // reset mode ref deltas av1_set_default_ref_deltas(cm->cur_frame->ref_deltas); @@ -1099,7 +1097,6 @@ void av1_setup_past_independence(AV1_COMMON *cm) { av1_default_coef_probs(cm); init_mode_probs(cm->fc); av1_init_mv_probs(cm); - av1_init_lv_map(cm); cm->fc->initialized = 1; av1_setup_frame_contexts(cm); diff --git a/libaom/av1/common/entropymode.h b/libaom/av1/common/entropymode.h index 7047f34..69b5218 100644 --- a/libaom/av1/common/entropymode.h +++ b/libaom/av1/common/entropymode.h @@ -92,7 +92,8 @@ typedef struct frame_contexts { aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS] [CDF_SIZE(INTER_COMPOUND_MODES)]; - aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)]; + aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL] + [CDF_SIZE(MASKED_COMPOUND_TYPES)]; aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)]; aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)]; aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)]; diff --git a/libaom/av1/common/entropymv.h b/libaom/av1/common/entropymv.h index fa818a2..cddc807 100644 --- a/libaom/av1/common/entropymv.h 
+++ b/libaom/av1/common/entropymv.h @@ -30,12 +30,12 @@ void av1_init_mv_probs(struct AV1Common *cm); /* Symbols for coding which components are zero jointly */ #define MV_JOINTS 4 -typedef enum { +enum { MV_JOINT_ZERO = 0, /* Zero vector */ MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ -} MV_JOINT_TYPE; +} UENUM1BYTE(MV_JOINT_TYPE); static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) { return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; @@ -47,7 +47,7 @@ static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { /* Symbols for coding magnitude class of nonzero components */ #define MV_CLASSES 11 -typedef enum { +enum { MV_CLASS_0 = 0, /* (0, 2] integer pel */ MV_CLASS_1 = 1, /* (2, 4] integer pel */ MV_CLASS_2 = 2, /* (4, 8] integer pel */ @@ -59,7 +59,7 @@ typedef enum { MV_CLASS_8 = 8, /* (256, 512] integer pel */ MV_CLASS_9 = 9, /* (512, 1024] integer pel */ MV_CLASS_10 = 10, /* (1024,2048] integer pel */ -} MV_CLASS_TYPE; +} UENUM1BYTE(MV_CLASS_TYPE); #define CLASS0_BITS 1 /* bits at integer precision for class 0 */ #define CLASS0_SIZE (1 << CLASS0_BITS) @@ -91,11 +91,11 @@ typedef struct { nmv_component comps[2]; } nmv_context; -typedef enum { +enum { MV_SUBPEL_NONE = -1, MV_SUBPEL_LOW_PRECISION = 0, MV_SUBPEL_HIGH_PRECISION, -} MvSubpelPrecision; +} SENUM1BYTE(MvSubpelPrecision); #ifdef __cplusplus } // extern "C" diff --git a/libaom/av1/common/enums.h b/libaom/av1/common/enums.h index eb17c58..fbacc89 100644 --- a/libaom/av1/common/enums.h +++ b/libaom/av1/common/enums.h @@ -16,6 +16,7 @@ #include "aom/aom_codec.h" #include "aom/aom_integer.h" +#include "aom_ports/mem.h" #ifdef __cplusplus extern "C" { @@ -84,21 +85,12 @@ extern "C" { // Profile 2. 8-bit and 10-bit 4:2:2 // 12-bit 4:0:0, 4:2:2 and 4:4:4 // Since we have three bits for the profiles, it can be extended later. 
-typedef enum BITSTREAM_PROFILE { +enum { PROFILE_0, PROFILE_1, PROFILE_2, MAX_PROFILES, -} BITSTREAM_PROFILE; - -#define LEVEL_MAJOR_BITS 3 -#define LEVEL_MINOR_BITS 2 -#define LEVEL_BITS (LEVEL_MAJOR_BITS + LEVEL_MINOR_BITS) - -#define LEVEL_MAJOR_MIN 2 -#define LEVEL_MAJOR_MAX ((1 << LEVEL_MAJOR_BITS) - 1 + LEVEL_MAJOR_MIN) -#define LEVEL_MINOR_MIN 0 -#define LEVEL_MINOR_MAX ((1 << LEVEL_MINOR_BITS) - 1) +} SENUM1BYTE(BITSTREAM_PROFILE); #define OP_POINTS_CNT_MINUS_1_BITS 5 #define OP_POINTS_IDC_BITS 12 @@ -138,7 +130,7 @@ typedef enum ATTRIBUTE_PACKED { // 4X4, 8X8, 16X16, 32X32, 64X64, 128X128 #define SQR_BLOCK_SIZES 6 -typedef enum ATTRIBUTE_PACKED { +enum { PARTITION_NONE, PARTITION_HORZ, PARTITION_VERT, @@ -152,7 +144,7 @@ typedef enum ATTRIBUTE_PACKED { EXT_PARTITION_TYPES, PARTITION_TYPES = PARTITION_SPLIT + 1, PARTITION_INVALID = 255 -} PARTITION_TYPE; +} UENUM1BYTE(PARTITION_TYPE); typedef char PARTITION_CONTEXT; #define PARTITION_PLOFFSET 4 // number of probability models per block size @@ -160,12 +152,7 @@ typedef char PARTITION_CONTEXT; #define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET) // block transform size -#if defined(_MSC_VER) -typedef uint8_t TX_SIZE; -enum ATTRIBUTE_PACKED { -#else -typedef enum ATTRIBUTE_PACKED { -#endif +enum { TX_4X4, // 4x4 transform TX_8X8, // 8x8 transform TX_16X16, // 16x16 transform @@ -189,11 +176,7 @@ typedef enum ATTRIBUTE_PACKED { TX_SIZES = TX_4X8, // Does NOT include rectangular transforms TX_SIZES_LARGEST = TX_64X64, TX_INVALID = 255 // Invalid transform size -#if defined(_MSC_VER) -}; -#else -} TX_SIZE; -#endif +} UENUM1BYTE(TX_SIZE); #define TX_SIZE_LUMA_MIN (TX_4X4) /* We don't need to code a transform size unless the allowed size is at least @@ -215,7 +198,7 @@ typedef enum ATTRIBUTE_PACKED { #define TX_PAD_HOR 4 // Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability // check. 
-#define TX_PAD_TOP 2 +#define TX_PAD_TOP 0 #define TX_PAD_BOTTOM 4 #define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM) // Pad 16 extra bytes to avoid reading overflow in SIMD optimization. @@ -227,23 +210,23 @@ typedef enum ATTRIBUTE_PACKED { #define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2) // frame transform mode -typedef enum ATTRIBUTE_PACKED { +enum { ONLY_4X4, // use only 4x4 transform TX_MODE_LARGEST, // transform size is the largest possible for pu size TX_MODE_SELECT, // transform specified for each block TX_MODES, -} TX_MODE; +} UENUM1BYTE(TX_MODE); // 1D tx types -typedef enum ATTRIBUTE_PACKED { +enum { DCT_1D, ADST_1D, FLIPADST_1D, IDTX_1D, TX_TYPES_1D, -} TX_TYPE_1D; +} UENUM1BYTE(TX_TYPE_1D); -typedef enum ATTRIBUTE_PACKED { +enum { DCT_DCT, // DCT in both horizontal and vertical ADST_DCT, // ADST in vertical, DCT in horizontal DCT_ADST, // DCT in vertical, ADST in horizontal @@ -261,9 +244,9 @@ typedef enum ATTRIBUTE_PACKED { V_FLIPADST, // FLIPADST in vertical, identity in horizontal H_FLIPADST, // Identity in vertical, FLIPADST in horizontal TX_TYPES, -} TX_TYPE; +} UENUM1BYTE(TX_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { REG_REG, REG_SMOOTH, REG_SHARP, @@ -273,9 +256,9 @@ typedef enum ATTRIBUTE_PACKED { SHARP_REG, SHARP_SMOOTH, SHARP_SHARP, -} DUAL_FILTER_TYPE; +} UENUM1BYTE(DUAL_FILTER_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { // DCT only EXT_TX_SET_DCTONLY, // DCT + Identity only @@ -289,7 +272,7 @@ typedef enum ATTRIBUTE_PACKED { // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6) EXT_TX_SET_ALL16, EXT_TX_SET_TYPES -} TxSetType; +} UENUM1BYTE(TxSetType); #define IS_2D_TRANSFORM(tx_type) (tx_type < IDTX) @@ -297,7 +280,7 @@ typedef enum ATTRIBUTE_PACKED { #define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER #define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA -typedef enum ATTRIBUTE_PACKED { +enum { AOM_LAST_FLAG = 1 << 0, AOM_LAST2_FLAG = 1 << 1, AOM_LAST3_FLAG = 1 
<< 2, @@ -306,19 +289,15 @@ typedef enum ATTRIBUTE_PACKED { AOM_ALT2_FLAG = 1 << 5, AOM_ALT_FLAG = 1 << 6, AOM_REFFRAME_ALL = (1 << 7) - 1 -} AOM_REFFRAME; +} UENUM1BYTE(AOM_REFFRAME); -typedef enum ATTRIBUTE_PACKED { +enum { UNIDIR_COMP_REFERENCE, BIDIR_COMP_REFERENCE, COMP_REFERENCE_TYPES, -} COMP_REFERENCE_TYPE; +} UENUM1BYTE(COMP_REFERENCE_TYPE); -typedef enum ATTRIBUTE_PACKED { - PLANE_TYPE_Y, - PLANE_TYPE_UV, - PLANE_TYPES -} PLANE_TYPE; +enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE); #define CFL_ALPHABET_SIZE_LOG2 4 #define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2) @@ -326,24 +305,20 @@ typedef enum ATTRIBUTE_PACKED { #define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2) #define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1)) -typedef enum ATTRIBUTE_PACKED { - CFL_PRED_U, - CFL_PRED_V, - CFL_PRED_PLANES -} CFL_PRED_TYPE; +enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { CFL_SIGN_ZERO, CFL_SIGN_NEG, CFL_SIGN_POS, CFL_SIGNS -} CFL_SIGN_TYPE; +} UENUM1BYTE(CFL_SIGN_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { CFL_DISALLOWED, CFL_ALLOWED, CFL_ALLOWED_TYPES -} CFL_ALLOWED_TYPE; +} UENUM1BYTE(CFL_ALLOWED_TYPE); // CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid #define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1) @@ -360,12 +335,12 @@ typedef enum ATTRIBUTE_PACKED { #define CFL_CONTEXT_V(js) \ (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS) -typedef enum ATTRIBUTE_PACKED { +enum { PALETTE_MAP, COLOR_MAP_TYPES, -} COLOR_MAP_TYPE; +} UENUM1BYTE(COLOR_MAP_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { TWO_COLORS, THREE_COLORS, FOUR_COLORS, @@ -374,9 +349,9 @@ typedef enum ATTRIBUTE_PACKED { SEVEN_COLORS, EIGHT_COLORS, PALETTE_SIZES -} PALETTE_SIZE; +} UENUM1BYTE(PALETTE_SIZE); -typedef enum ATTRIBUTE_PACKED { +enum { PALETTE_COLOR_ONE, PALETTE_COLOR_TWO, PALETTE_COLOR_THREE, @@ -386,11 +361,11 @@ typedef enum ATTRIBUTE_PACKED { PALETTE_COLOR_SEVEN, 
PALETTE_COLOR_EIGHT, PALETTE_COLORS -} PALETTE_COLOR; +} UENUM1BYTE(PALETTE_COLOR); // Note: All directional predictors must be between V_PRED and D67_PRED (both // inclusive). -typedef enum ATTRIBUTE_PACKED { +enum { DC_PRED, // Average of above and left pixels V_PRED, // Vertical H_PRED, // Horizontal @@ -431,11 +406,11 @@ typedef enum ATTRIBUTE_PACKED { INTER_MODE_END = MB_MODE_COUNT, INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode. INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks -} PREDICTION_MODE; +} UENUM1BYTE(PREDICTION_MODE); // TODO(ltrudeau) Do we really want to pack this? // TODO(ltrudeau) Do we match with PREDICTION_MODE? -typedef enum ATTRIBUTE_PACKED { +enum { UV_DC_PRED, // Average of above and left pixels UV_V_PRED, // Vertical UV_H_PRED, // Horizontal @@ -452,38 +427,71 @@ typedef enum ATTRIBUTE_PACKED { UV_CFL_PRED, // Chroma-from-Luma UV_INTRA_MODES, UV_MODE_INVALID, // For uv_mode in inter blocks -} UV_PREDICTION_MODE; +} UENUM1BYTE(UV_PREDICTION_MODE); -typedef enum ATTRIBUTE_PACKED { +enum { SIMPLE_TRANSLATION, OBMC_CAUSAL, // 2-sided OBMC WARPED_CAUSAL, // 2-sided WARPED MOTION_MODES -} MOTION_MODE; +} UENUM1BYTE(MOTION_MODE); -typedef enum ATTRIBUTE_PACKED { +enum { II_DC_PRED, II_V_PRED, II_H_PRED, II_SMOOTH_PRED, INTERINTRA_MODES -} INTERINTRA_MODE; +} UENUM1BYTE(INTERINTRA_MODE); -typedef enum ATTRIBUTE_PACKED { +enum { COMPOUND_AVERAGE, + COMPOUND_DISTWTD, COMPOUND_WEDGE, COMPOUND_DIFFWTD, COMPOUND_TYPES, -} COMPOUND_TYPE; + MASKED_COMPOUND_TYPES = 2, +} UENUM1BYTE(COMPOUND_TYPE); -typedef enum ATTRIBUTE_PACKED { +enum { FILTER_DC_PRED, FILTER_V_PRED, FILTER_H_PRED, FILTER_D157_PRED, FILTER_PAETH_PRED, FILTER_INTRA_MODES, -} FILTER_INTRA_MODE; +} UENUM1BYTE(FILTER_INTRA_MODE); + +enum { + SEQ_LEVEL_2_0, + SEQ_LEVEL_2_1, + SEQ_LEVEL_2_2, + SEQ_LEVEL_2_3, + SEQ_LEVEL_3_0, + SEQ_LEVEL_3_1, + SEQ_LEVEL_3_2, + SEQ_LEVEL_3_3, + SEQ_LEVEL_4_0, + SEQ_LEVEL_4_1, + SEQ_LEVEL_4_2, + SEQ_LEVEL_4_3, + 
SEQ_LEVEL_5_0, + SEQ_LEVEL_5_1, + SEQ_LEVEL_5_2, + SEQ_LEVEL_5_3, + SEQ_LEVEL_6_0, + SEQ_LEVEL_6_1, + SEQ_LEVEL_6_2, + SEQ_LEVEL_6_3, + SEQ_LEVEL_7_0, + SEQ_LEVEL_7_1, + SEQ_LEVEL_7_2, + SEQ_LEVEL_7_3, + SEQ_LEVELS, + SEQ_LEVEL_MAX = 31 +} UENUM1BYTE(AV1_LEVEL); + +#define LEVEL_BITS 5 #define DIRECTIONAL_MODES 8 #define MAX_ANGLE_DELTA 3 @@ -540,7 +548,7 @@ typedef enum ATTRIBUTE_PACKED { typedef uint8_t TXFM_CONTEXT; // An enum for single reference types (and some derived values). -enum ATTRIBUTE_PACKED { +enum { NONE_FRAME = -1, INTRA_FRAME, LAST_FRAME, @@ -572,14 +580,14 @@ enum ATTRIBUTE_PACKED { #define REF_FRAMES_LOG2 3 // REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new -// frame in cm->new_fb_idx, INTER_REFS_PER_FRAME for scaled references on the -// encoder in the cpi->scaled_ref_idx array. +// frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the +// encoder in the cpi->scaled_ref_buf array. #define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME) #define FWD_RF_OFFSET(ref) (ref - LAST_FRAME) #define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME) -typedef enum ATTRIBUTE_PACKED { +enum { LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME } LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME } LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME } @@ -593,7 +601,7 @@ typedef enum ATTRIBUTE_PACKED { // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs // that are explicitly signaled. UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1, -} UNIDIR_COMP_REF; +} UENUM1BYTE(UNIDIR_COMP_REF); #define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS) @@ -608,14 +616,14 @@ typedef enum ATTRIBUTE_PACKED { // NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not defined as an enum. 
typedef int8_t MV_REFERENCE_FRAME; -typedef enum ATTRIBUTE_PACKED { +enum { RESTORE_NONE, RESTORE_WIENER, RESTORE_SGRPROJ, RESTORE_SWITCHABLE, RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE, RESTORE_TYPES = 4, -} RestorationType; +} UENUM1BYTE(RestorationType); #define SUPERRES_SCALE_BITS 3 #define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1) diff --git a/libaom/av1/common/filter.h b/libaom/av1/common/filter.h index d7ef5c9..184f5b2 100644 --- a/libaom/av1/common/filter.h +++ b/libaom/av1/common/filter.h @@ -37,12 +37,12 @@ typedef enum ATTRIBUTE_PACKED { EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS, } InterpFilter; -typedef enum { +enum { USE_2_TAPS_ORIG = 0, // This is used in temporal filtering. USE_2_TAPS, USE_4_TAPS, USE_8_TAPS, -} SUBPEL_SEARCH_TYPE; +} UENUM1BYTE(SUBPEL_SEARCH_TYPE); // Pack two InterpFilter's into a uint32_t: since there are at most 10 filters, // we can use 16 bits for each and have more than enough space. This reduces diff --git a/libaom/av1/common/idct.c b/libaom/av1/common/idct.c index 55925a5..bff438f 100644 --- a/libaom/av1/common/idct.c +++ b/libaom/av1/common/idct.c @@ -204,7 +204,7 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size, txfm_param->eob = eob; txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id]; txfm_param->bd = xd->bd; - txfm_param->is_hbd = get_bitdepth_data_path_index(xd); + txfm_param->is_hbd = is_cur_buf_hbd(xd); txfm_param->tx_set_type = av1_get_ext_tx_set_type( txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); } diff --git a/libaom/av1/common/mv.h b/libaom/av1/common/mv.h index 5b02251..d097f9e 100644 --- a/libaom/av1/common/mv.h +++ b/libaom/av1/common/mv.h @@ -56,13 +56,13 @@ typedef struct mv32 { #define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS) /* clang-format off */ -typedef enum ATTRIBUTE_PACKED { +enum { IDENTITY = 0, // identity transformation, 0-parameter TRANSLATION = 1, // translational motion 2-parameter 
ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter AFFINE = 3, // affine, 6-parameter TRANS_TYPES, -} TransformationType; +} UENUM1BYTE(TransformationType); /* clang-format on */ // Number of types used for global motion (must be >= 3 and <= TRANS_TYPES) @@ -87,18 +87,18 @@ static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 }; // z . y' = m4 m5 m1 * y // 1] m6 m7 1) 1] typedef struct { - TransformationType wmtype; int32_t wmmat[8]; int16_t alpha, beta, gamma, delta; + TransformationType wmtype; int8_t invalid; } WarpedMotionParams; /* clang-format off */ static const WarpedMotionParams default_warp_params = { - IDENTITY, { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0 }, 0, 0, 0, 0, + IDENTITY, 0, }; /* clang-format on */ @@ -263,7 +263,7 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm, return res; } -static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) { +static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) { if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] && gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) { return ((!gm->wmmat[1] && !gm->wmmat[0]) ? 
IDENTITY : TRANSLATION); diff --git a/libaom/av1/common/mvref_common.c b/libaom/av1/common/mvref_common.c index b3d9c2f..e38891f 100644 --- a/libaom/av1/common/mvref_common.c +++ b/libaom/av1/common/mvref_common.c @@ -347,8 +347,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, if (rf[1] == NONE_FRAME) { int cur_frame_index = cm->cur_frame->order_hint; - const RefCntBuffer *const buf_0 = - cm->current_frame.frame_refs[FWD_RF_OFFSET(rf[0])].buf; + const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]); int frame0_index = buf_0->order_hint; int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info, cur_frame_index, frame0_index); @@ -383,14 +382,12 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd, } else { // Process compound inter mode int cur_frame_index = cm->cur_frame->order_hint; - const RefCntBuffer *const buf_0 = - cm->current_frame.frame_refs[FWD_RF_OFFSET(rf[0])].buf; + const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]); int frame0_index = buf_0->order_hint; int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info, cur_frame_index, frame0_index); - const RefCntBuffer *const buf_1 = - cm->current_frame.frame_refs[FWD_RF_OFFSET(rf[1])].buf; + const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]); int frame1_index = buf_1->order_hint; int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info, cur_frame_index, frame1_index); @@ -824,7 +821,7 @@ void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME rf[2]; av1_set_ref_frame(rf, ref_frame); - if (ref_frame < REF_FRAMES) { + if (global_mvs != NULL && ref_frame < REF_FRAMES) { if (ref_frame != INTRA_FRAME) { global_mvs[ref_frame] = gm_get_motion_vector( &cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize, @@ -871,8 +868,7 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm) { MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; 
++ref_frame) { - const RefCntBuffer *const buf = - cm->current_frame.frame_refs[ref_frame - LAST_FRAME].buf; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); if (buf != NULL) cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint; } @@ -881,8 +877,7 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm) { void av1_setup_frame_sign_bias(AV1_COMMON *cm) { MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const RefCntBuffer *const buf = - cm->current_frame.frame_refs[ref_frame - LAST_FRAME].buf; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) { const int ref_order_hint = buf->order_hint; cm->ref_frame_sign_bias[ref_frame] = @@ -942,13 +937,13 @@ static int motion_field_projection(AV1_COMMON *cm, TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs; int ref_offset[REF_FRAMES] = { 0 }; - (void)dir; - const RefCntBuffer *const start_frame_buf = - cm->current_frame.frame_refs[FWD_RF_OFFSET(start_frame)].buf; + get_ref_frame_buf(cm, start_frame); if (start_frame_buf == NULL) return 0; - if (start_frame_buf->intra_only) return 0; + if (start_frame_buf->frame_type == KEY_FRAME || + start_frame_buf->frame_type == INTRA_ONLY_FRAME) + return 0; if (start_frame_buf->mi_rows != cm->mi_rows || start_frame_buf->mi_cols != cm->mi_cols) @@ -1029,7 +1024,7 @@ void av1_setup_motion_field(AV1_COMMON *cm) { for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { const int ref_idx = ref_frame - LAST_FRAME; - const RefCntBuffer *const buf = cm->current_frame.frame_refs[ref_idx].buf; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); int order_hint = 0; if (buf != NULL) order_hint = buf->order_hint; @@ -1074,8 +1069,7 @@ void av1_setup_motion_field(AV1_COMMON *cm) { ref_stamp >= 0) if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp; - if (ref_stamp >= 0 && ref_buf[LAST2_FRAME - 
LAST_FRAME] != NULL) - if (motion_field_projection(cm, LAST2_FRAME, 2)) --ref_stamp; + if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2); } static INLINE void record_samples(MB_MODE_INFO *mbmi, int *pts, int *pts_inref, @@ -1293,7 +1287,7 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { // Identify the nearest forward and backward references. for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - const RefCntBuffer *const buf = cm->current_frame.frame_refs[i].buf; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); if (buf == NULL) continue; const int ref_order_hint = buf->order_hint; @@ -1328,7 +1322,7 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { // Identify the second nearest forward reference. ref_order_hints[1] = -1; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - const RefCntBuffer *const buf = cm->current_frame.frame_refs[i].buf; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i); if (buf == NULL) continue; const int ref_order_hint = buf->order_hint; @@ -1352,38 +1346,31 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) { } typedef struct { - int map_idx; // frame map index - int buf_idx; // frame buffer index - int sort_idx; // index based on the offset to be used for sorting + int map_idx; // frame map index + RefCntBuffer *buf; // frame buffer + int sort_idx; // index based on the offset to be used for sorting } REF_FRAME_INFO; +// Compares the sort_idx fields. If they are equal, then compares the map_idx +// fields to break the tie. This ensures a stable sort. static int compare_ref_frame_info(const void *arg_a, const void *arg_b) { const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a; const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b; - if (info_a->sort_idx < info_b->sort_idx) return -1; - if (info_a->sort_idx > info_b->sort_idx) return 1; - return (info_a->map_idx < info_b->map_idx) - ? -1 - : ((info_a->map_idx > info_b->map_idx) ? 
1 : 0); + const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx; + if (sort_idx_diff != 0) return sort_idx_diff; + return info_a->map_idx - info_b->map_idx; } -static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx, +static void set_ref_frame_info(int *remapped_ref_idx, int frame_idx, REF_FRAME_INFO *ref_info) { assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME); - const int buf_idx = ref_info->buf_idx; - - cm->current_frame.frame_refs[frame_idx].buf = - &cm->buffer_pool->frame_bufs[buf_idx]; - cm->current_frame.frame_refs[frame_idx].map_idx = ref_info->map_idx; + remapped_ref_idx[frame_idx] = ref_info->map_idx; } -void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, - int gld_map_idx) { - BufferPool *const pool = cm->buffer_pool; - RefCntBuffer *const frame_bufs = pool->frame_bufs; - +void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, + int lst_map_idx, int gld_map_idx) { int lst_frame_sort_idx = -1; int gld_frame_sort_idx = -1; @@ -1402,15 +1389,14 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, ref_frame_info[i].map_idx = map_idx; ref_frame_info[i].sort_idx = -1; - const int buf_idx = cm->ref_frame_map[map_idx]; - ref_frame_info[i].buf_idx = buf_idx; + RefCntBuffer *const buf = cm->ref_frame_map[map_idx]; + ref_frame_info[i].buf = buf; - assert(buf_idx < FRAME_BUFFERS); - if (buf_idx < 0) continue; - // TODO(zoeliu@google.com): To verify the checking on ref_count. - if (frame_bufs[buf_idx].ref_count <= 0) continue; + if (buf == NULL) continue; + // If this assertion fails, there is a reference leak. + assert(buf->ref_count > 0); - const int offset = (int)frame_bufs[buf_idx].order_hint; + const int offset = (int)buf->order_hint; ref_frame_info[i].sort_idx = (offset == -1) ? 
-1 : cur_frame_sort_idx + @@ -1461,7 +1447,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, // == ALTREF_FRAME == if (bwd_start_idx <= bwd_end_idx) { - set_ref_frame_info(cm, ALTREF_FRAME - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME, &ref_frame_info[bwd_end_idx]); ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1; bwd_end_idx--; @@ -1469,7 +1455,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, // == BWDREF_FRAME == if (bwd_start_idx <= bwd_end_idx) { - set_ref_frame_info(cm, BWDREF_FRAME - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME, &ref_frame_info[bwd_start_idx]); ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1; bwd_start_idx++; @@ -1477,7 +1463,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, // == ALTREF2_FRAME == if (bwd_start_idx <= bwd_end_idx) { - set_ref_frame_info(cm, ALTREF2_FRAME - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME, &ref_frame_info[bwd_start_idx]); ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1; } @@ -1487,13 +1473,15 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) { // == LAST_FRAME == if (ref_frame_info[i].map_idx == lst_map_idx) { - set_ref_frame_info(cm, LAST_FRAME - LAST_FRAME, &ref_frame_info[i]); + set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME, + &ref_frame_info[i]); ref_flag_list[LAST_FRAME - LAST_FRAME] = 1; } // == GOLDEN_FRAME == if (ref_frame_info[i].map_idx == gld_map_idx) { - set_ref_frame_info(cm, GOLDEN_FRAME - LAST_FRAME, &ref_frame_info[i]); + set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME, + &ref_frame_info[i]); ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1; } } @@ -1525,7 +1513,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, } if (fwd_start_idx > fwd_end_idx) break; - set_ref_frame_info(cm, ref_frame - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, 
ref_frame - LAST_FRAME, &ref_frame_info[fwd_end_idx]); ref_flag_list[ref_frame - LAST_FRAME] = 1; @@ -1536,7 +1524,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) { const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx]; if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue; - set_ref_frame_info(cm, ref_frame - LAST_FRAME, + set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME, &ref_frame_info[fwd_start_idx]); ref_flag_list[ref_frame - LAST_FRAME] = 1; } diff --git a/libaom/av1/common/mvref_common.h b/libaom/av1/common/mvref_common.h index 2dbd12c..0aa9d38 100644 --- a/libaom/av1/common/mvref_common.h +++ b/libaom/av1/common/mvref_common.h @@ -70,18 +70,6 @@ static INLINE int_mv get_sub_block_pred_mv(const MB_MODE_INFO *candidate, return candidate->mv[which_mv]; } -// Performs mv sign inversion if indicated by the reference frame combination. -static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref, - const MV_REFERENCE_FRAME this_ref_frame, - const int *ref_sign_bias) { - int_mv mv = mbmi->mv[ref]; - if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) { - mv.as_mv.row *= -1; - mv.as_mv.col *= -1; - } - return mv; -} - // Checks that the given mi_row, mi_col and search point // are inside the borders of the tile. 
static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row, @@ -222,7 +210,8 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm); void av1_setup_frame_sign_bias(AV1_COMMON *cm); void av1_setup_skip_mode_allowed(AV1_COMMON *cm); void av1_setup_motion_field(AV1_COMMON *cm); -void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, int gld_map_idx); +void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx, + int lst_map_idx, int gld_map_idx); static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) { av1_zero(xd->neighbors_ref_counts); @@ -255,6 +244,9 @@ void av1_copy_frame_mvs(const AV1_COMMON *const cm, const MB_MODE_INFO *const mi, int mi_row, int mi_col, int x_mis, int y_mis); +// The global_mvs output parameter points to an array of REF_FRAMES elements. +// The caller may pass a null global_mvs if it does not need the global_mvs +// output. void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd, MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame, uint8_t ref_mv_count[MODE_CTX_REF_FRAMES], diff --git a/libaom/av1/common/onyxc_int.h b/libaom/av1/common/onyxc_int.h index 117afb6..8117dfc 100644 --- a/libaom/av1/common/onyxc_int.h +++ b/libaom/av1/common/onyxc_int.h @@ -79,14 +79,14 @@ extern "C" { #define TXCOEFF_TIMER 0 #define TXCOEFF_COST_TIMER 0 -typedef enum { +enum { SINGLE_REFERENCE = 0, COMPOUND_REFERENCE = 1, REFERENCE_MODE_SELECT = 2, REFERENCE_MODES = 3, -} REFERENCE_MODE; +} UENUM1BYTE(REFERENCE_MODE); -typedef enum { +enum { /** * Frame context updates are disabled */ @@ -96,7 +96,7 @@ typedef enum { * updates based on entropy/counts in the decoded frame */ REFRESH_FRAME_CONTEXT_BACKWARD, -} REFRESH_FRAME_CONTEXT_MODE; +} UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE); #define MFMV_STACK_SIZE 3 typedef struct { @@ -109,24 +109,12 @@ typedef struct { MV_REFERENCE_FRAME ref_frame; } MV_REF; -// FIXME(jack.haughton@argondesign.com): This enum was originally in -// encoder/ratectrl.h, and is encoder 
specific. When we move to C++, this -// should go back there and BufferPool should be templatized. -typedef enum { - INTER_NORMAL = 0, - INTER_LOW = 1, - INTER_HIGH = 2, - GF_ARF_LOW = 3, - GF_ARF_STD = 4, - KF_STD = 5, - RATE_FACTOR_LEVELS = 6 -} RATE_FACTOR_LEVEL; typedef struct RefCntBuffer { // For a RefCntBuffer, the following are reference-holding variables: // - cm->ref_frame_map[] - // - cm->new_fb_idx - // - cm->scaled_ref_idx[] (encoder only) + // - cm->cur_frame + // - cm->scaled_ref_buf[] (encoder only) // - cm->next_ref_frame_map[] (decoder only) // - pbi->output_frame_index[] (decoder only) // With that definition, 'ref_count' is the number of reference-holding @@ -136,8 +124,6 @@ typedef struct RefCntBuffer { // - Total 'n' of the variables / array elements above have value 'k' (that // is, they are pointing to buffer at index 'k'). // Then, pool->frame_bufs[k].ref_count = n. - // TODO(david.turner@argondesign.com) Check whether this helpful comment is - // still correct after we finish restructuring int ref_count; unsigned int order_hint; @@ -154,14 +140,17 @@ typedef struct RefCntBuffer { int height; WarpedMotionParams global_motion[REF_FRAMES]; int showable_frame; // frame can be used as show existing frame in future - int film_grain_params_present; + uint8_t film_grain_params_present; aom_film_grain_t film_grain_params; aom_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; hash_table hash_table; - uint8_t intra_only; FRAME_TYPE frame_type; + // This is only used in the encoder but needs to be indexed per ref frame + // so it's extremely convenient to keep it here. 
+ int interp_filter_selected[SWITCHABLE]; + // Inter frame reference frame delta for loop filter int8_t ref_deltas[REF_FRAMES]; @@ -169,7 +158,6 @@ typedef struct RefCntBuffer { int8_t mode_deltas[MAX_MODE_LF_DELTAS]; FRAME_CONTEXT frame_context; - RATE_FACTOR_LEVEL frame_rf_level; } RefCntBuffer; typedef struct BufferPool { @@ -195,18 +183,6 @@ typedef struct BufferPool { } BufferPool; typedef struct { - int base_ctx_table[2 /*row*/][2 /*col*/][3 /*sig_map*/] - [BASE_CONTEXT_POSITION_NUM + 1]; -} LV_MAP_CTX_TABLE; -typedef int BASE_CTX_TABLE[2 /*col*/][3 /*sig_map*/] - [BASE_CONTEXT_POSITION_NUM + 1]; - -typedef struct BitstreamLevel { - uint8_t major; - uint8_t minor; -} BitstreamLevel; - -typedef struct { int cdef_pri_damping; int cdef_sec_damping; int nb_cdef_strengths; @@ -230,11 +206,11 @@ typedef struct { typedef struct { int enable_order_hint; // 0 - disable order hint, and related tools - int order_hint_bits_minus_1; - // jnt_comp, ref_frame_mvs, frame_sign_bias - // if 0, enable_jnt_comp and - // enable_ref_frame_mvs must be set zs 0. - int enable_jnt_comp; // 0 - disable joint compound modes + int order_hint_bits_minus_1; // dist_wtd_comp, ref_frame_mvs, + // frame_sign_bias + // if 0, enable_dist_wtd_comp and + // enable_ref_frame_mvs must be set as 0. + int enable_dist_wtd_comp; // 0 - disable dist-wtd compound modes // 1 - enable it int enable_ref_frame_mvs; // 0 - disable ref frame mvs // 1 - enable it @@ -249,7 +225,7 @@ typedef struct SequenceHeader { int num_bits_height; int max_frame_width; int max_frame_height; - int frame_id_numbers_present_flag; + uint8_t frame_id_numbers_present_flag; int frame_id_length; int delta_frame_id_length; BLOCK_SIZE sb_size; // Size of the superblock used for this frame @@ -258,45 +234,44 @@ typedef struct SequenceHeader { OrderHintInfo order_hint_info; - int force_screen_content_tools; // 0 - force off - // 1 - force on - // 2 - adaptive - int force_integer_mv; // 0 - Not to force. 
MV can be in 1/4 or 1/8 - // 1 - force to integer - // 2 - adaptive - int still_picture; // Video is a single frame still picture - int reduced_still_picture_hdr; // Use reduced header for still picture - int enable_filter_intra; // enables/disables filterintra - int enable_intra_edge_filter; // enables/disables corner/edge/upsampling - int enable_interintra_compound; // enables/disables interintra_compound - int enable_masked_compound; // enables/disables masked compound - int enable_dual_filter; // 0 - disable dual interpolation filter - // 1 - enable vert/horiz filter selection - int enable_warped_motion; // 0 - disable warped motion for sequence - // 1 - enable it for the sequence - int enable_superres; // 0 - Disable superres for the sequence, and disable - // transmitting per-frame superres enabled flag. - // 1 - Enable superres for the sequence, and also - // enable per-frame flag to denote if superres is - // enabled for that frame. - int enable_cdef; // To turn on/off CDEF - int enable_restoration; // To turn on/off loop restoration + uint8_t force_screen_content_tools; // 0 - force off + // 1 - force on + // 2 - adaptive + uint8_t still_picture; // Video is a single frame still picture + uint8_t reduced_still_picture_hdr; // Use reduced header for still picture + uint8_t force_integer_mv; // 0 - Don't force. 
MV can use subpel + // 1 - force to integer + // 2 - adaptive + uint8_t enable_filter_intra; // enables/disables filterintra + uint8_t enable_intra_edge_filter; // enables/disables edge upsampling + uint8_t enable_interintra_compound; // enables/disables interintra_compound + uint8_t enable_masked_compound; // enables/disables masked compound + uint8_t enable_dual_filter; // 0 - disable dual interpolation filter + // 1 - enable vert/horz filter selection + uint8_t enable_warped_motion; // 0 - disable warp for the sequence + // 1 - enable warp for the sequence + uint8_t enable_superres; // 0 - Disable superres for the sequence + // and no frame level superres flag + // 1 - Enable superres for the sequence + // enable per-frame superres flag + uint8_t enable_cdef; // To turn on/off CDEF + uint8_t enable_restoration; // To turn on/off loop restoration BITSTREAM_PROFILE profile; // Operating point info. int operating_points_cnt_minus_1; int operating_point_idc[MAX_NUM_OPERATING_POINTS]; - int display_model_info_present_flag; - int decoder_model_info_present_flag; - BitstreamLevel level[MAX_NUM_OPERATING_POINTS]; + uint8_t display_model_info_present_flag; + uint8_t decoder_model_info_present_flag; + AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS]; uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in the spec. One bit: 0 // or 1. // Color config. aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1, // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3. - int use_highbitdepth; // If true, we need to use 16bit frame buffers. - int monochrome; // Monochorme video + uint8_t use_highbitdepth; // If true, we need to use 16bit frame buffers. 
+ uint8_t monochrome; // Monochorme video aom_color_primaries_t color_primaries; aom_transfer_characteristics_t transfer_characteristics; aom_matrix_coefficients_t matrix_coefficients; @@ -304,9 +279,8 @@ typedef struct SequenceHeader { int subsampling_x; // Chroma subsampling for x int subsampling_y; // Chroma subsampling for y aom_chroma_sample_position_t chroma_sample_position; - int separate_uv_delta_q; - - int film_grain_params_present; + uint8_t separate_uv_delta_q; + uint8_t film_grain_params_present; } SequenceHeader; typedef struct { @@ -318,16 +292,13 @@ typedef struct { typedef struct { FRAME_TYPE frame_type; - // Flag signaling that the frame is encoded using only INTRA modes. - uint8_t intra_only; REFERENCE_MODE reference_mode; unsigned int order_hint; unsigned int frame_number; SkipModeInfo skip_mode_info; - // Each Inter frame can reference INTER_REFS_PER_FRAME buffers. This maps each - // (inter) reference frame type to the corresponding reference buffer. - RefBuffer frame_refs[INTER_REFS_PER_FRAME]; + int refresh_frame_flags; // Which ref frames are overwritten by this frame + int frame_refs_short_signaling; } CurrentFrame; typedef struct AV1Common { @@ -337,8 +308,6 @@ typedef struct AV1Common { int height; int render_width; int render_height; - int last_width; - int last_height; int timing_info_present; aom_timing_info_t timing_info; int buffer_removal_time_present; @@ -347,49 +316,59 @@ typedef struct AV1Common { aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1]; uint32_t frame_presentation_time; - int largest_tile_id; - size_t largest_tile_size; int context_update_tile_id; // Scale of the current frame with respect to itself. struct scale_factors sf_identity; - YV12_BUFFER_CONFIG *frame_to_show; RefCntBuffer *prev_frame; // TODO(hkuang): Combine this with cur_buf in macroblockd. 
RefCntBuffer *cur_frame; - // For decoder, ref_frame_map[i] maps reference type 'i' to actual index of - // the buffer in the buffer pool ‘cm->buffer_pool.frame_bufs’. + // For encoder, we have a two-level mapping from reference frame type to the + // corresponding buffer in the buffer pool: + // * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ... + // EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1) + // * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to + // the reference counted buffer structure RefCntBuffer, taken from the buffer + // pool cm->buffer_pool->frame_bufs. + // + // LAST_FRAME, ..., EXTREF_FRAME + // | | + // v v + // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1] + // | | + // v v + // ref_frame_map[], ..., ref_frame_map[] + // + // Note: INTRA_FRAME always refers to the current frame, so there's no need to + // have a remapped index for the same. + int remapped_ref_idx[REF_FRAMES]; + + struct scale_factors ref_scale_factors[REF_FRAMES]; + + // For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to + // the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. // For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps // remapped reference index 'j' (that is, original reference type 'i') to - // actual index of the buffer in the buffer pool ‘cm->buffer_pool.frame_bufs’. - int ref_frame_map[REF_FRAMES]; + // a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'. + RefCntBuffer *ref_frame_map[REF_FRAMES]; // Prepare ref_frame_map for the next frame. // Only used in frame parallel decode. - int next_ref_frame_map[REF_FRAMES]; - - // Index to the 'new' frame (i.e. the frame currently being encoded or - // decoded) in the buffer pool 'cm->buffer_pool'. 
- int new_fb_idx; - + RefCntBuffer *next_ref_frame_map[REF_FRAMES]; FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/ int show_frame; int showable_frame; // frame can be used as show existing frame in future int show_existing_frame; - // Flag for a frame used as a reference - not written to the bitstream - int is_reference_frame; - int reset_decoder_state; - uint8_t last_intra_only; uint8_t disable_cdf_update; int allow_high_precision_mv; - int cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer + uint8_t cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer - int allow_screen_content_tools; + uint8_t allow_screen_content_tools; int allow_intrabc; int allow_warped_motion; @@ -437,6 +416,7 @@ typedef struct AV1Common { int qm_v; int min_qmlevel; int max_qmlevel; + int use_quant_b_adapt; /* We allocate a MB_MODE_INFO struct for each macroblock, together with an extra row on top and column on the left to simplify prediction. */ @@ -465,8 +445,6 @@ typedef struct AV1Common { int allow_ref_frame_mvs; uint8_t *last_frame_seg_map; - uint8_t *current_frame_seg_map; - int seg_map_alloc_size; InterpFilter interp_filter; @@ -505,17 +483,11 @@ typedef struct AV1Common { FRAME_CONTEXT *fc; /* this frame entropy */ FRAME_CONTEXT *default_frame_context; - unsigned int frame_context_idx; /* Context to use/update */ - int fb_of_context_type[REF_FRAMES]; int primary_ref_frame; - aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer - int error_resilient_mode; - int force_primary_ref_none; int tile_cols, tile_rows; - int last_tile_cols, last_tile_rows; int max_tile_width_sb; int min_log2_tile_cols; @@ -530,6 +502,7 @@ typedef struct AV1Common { int tile_col_start_sb[MAX_TILE_COLS + 1]; // valid for 0 <= i <= tile_cols int tile_row_start_sb[MAX_TILE_ROWS + 1]; // valid for 0 <= i <= tile_rows int tile_width, tile_height; // In MI units + int min_inner_tile_width; // min width of non-rightmost tile unsigned int 
large_scale_tile; unsigned int single_tile_decoding; @@ -555,8 +528,6 @@ typedef struct AV1Common { int current_frame_id; int ref_frame_id[REF_FRAMES]; int valid_for_referencing[REF_FRAMES]; - int invalid_delta_frame_id_minus_1; - LV_MAP_CTX_TABLE coeff_ctx_table; TPL_MV_REF *tpl_mvs; int tpl_mvs_mem_size; // TODO(jingning): This can be combined with sign_bias later. @@ -564,7 +535,6 @@ typedef struct AV1Common { int is_annexb; - int frame_refs_short_signaling; int temporal_layer_id; int spatial_layer_id; unsigned int number_temporal_layers; @@ -608,9 +578,8 @@ static void unlock_buffer_pool(BufferPool *const pool) { static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; - if (cm->ref_frame_map[index] < 0) return NULL; - assert(cm->ref_frame_map[index] < FRAME_BUFFERS); - return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf; + if (cm->ref_frame_map[index] == NULL) return NULL; + return &cm->ref_frame_map[index]->buf; } static INLINE int get_free_fb(AV1_COMMON *cm) { @@ -646,38 +615,83 @@ static INLINE int get_free_fb(AV1_COMMON *cm) { return i; } -// Modify 'idx_ptr' to reference the buffer at 'new_idx', and update the ref +static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) { + // Release the previously-used frame-buffer + if (cm->cur_frame != NULL) { + --cm->cur_frame->ref_count; + cm->cur_frame = NULL; + } + + // Assign a new framebuffer + const int new_fb_idx = get_free_fb(cm); + if (new_fb_idx == INVALID_IDX) return NULL; + + cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx]; + cm->cur_frame->buf.buf_8bit_valid = 0; + av1_zero(cm->cur_frame->interp_filter_selected); + return cm->cur_frame; +} + +// Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref // counts accordingly. 
-static INLINE void assign_frame_buffer(RefCntBuffer *bufs, int *idx_ptr, - int new_idx) { - const int old_idx = *idx_ptr; - if (old_idx >= 0) { - assert(bufs[old_idx].ref_count > 0); - // One less reference to the buffer at 'old_idx', so decrease ref count. - --bufs[old_idx].ref_count; +static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr, + RefCntBuffer *rhs_ptr) { + RefCntBuffer *const old_ptr = *lhs_ptr; + if (old_ptr != NULL) { + assert(old_ptr->ref_count > 0); + // One less reference to the buffer at 'old_ptr', so decrease ref count. + --old_ptr->ref_count; } - *idx_ptr = new_idx; - // One more reference to the buffer at 'new_idx', so increase ref count. - ++bufs[new_idx].ref_count; + *lhs_ptr = rhs_ptr; + // One more reference to the buffer at 'rhs_ptr', so increase ref count. + ++rhs_ptr->ref_count; } static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) { return cm->current_frame.frame_type == KEY_FRAME || - cm->current_frame.intra_only; + cm->current_frame.frame_type == INTRA_ONLY_FRAME; } static INLINE int frame_is_sframe(const AV1_COMMON *cm) { return cm->current_frame.frame_type == S_FRAME; } -static INLINE RefCntBuffer *get_prev_frame(const AV1_COMMON *const cm) { - if (cm->primary_ref_frame == PRIMARY_REF_NONE || - cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) { - return NULL; - } else { - return cm->current_frame.frame_refs[cm->primary_ref_frame].buf; - } +// These functions take a reference frame label between LAST_FRAME and +// EXTREF_FRAME inclusive. Note that this is different to the indexing +// previously used by the frame_refs[] array. +static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm, + const MV_REFERENCE_FRAME ref_frame) { + return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME) + ? 
cm->remapped_ref_idx[ref_frame - LAST_FRAME] + : INVALID_IDX; +} + +static INLINE RefCntBuffer *get_ref_frame_buf( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; +} + +// Both const and non-const versions of this function are provided so that it +// can be used with a const AV1_COMMON if needed. +static INLINE const struct scale_factors *get_ref_scale_factors_const( + const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE struct scale_factors *get_ref_scale_factors( + AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL; +} + +static INLINE RefCntBuffer *get_primary_ref_frame_buf( + const AV1_COMMON *const cm) { + if (cm->primary_ref_frame == PRIMARY_REF_NONE) return NULL; + const int map_idx = get_ref_frame_map_idx(cm, cm->primary_ref_frame + 1); + return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL; } // Returns 1 if this frame might allow mvs from some reference frame. 
@@ -1233,8 +1247,8 @@ static INLINE TX_SIZE get_tx_size(int width, int height) { return TX_4X4; } -static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx, - TXFM_CONTEXT *left_ctx, +static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx, + const TXFM_CONTEXT *const left_ctx, BLOCK_SIZE bsize, TX_SIZE tx_size) { const uint8_t txw = tx_size_wide[tx_size]; const uint8_t txh = tx_size_high[tx_size]; @@ -1358,17 +1372,8 @@ static INLINE int is_coded_lossless(const AV1_COMMON *cm, return coded_lossless; } -static INLINE int is_valid_seq_level_idx(uint8_t seq_level_idx) { - return seq_level_idx < 24 || seq_level_idx == 31; -} - -static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) { - assert(bl.major >= LEVEL_MAJOR_MIN && bl.major <= LEVEL_MAJOR_MAX); - // Since bl.minor is unsigned a comparison will return a warning: - // comparison is always true due to limited range of data type - assert(LEVEL_MINOR_MIN == 0); - assert(bl.minor <= LEVEL_MINOR_MAX); - return ((bl.major - LEVEL_MAJOR_MIN) << LEVEL_MINOR_BITS) + bl.minor; +static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) { + return seq_level_idx < SEQ_LEVELS || seq_level_idx == SEQ_LEVEL_MAX; } #ifdef __cplusplus diff --git a/libaom/av1/common/pred_common.h b/libaom/av1/common/pred_common.h index f667057..d9b30a9 100644 --- a/libaom/av1/common/pred_common.h +++ b/libaom/av1/common/pred_common.h @@ -48,20 +48,24 @@ static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm, int prev_l = -1; // left segment_id int prev_u = -1; // top segment_id if ((xd->up_available) && (xd->left_available)) { - prev_ul = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, - mi_row - 1, mi_col - 1); + prev_ul = get_segment_id(cm, cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 1, + mi_col - 1); } if (xd->up_available) { - prev_u = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, - mi_row - 1, mi_col - 0); + prev_u = get_segment_id(cm, 
cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 1, + mi_col - 0); } if (xd->left_available) { - prev_l = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4, - mi_row - 0, mi_col - 1); + prev_l = get_segment_id(cm, cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 0, + mi_col - 1); } + // This property follows from the fact that get_segment_id() returns a + // nonnegative value. This allows us to test for all edge cases with a simple + // prev_ul < 0 check. + assert(IMPLIES(prev_ul >= 0, prev_u >= 0 && prev_l >= 0)); // Pick CDF index based on number of matching/out-of-bounds segment IDs. - if (prev_ul < 0 || prev_u < 0 || prev_l < 0) /* Edge case */ + if (prev_ul < 0) /* Edge cases */ *cdf_index = 0; else if ((prev_ul == prev_u) && (prev_ul == prev_l)) *cdf_index = 2; @@ -90,10 +94,8 @@ static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) { static INLINE int get_comp_index_context(const AV1_COMMON *cm, const MACROBLOCKD *xd) { MB_MODE_INFO *mbmi = xd->mi[0]; - const RefCntBuffer *const bck_buf = - cm->current_frame.frame_refs[mbmi->ref_frame[0] - LAST_FRAME].buf; - const RefCntBuffer *const fwd_buf = - cm->current_frame.frame_refs[mbmi->ref_frame[1] - LAST_FRAME].buf; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); int bck_frame_index = 0, fwd_frame_index = 0; int cur_frame_index = cm->cur_frame->order_hint; diff --git a/libaom/av1/common/reconinter.c b/libaom/av1/common/reconinter.c index f338e1b..ea351cf 100644 --- a/libaom/av1/common/reconinter.c +++ b/libaom/av1/common/reconinter.c @@ -84,12 +84,11 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, if (do_warp && xd->cur_frame_force_integer_mv == 0) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const struct buf_2d *const pre_buf = &pd->pre[ref]; - av1_warp_plane(&final_warp_params, - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, + 
av1_warp_plane(&final_warp_params, is_cur_buf_hbd(xd), xd->bd, pre_buf->buf0, pre_buf->width, pre_buf->height, pre_buf->stride, dst, p_col, p_row, w, h, dst_stride, pd->subsampling_x, pd->subsampling_y, conv_params); - } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + } else if (is_cur_buf_hbd(xd)) { highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h, conv_params, interp_filters, is_intrabc, xd->bd); @@ -568,14 +567,15 @@ static void build_masked_compound_no_round( const int subh = (2 << mi_size_high_log2[sb_type]) == h; const int subw = (2 << mi_size_wide_log2[sb_type]) == w; const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + if (is_cur_buf_hbd(xd)) { aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, block_size_wide[sb_type], w, h, subw, subh, conv_params, xd->bd); - else + } else { aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, block_size_wide[sb_type], w, h, subw, subh, conv_params); + } } void av1_make_masked_inter_predictor( @@ -626,20 +626,20 @@ void av1_make_masked_inter_predictor( mi->sb_type, h, w, conv_params, xd); } -void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, - int order_idx, int *fwd_offset, int *bck_offset, - int *use_jnt_comp_avg, int is_compound) { +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int order_idx, + int *fwd_offset, int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound) { assert(fwd_offset != NULL && bck_offset != NULL); if (!is_compound || mbmi->compound_idx) { - *use_jnt_comp_avg = 0; + *use_dist_wtd_comp_avg = 0; return; } - *use_jnt_comp_avg = 1; - const RefCntBuffer *const bck_buf = - cm->current_frame.frame_refs[mbmi->ref_frame[0] - LAST_FRAME].buf; - const RefCntBuffer *const fwd_buf = - cm->current_frame.frame_refs[mbmi->ref_frame[1] - 
LAST_FRAME].buf; + *use_dist_wtd_comp_avg = 1; + const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]); + const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]); const int cur_frame_index = cm->cur_frame->order_hint; int bck_frame_index = 0, fwd_frame_index = 0; @@ -800,53 +800,6 @@ void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) { return; } -struct obmc_check_mv_field_ctxt { - MB_MODE_INFO *current_mi; - int mv_field_check_result; -}; - -static INLINE void obmc_check_identical_mv(MACROBLOCKD *xd, int rel_mi_col, - uint8_t nb_mi_width, - MB_MODE_INFO *nb_mi, void *fun_ctxt, - const int num_planes) { - (void)xd; - (void)rel_mi_col; - (void)nb_mi_width; - (void)num_planes; - struct obmc_check_mv_field_ctxt *ctxt = - (struct obmc_check_mv_field_ctxt *)fun_ctxt; - const MB_MODE_INFO *current_mi = ctxt->current_mi; - - if (ctxt->mv_field_check_result == 0) return; - - if (nb_mi->ref_frame[0] != current_mi->ref_frame[0] || - nb_mi->mv[0].as_int != current_mi->mv[0].as_int || - nb_mi->interp_filters != current_mi->interp_filters) { - ctxt->mv_field_check_result = 0; - } - return; -} - -// Check if the neighbors' motions used by obmc have same parameters as for -// the current block. If all the parameters are identical, obmc will produce -// the same prediction as from regular bmc, therefore we can skip the -// overlapping operations for less complexity. The parameters checked include -// reference frame, motion vector, and interpolation filter. 
-int av1_check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col) { - const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - struct obmc_check_mv_field_ctxt mv_field_check_ctxt = { xd->mi[0], 1 }; - - foreach_overlappable_nb_above(cm, xd, mi_col, - max_neighbor_obmc[mi_size_wide_log2[bsize]], - obmc_check_identical_mv, &mv_field_check_ctxt); - foreach_overlappable_nb_left(cm, xd, mi_row, - max_neighbor_obmc[mi_size_high_log2[bsize]], - obmc_check_identical_mv, &mv_field_check_ctxt); - - return mv_field_check_ctxt.mv_field_check_result; -} - struct obmc_inter_pred_ctxt { uint8_t **adjacent; int *adjacent_stride; @@ -860,7 +813,7 @@ static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col, (void)above_mi; struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt; const BLOCK_SIZE bsize = xd->mi[0]->sb_type; - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; + const int is_hbd = is_cur_buf_hbd(xd); const int overlap = AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1; @@ -897,7 +850,7 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row, const BLOCK_SIZE bsize = xd->mi[0]->sb_type; const int overlap = AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1; - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
1 : 0; + const int is_hbd = is_cur_buf_hbd(xd); for (int plane = 0; plane < num_planes; ++plane) { const struct macroblockd_plane *pd = &xd->plane[plane]; @@ -968,15 +921,15 @@ void av1_setup_build_prediction_by_above_pred( for (int ref = 0; ref < num_refs; ++ref) { const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref]; - const RefBuffer *const ref_buf = - &ctxt->cm->current_frame.frame_refs[frame - LAST_FRAME]; - - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const sf = + get_ref_scale_factors_const(ctxt->cm, frame); + xd->block_ref_scale_factors[ref] = sf; + if ((!av1_is_valid_scale(sf))) aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, &ref_buf->buf->buf, ctxt->mi_row, - above_mi_col, &ref_buf->sf, num_planes); + av1_setup_pre_planes(xd, ref, &ref_buf->buf, ctxt->mi_row, above_mi_col, sf, + num_planes); } xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col); @@ -1006,15 +959,16 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row, for (int ref = 0; ref < num_refs; ++ref) { const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref]; - const RefBuffer *const ref_buf = - &ctxt->cm->current_frame.frame_refs[frame - LAST_FRAME]; + const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame); + const struct scale_factors *const ref_scale_factors = + get_ref_scale_factors_const(ctxt->cm, frame); - xd->block_refs[ref] = ref_buf; - if ((!av1_is_valid_scale(&ref_buf->sf))) + xd->block_ref_scale_factors[ref] = ref_scale_factors; + if ((!av1_is_valid_scale(ref_scale_factors))) aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); - av1_setup_pre_planes(xd, ref, &ref_buf->buf->buf, left_mi_row, ctxt->mi_col, - &ref_buf->sf, num_planes); + av1_setup_pre_planes(xd, 
ref, &ref_buf->buf, left_mi_row, ctxt->mi_col, + ref_scale_factors, num_planes); } xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row); @@ -1081,12 +1035,13 @@ static void build_smooth_interintra_mask(uint8_t *mask, int stride, } } -static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra, - int wedge_index, int wedge_sign, - BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, - uint8_t *comppred, int compstride, - const uint8_t *interpred, int interstride, - const uint8_t *intrapred, int intrastride) { +static void combine_interintra(INTERINTRA_MODE mode, + int8_t use_wedge_interintra, int wedge_index, + int wedge_sign, BLOCK_SIZE bsize, + BLOCK_SIZE plane_bsize, uint8_t *comppred, + int compstride, const uint8_t *interpred, + int interstride, const uint8_t *intrapred, + int intrastride) { const int bw = block_size_wide[plane_bsize]; const int bh = block_size_high[plane_bsize]; @@ -1110,7 +1065,7 @@ static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra, } static void combine_interintra_highbd( - INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index, + INTERINTRA_MODE mode, int8_t use_wedge_interintra, int wedge_index, int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize, uint8_t *comppred8, int compstride, const uint8_t *interpred8, int interstride, const uint8_t *intrapred8, int intrastride, int bd) { @@ -1140,8 +1095,8 @@ static void combine_interintra_highbd( void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - BUFFER_SET *ctx, uint8_t *dst, - int dst_stride) { + const BUFFER_SET *ctx, + uint8_t *dst, int dst_stride) { struct macroblockd_plane *const pd = &xd->plane[plane]; const int ssx = xd->plane[plane].subsampling_x; const int ssy = xd->plane[plane].subsampling_y; @@ -1164,7 +1119,7 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const int ssx = xd->plane[plane].subsampling_x; const int ssy = 
xd->plane[plane].subsampling_y; const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { combine_interintra_highbd( xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra, xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign, @@ -1183,9 +1138,9 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, // build interintra_predictors for one plane void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *pred, int stride, - BUFFER_SET *ctx, int plane, + const BUFFER_SET *ctx, int plane, BLOCK_SIZE bsize) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]); av1_build_intra_predictors_for_interintra( cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor), @@ -1204,7 +1159,8 @@ void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *upred, uint8_t *vpred, int ustride, int vstride, - BUFFER_SET *ctx, BLOCK_SIZE bsize) { + const BUFFER_SET *ctx, + BLOCK_SIZE bsize) { av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize); av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize); } diff --git a/libaom/av1/common/reconinter.h b/libaom/av1/common/reconinter.h index b773679..9d562f9 100644 --- a/libaom/av1/common/reconinter.h +++ b/libaom/av1/common/reconinter.h @@ -47,7 +47,7 @@ extern "C" { #define WEDGE_NONE -1 // Angles are with respect to horizontal anti-clockwise -typedef enum { +enum { WEDGE_HORIZONTAL = 0, WEDGE_VERTICAL = 1, WEDGE_OBLIQUE27 = 2, @@ -55,7 +55,7 @@ typedef enum { WEDGE_OBLIQUE117 = 4, WEDGE_OBLIQUE153 = 5, WEDGE_DIRECTIONS -} WedgeDirectionType; +} UENUM1BYTE(WedgeDirectionType); // 3-tuple: {direction, x_offset, y_offset} typedef struct { @@ 
-161,14 +161,13 @@ static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride, void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi); int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize, const struct macroblockd_plane *pd, int dir); -int av1_check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col); static INLINE int is_interinter_compound_used(COMPOUND_TYPE type, BLOCK_SIZE sb_type) { const int comp_allowed = is_comp_ref_allowed(sb_type); switch (type) { case COMPOUND_AVERAGE: + case COMPOUND_DISTWTD: case COMPOUND_DIFFWTD: return comp_allowed; case COMPOUND_WEDGE: return comp_allowed && wedge_params_lookup[sb_type].bits > 0; @@ -247,13 +246,14 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, return clamped_mv; } -static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *sf) { +static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, + int stride, + const struct scale_factors *sf) { const int x = sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS : x_offset; const int y = sf ? 
sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS : y_offset; - return y * stride + x; + return (int64_t)y * stride + x; } static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, @@ -335,25 +335,28 @@ const uint8_t *av1_get_compound_type_mask( // build interintra_predictors for one plane void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *pred, int stride, - BUFFER_SET *ctx, int plane, + const BUFFER_SET *ctx, int plane, BLOCK_SIZE bsize); void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, uint8_t *upred, uint8_t *vpred, int ustride, int vstride, - BUFFER_SET *ctx, BLOCK_SIZE bsize); + const BUFFER_SET *ctx, + BLOCK_SIZE bsize); void av1_build_intra_predictors_for_interintra( const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, - BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride); + const BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride); void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane, const uint8_t *inter_pred, int inter_stride, const uint8_t *intra_pred, int intra_stride); -void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi, - int order_idx, int *fwd_offset, int *bck_offset, - int *use_jnt_comp_avg, int is_compound); +void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm, + const MB_MODE_INFO *mbmi, int order_idx, + int *fwd_offset, int *bck_offset, + int *use_dist_wtd_comp_avg, + int is_compound); int av1_allow_warp(const MB_MODE_INFO *const mbmi, const WarpTypesAllowed *const warp_types, const WarpedMotionParams *const gm_params, diff --git a/libaom/av1/common/reconintra.c b/libaom/av1/common/reconintra.c index df69d6b..559e499 100644 --- a/libaom/av1/common/reconintra.c +++ b/libaom/av1/common/reconintra.c @@ -1510,7 +1510,7 @@ void av1_predict_intra_block( xd->color_index_map_offset[plane != 0]; const uint16_t *const palette = mbmi->palette_mode_info.palette_colors + plane * 
PALETTE_MAX_SIZE; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (r = 0; r < txhpx; ++r) { for (c = 0; c < txwpx; ++c) { @@ -1569,7 +1569,7 @@ void av1_predict_intra_block( tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y); const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { build_intra_predictors_high( xd, ref, ref_stride, dst, dst_stride, mode, angle_delta, filter_intra_mode, tx_size, disable_edge_filter, diff --git a/libaom/av1/common/restoration.c b/libaom/av1/common/restoration.c index c62862b..9e472b8 100644 --- a/libaom/av1/common/restoration.c +++ b/libaom/av1/common/restoration.c @@ -1099,7 +1099,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt, const int frame_height = frame->crop_heights[0]; if (aom_realloc_frame_buffer( lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, - seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS, + seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, cm->byte_alignment, NULL, NULL, NULL) < 0) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate restoration dst buffer"); diff --git a/libaom/av1/common/restoration.h b/libaom/av1/common/restoration.h index d834f92..6d6ba37 100644 --- a/libaom/av1/common/restoration.h +++ b/libaom/av1/common/restoration.h @@ -22,6 +22,8 @@ extern "C" { #endif +// Border for Loop restoration buffer +#define AOM_RESTORATION_FRAME_BORDER 32 #define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x)) #define RINT(x) ((x) < 0 ? 
(int)((x)-0.5) : (int)((x) + 0.5)) diff --git a/libaom/av1/common/scale.c b/libaom/av1/common/scale.c index c525fe2..bac7bd9 100644 --- a/libaom/av1/common/scale.c +++ b/libaom/av1/common/scale.c @@ -97,13 +97,13 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, // subpel_x_q4 != 0 && subpel_y_q4 != 0 sf->convolve[1][1][0] = av1_convolve_2d_sr; // subpel_x_q4 == 0 && subpel_y_q4 == 0 - sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy; + sf->convolve[0][0][1] = av1_dist_wtd_convolve_2d_copy; // subpel_x_q4 == 0 - sf->convolve[0][1][1] = av1_jnt_convolve_y; + sf->convolve[0][1][1] = av1_dist_wtd_convolve_y; // subpel_y_q4 == 0 - sf->convolve[1][0][1] = av1_jnt_convolve_x; + sf->convolve[1][0][1] = av1_dist_wtd_convolve_x; // subpel_x_q4 != 0 && subpel_y_q4 != 0 - sf->convolve[1][1][1] = av1_jnt_convolve_2d; + sf->convolve[1][1][1] = av1_dist_wtd_convolve_2d; // AV1 High BD convolve functions // Special case convolve functions should produce the same result as // av1_highbd_convolve_2d. 
@@ -116,11 +116,11 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, // subpel_x_q4 != 0 && subpel_y_q4 != 0 sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr; // subpel_x_q4 == 0 && subpel_y_q4 == 0 - sf->highbd_convolve[0][0][1] = av1_highbd_jnt_convolve_2d_copy; + sf->highbd_convolve[0][0][1] = av1_highbd_dist_wtd_convolve_2d_copy; // subpel_x_q4 == 0 - sf->highbd_convolve[0][1][1] = av1_highbd_jnt_convolve_y; + sf->highbd_convolve[0][1][1] = av1_highbd_dist_wtd_convolve_y; // subpel_y_q4 == 0 - sf->highbd_convolve[1][0][1] = av1_highbd_jnt_convolve_x; + sf->highbd_convolve[1][0][1] = av1_highbd_dist_wtd_convolve_x; // subpel_x_q4 != 0 && subpel_y_q4 != 0 - sf->highbd_convolve[1][1][1] = av1_highbd_jnt_convolve_2d; + sf->highbd_convolve[1][1][1] = av1_highbd_dist_wtd_convolve_2d; } diff --git a/libaom/av1/common/scan.h b/libaom/av1/common/scan.h index 233dc0e..f9c3392 100644 --- a/libaom/av1/common/scan.h +++ b/libaom/av1/common/scan.h @@ -25,14 +25,14 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef enum SCAN_MODE { +enum { SCAN_MODE_ZIG_ZAG, SCAN_MODE_COL_DIAG, SCAN_MODE_ROW_DIAG, SCAN_MODE_COL_1D, SCAN_MODE_ROW_1D, SCAN_MODES -} SCAN_MODE; +} UENUM1BYTE(SCAN_MODE); extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES]; extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES]; diff --git a/libaom/av1/common/seg_common.h b/libaom/av1/common/seg_common.h index 8c35bba..fa7894c 100644 --- a/libaom/av1/common/seg_common.h +++ b/libaom/av1/common/seg_common.h @@ -24,7 +24,7 @@ extern "C" { #define SEG_TEMPORAL_PRED_CTXS 3 #define SPATIAL_PREDICTION_PROBS 3 -typedef enum { +enum { SEG_LVL_ALT_Q, // Use alternate Quantizer .... 
SEG_LVL_ALT_LF_Y_V, // Use alternate loop filter value on y plane vertical SEG_LVL_ALT_LF_Y_H, // Use alternate loop filter value on y plane horizontal @@ -34,7 +34,7 @@ typedef enum { SEG_LVL_SKIP, // Optional Segment (0,0) + skip mode SEG_LVL_GLOBALMV, SEG_LVL_MAX -} SEG_LVL_FEATURES; +} UENUM1BYTE(SEG_LVL_FEATURES); struct segmentation { uint8_t enabled; diff --git a/libaom/av1/common/tile_common.c b/libaom/av1/common/tile_common.c index 1b41348..02f50f5 100644 --- a/libaom/av1/common/tile_common.c +++ b/libaom/av1/common/tile_common.c @@ -51,6 +51,10 @@ void av1_calculate_tile_cols(AV1_COMMON *const cm) { int sb_rows = mi_rows >> cm->seq_params.mib_size_log2; int i; + // This will be overridden if there is at least two columns of tiles + // (otherwise there is no inner tile width) + cm->min_inner_tile_width = -1; + if (cm->uniform_tile_spacing_flag) { int start_sb; int size_sb = ALIGN_POWER_OF_TWO(sb_cols, cm->log2_tile_cols); @@ -67,18 +71,29 @@ void av1_calculate_tile_cols(AV1_COMMON *const cm) { cm->tile_width = size_sb << cm->seq_params.mib_size_log2; cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols); + if (cm->tile_cols > 1) { + cm->min_inner_tile_width = cm->tile_width; + } } else { int max_tile_area_sb = (sb_rows * sb_cols); int widest_tile_sb = 1; + int narrowest_inner_tile_sb = 65536; cm->log2_tile_cols = tile_log2(1, cm->tile_cols); for (i = 0; i < cm->tile_cols; i++) { int size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i]; widest_tile_sb = AOMMAX(widest_tile_sb, size_sb); + // ignore the rightmost tile in frame for determining the narrowest + if (i < cm->tile_cols - 1) + narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb); } if (cm->min_log2_tiles) { max_tile_area_sb >>= (cm->min_log2_tiles + 1); } cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1); + if (cm->tile_cols > 1) { + cm->min_inner_tile_width = narrowest_inner_tile_sb + << cm->seq_params.mib_size_log2; + } } } @@ -143,30 +158,6 @@ int 
av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) { return sb_cols; } -int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) { - // Round the frame up to a whole number of max superblocks - mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2); - - // Divide by the signalled number of tiles, rounding up to the multiple of - // the max superblock size. To do this, shift right (and round up) to get the - // tile size in max super-blocks and then shift left again to convert it to - // mi units. - const int shift = log2_tile_num + MAX_MIB_SIZE_LOG2; - const int max_sb_tile_size = - ALIGN_POWER_OF_TWO(mi_frame_size, shift) >> shift; - const int mi_tile_size = max_sb_tile_size << MAX_MIB_SIZE_LOG2; - - // The actual number of tiles is the ceiling of the frame size in mi units - // divided by mi_size. This is at most 1 << log2_tile_num but might be - // strictly less if max_sb_tile_size got rounded up significantly. - if (ntiles) { - *ntiles = (mi_frame_size + mi_tile_size - 1) / mi_tile_size; - assert(*ntiles <= (1 << log2_tile_num)); - } - - return mi_tile_size; -} - AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, int is_uv) { AV1PixelRect r; @@ -205,3 +196,34 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, return r; } + +void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { + if (cm->uniform_tile_spacing_flag) { + *w = cm->tile_width; + *h = cm->tile_height; + } else { + for (int i = 0; i < cm->tile_cols; ++i) { + const int tile_width_sb = + cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i]; + const int tile_w = tile_width_sb * cm->seq_params.mib_size; + assert(i == 0 || tile_w == *w); // ensure all tiles have same dimension + *w = tile_w; + } + + for (int i = 0; i < cm->tile_rows; ++i) { + const int tile_height_sb = + cm->tile_row_start_sb[i + 1] - cm->tile_row_start_sb[i]; + const int tile_h = tile_height_sb * cm->seq_params.mib_size; + 
assert(i == 0 || tile_h == *h); // ensure all tiles have same dimension + *h = tile_h; + } + } +} + +int is_min_tile_width_satisfied(const AV1_COMMON *cm) { + // Disable check if there is a single tile col in the frame + if (cm->tile_cols == 1) return 1; + + return ((cm->min_inner_tile_width << MI_SIZE_LOG2) >= + (64 << av1_superres_scaled(cm))); +} diff --git a/libaom/av1/common/tile_common.h b/libaom/av1/common/tile_common.h index c03553d..a235f2d 100644 --- a/libaom/av1/common/tile_common.h +++ b/libaom/av1/common/tile_common.h @@ -25,7 +25,6 @@ struct AV1Common; typedef struct TileInfo { int mi_row_start, mi_row_end; int mi_col_start, mi_col_end; - int tg_horz_boundary; int tile_row; int tile_col; } TileInfo; @@ -37,12 +36,6 @@ void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row, void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row); void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col); -void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, - int *max_log2_tile_cols); - -// Calculate the correct tile size (width or height) for (1 << log2_tile_num) -// tiles horizontally or vertically in the frame. 
-int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles); int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile); int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile); @@ -61,10 +54,14 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, #define MAX_TILE_WIDTH (4096) // Max Tile width in pixels #define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels +void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h); void av1_get_tile_limits(struct AV1Common *const cm); void av1_calculate_tile_cols(struct AV1Common *const cm); void av1_calculate_tile_rows(struct AV1Common *const cm); +// Checks if the minimum tile_width requirement is satisfied +int is_min_tile_width_satisfied(const struct AV1Common *cm); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libaom/av1/common/txb_common.c b/libaom/av1/common/txb_common.c index c96d37c..cb92bd8 100644 --- a/libaom/av1/common/txb_common.c +++ b/libaom/av1/common/txb_common.c @@ -453,23 +453,6 @@ const int8_t *av1_nz_map_ctx_offset[19] = { av1_nz_map_ctx_offset_64x32, // TX_64x16 }; -void av1_init_lv_map(AV1_COMMON *cm) { - LV_MAP_CTX_TABLE *coeff_ctx_table = &cm->coeff_ctx_table; - for (int row = 0; row < 2; ++row) { - for (int col = 0; col < 2; ++col) { - for (int sig_mag = 0; sig_mag < 3; ++sig_mag) { - for (int count = 0; count < BASE_CONTEXT_POSITION_NUM + 1; ++count) { - if (row == 0 && col == 0 && count > 5) continue; - if ((row == 0 || col == 0) && count > 8) continue; - - coeff_ctx_table->base_ctx_table[row][col][sig_mag][count] = - get_base_ctx_from_count_mag(row, col, count, sig_mag); - } - } - } - } -} - const int16_t k_eob_group_start[12] = { 0, 1, 2, 3, 5, 9, 17, 33, 65, 129, 257, 513 }; const int16_t k_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; diff --git a/libaom/av1/common/txb_common.h b/libaom/av1/common/txb_common.h index 698e95b..8a3932d 100644 --- a/libaom/av1/common/txb_common.h +++ 
b/libaom/av1/common/txb_common.h @@ -159,6 +159,19 @@ static INLINE int get_br_ctx_2d(const uint8_t *const levels, return mag + 14; } +static AOM_FORCE_INLINE int get_br_ctx_eob(const int c, // raster order + const int bwl, + const TX_CLASS tx_class) { + const int row = c >> bwl; + const int col = c - (row << bwl); + if (c == 0) return 0; + if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) || + (tx_class == TX_CLASS_HORIZ && col == 0) || + (tx_class == TX_CLASS_VERT && row == 0)) + return 7; + return 14; +} + static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels, const int c, // raster order const int bwl, const TX_CLASS tx_class) { @@ -272,12 +285,10 @@ static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats( const int row = coeff_idx >> bwl; const int col = coeff_idx - (row << bwl); return ctx + nz_map_ctx_offset_1d[col]; - break; } case TX_CLASS_VERT: { const int row = coeff_idx >> bwl; return ctx + nz_map_ctx_offset_1d[row]; - break; } default: break; } @@ -421,6 +432,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize, #undef MAX_TX_SIZE_UNIT } -void av1_init_lv_map(AV1_COMMON *cm); - #endif // AOM_AV1_COMMON_TXB_COMMON_H_ diff --git a/libaom/av1/common/warped_motion.c b/libaom/av1/common/warped_motion.c index 4144c43..e232e10 100644 --- a/libaom/av1/common/warped_motion.c +++ b/libaom/av1/common/warped_motion.c @@ -485,7 +485,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, uint16_t *dst16 = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; int32_t tmp32 = *p; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp32 = tmp32 * conv_params->fwd_offset + sum * conv_params->bck_offset; tmp32 = tmp32 >> DIST_PRECISION_BITS; @@ -563,7 +563,7 @@ static int64_t highbd_warp_error( uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; ConvolveParams conv_params = get_conv_params(0, 0, bd); - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; for (int i = 
p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { // avoid warping extra 8x8 blocks in the padded region of the frame @@ -773,7 +773,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width, uint8_t *dst8 = &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)]; int32_t tmp32 = *p; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp32 = tmp32 * conv_params->fwd_offset + sum * conv_params->bck_offset; tmp32 = tmp32 >> DIST_PRECISION_BITS; @@ -846,7 +846,7 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref, int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK); uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK]; ConvolveParams conv_params = get_conv_params(0, 0, 8); - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) { for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) { diff --git a/libaom/av1/common/x86/av1_convolve_scale_sse4.c b/libaom/av1/common/x86/av1_convolve_scale_sse4.c index d9fb537..8f44238 100644 --- a/libaom/av1/common/x86/av1_convolve_scale_sse4.c +++ b/libaom/av1/common/x86/av1_convolve_scale_sse4.c @@ -175,7 +175,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, if (conv_params->is_compound) { if (conv_params->do_average) { const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16); const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt); const __m128i shifted_32 = @@ -207,7 +207,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if 
(conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { @@ -408,7 +408,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x)); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), _mm_mullo_epi32(shifted, wt1)); shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS); @@ -443,7 +443,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst, if (conv_params->is_compound) { if (conv_params->do_average) { int32_t tmp = dst16[y * dst16_stride + x]; - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset; tmp = tmp >> DIST_PRECISION_BITS; } else { diff --git a/libaom/av1/common/x86/av1_inv_txfm_ssse3.c b/libaom/av1/common/x86/av1_inv_txfm_ssse3.c index 9841bf3..de0a561 100644 --- a/libaom/av1/common/x86/av1_inv_txfm_ssse3.c +++ b/libaom/av1/common/x86/av1_inv_txfm_ssse3.c @@ -2920,8 +2920,18 @@ void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param) { const TX_TYPE tx_type = txfm_param->tx_type; if (!txfm_param->lossless) { - av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type, - txfm_param->tx_size, txfm_param->eob); + switch (txfm_param->tx_size) { + case TX_4X16: + case TX_16X4: + // TODO(http://crbug.com/aomedia/2350): the ssse3 versions cause test + // vector mismatches. 
+ av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); + break; + default: + av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type, + txfm_param->tx_size, txfm_param->eob); + break; + } } else { av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); } diff --git a/libaom/av1/common/x86/av1_inv_txfm_ssse3.h b/libaom/av1/common/x86/av1_inv_txfm_ssse3.h index 66bd339..7d5055d 100644 --- a/libaom/av1/common/x86/av1_inv_txfm_ssse3.h +++ b/libaom/av1/common/x86/av1_inv_txfm_ssse3.h @@ -72,13 +72,13 @@ static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) { } // 1D itx types -typedef enum ATTRIBUTE_PACKED { +enum { IDCT_1D, IADST_1D, IFLIPADST_1D = IADST_1D, IIDENTITY_1D, ITX_TYPES_1D, -} ITX_TYPE_1D; +} UENUM1BYTE(ITX_TYPE_1D); static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, diff --git a/libaom/av1/common/x86/av1_txfm_sse4.c b/libaom/av1/common/x86/av1_txfm_sse4.c index 90b9879..65ccd19 100644 --- a/libaom/av1/common/x86/av1_txfm_sse4.c +++ b/libaom/av1/common/x86/av1_txfm_sse4.c @@ -9,7 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/av1_txfm.h" #include "av1/common/x86/av1_txfm_sse4.h" diff --git a/libaom/av1/common/x86/convolve_2d_avx2.c b/libaom/av1/common/x86/convolve_2d_avx2.c index 0acafd0..ae12a60 100644 --- a/libaom/av1/common/x86/convolve_2d_avx2.c +++ b/libaom/av1/common/x86/convolve_2d_avx2.c @@ -27,31 +27,15 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { const int bd = 8; - - DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); - int im_h = h + filter_params_y->taps - 1; int im_stride = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - + int i, is_horiz_4tap = 0, is_vert_4tap = 0; + DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); const int bits = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; - __m256i filt[4], coeffs_h[4], coeffs_v[4]; - assert(conv_params->round_0 > 0); - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v); - const __m256i round_const_h = _mm256_set1_epi16( ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); @@ -65,58 +49,96 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, ((1 << (offset_bits - 
conv_params->round_1)) >> 1)); const __m128i round_shift_v = _mm_cvtsi32_si128(bits); - for (j = 0; j < w; j += 8) { - for (i = 0; i < im_h; i += 2) { - __m256i data = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + __m256i filt[4], coeffs_h[4], coeffs_v[4]; + + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h); + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v); - // Load the next line - if (i + 1 < im_h) + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0))) + is_horiz_4tap = 1; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_v[0], coeffs_v[3]), 0))) + is_vert_4tap = 1; + + // horz_filt as 4 tap and vert_filt as 8 tap + if (is_horiz_4tap) { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + // horz-filter + for (int j = 0; j < w; j += 8) { + for (i = 0; i < (im_h - 2); i += 2) { + __m256i data = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + // Load the next line data = _mm256_inserti128_si256( data, _mm_loadu_si128( (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), 1); + __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt); - __m256i res = convolve_lowbd_x(data, coeffs_h, filt); + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), + round_shift_h); + _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + } + __m256i data_1 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); + + __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt); res = 
_mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); - _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); - } - /* Vertical filter */ - { + // vert filter + CONVOLVE_SR_VERTICAL_FILTER_8TAP; + } + } else if (is_vert_4tap) { + int im_h = h + 3; + const int fo_vert = 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + // horz_filter + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP; + // vert_filter + __m256i s[6]; __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); - __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); - __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); - __m256i s[8]; s[0] = _mm256_unpacklo_epi16(src_0, src_1); s[1] = _mm256_unpacklo_epi16(src_2, src_3); - s[2] = _mm256_unpacklo_epi16(src_4, src_5); - - s[4] = _mm256_unpackhi_epi16(src_0, src_1); - s[5] = _mm256_unpackhi_epi16(src_2, src_3); - s[6] = _mm256_unpackhi_epi16(src_4, src_5); + s[3] = _mm256_unpackhi_epi16(src_0, src_1); + s[4] = _mm256_unpackhi_epi16(src_2, src_3); for (i = 0; i < h; i += 2) { const int16_t *data = &im_block[i * im_stride]; - const __m256i s6 = - _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); - const __m256i s7 = - _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + const __m256i s4 = + _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); + const __m256i s5 = + _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); - s[3] = _mm256_unpacklo_epi16(s6, s7); - s[7] = 
_mm256_unpackhi_epi16(s6, s7); + s[2] = _mm256_unpacklo_epi16(s4, s5); + s[5] = _mm256_unpackhi_epi16(s4, s5); - __m256i res_a = convolve(s, coeffs_v); - __m256i res_b = convolve(s + 4, coeffs_v); + __m256i res_a = convolve_4tap(s, coeffs_v + 1); + __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); // Combine V round and 2F-H-V round into a single rounding res_a = @@ -154,13 +176,25 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, s[0] = s[1]; s[1] = s[2]; - s[2] = s[3]; - + s[3] = s[4]; s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; } } + } else { + int j; + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (j = 0; j < w; j += 8) { + CONVOLVE_SR_HORIZONTAL_FILTER_8TAP; + + CONVOLVE_SR_VERTICAL_FILTER_8TAP; + } } } @@ -195,20 +229,20 @@ void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride, if (w == 2) { do { - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 4) { do { - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; diff --git a/libaom/av1/common/x86/convolve_2d_sse2.c b/libaom/av1/common/x86/convolve_2d_sse2.c index b1a62a4..369922b 100644 --- a/libaom/av1/common/x86/convolve_2d_sse2.c +++ b/libaom/av1/common/x86/convolve_2d_sse2.c @@ -255,20 +255,20 @@ void av1_convolve_2d_copy_sr_sse2(const 
uint8_t *src, int src_stride, if (w == 2) { do { - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; } while (h); } else if (w == 4) { do { - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 4 * sizeof(*src)); + memmove(dst, src, 4 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; @@ -354,12 +354,11 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride, } } -void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_copy_sse2( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; @@ -371,7 +370,7 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, const int bits = FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const __m128i zero = _mm_setzero_si128(); const __m128i left_shift = _mm_cvtsi32_si128(bits); int i, j; @@ -411,14 +410,14 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, const __m128i data_ref_0_hi = _mm_loadu_si128((__m128i *)(&dst[j + 8])); - const __m128i comp_avg_res_lo 
= - comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg); const __m128i round_result_lo = convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); - const __m128i comp_avg_res_hi = - comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg); + const __m128i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg); const __m128i round_result_hi = convolve_rounding( &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); @@ -449,7 +448,7 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); diff --git a/libaom/av1/common/x86/convolve_avx2.c b/libaom/av1/common/x86/convolve_avx2.c index 0e91ea9..21b9fe4 100644 --- a/libaom/av1/common/x86/convolve_avx2.c +++ b/libaom/av1/common/x86/convolve_avx2.c @@ -23,153 +23,239 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride; - + int i, j, is_vert_4tap = 0; // right shift is F-1 because we are already dividing // filter co-efficients by 2 const int right_shift_bits = (FILTER_BITS - 1); const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits); const __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1); - __m256i coeffs[4], s[8]; assert(conv_params->round_0 <= FILTER_BITS); 
assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) || ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS))); - prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs); - (void)filter_params_x; (void)subpel_x_q4; (void)conv_params; + __m256i coeffs[4], s[8]; + __m128i d[6]; - for (j = 0; j < w; j += 16) { - const uint8_t *data = &src_ptr[j]; - __m256i src6; - - // Load lines a and b. Line a to lower 128, line b to upper 128 - const __m256i src_01a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 0 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - 0x20); - - const __m256i src_12a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 1 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - 0x20); - - const __m256i src_23a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 2 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - 0x20); - - const __m256i src_34a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 3 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - 0x20); - - const __m256i src_45a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 4 * src_stride))), - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - 0x20); - - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); - const __m256i src_56a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), - src6, 0x20); - - s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); - s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); - s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); 
- - s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); - s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); - s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); - - for (i = 0; i < h; i += 2) { - data = &src_ptr[i * src_stride + j]; - const __m256i src_67a = _mm256_permute2x128_si256( - src6, - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - 0x20); + prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs); - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); - const __m256i src_78a = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), - src6, 0x20); + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + // vert_filt as 4 tap + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + + // Load lines a and b. 
Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); + + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); - s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); - s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); - const __m256i res_lo = convolve_lowbd(s, coeffs); + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); - /* rounding code */ - // shift by F - 1 - const __m256i res_16b_lo = _mm256_sra_epi16( - _mm256_add_epi16(res_lo, right_shift_const), right_shift); - // 8 bit conversion and saturation to uint8 - __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); - if (w - j > 8) { - const __m256i res_hi = convolve_lowbd(s + 4, coeffs); + s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); + + d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); + const __m256i src_56a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20); + + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); + + const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); /* rounding code */ // shift by F - 1 - const __m256i res_16b_hi = _mm256_sra_epi16( - _mm256_add_epi16(res_hi, right_shift_const), right_shift); + const __m256i res_16b_lo = 
_mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), right_shift); // 8 bit conversion and saturation to uint8 - __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); - - __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); - - const __m128i res_0 = _mm256_castsi256_si128(res_a); - const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); - - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], - res_1); - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); - const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); - if (w - j > 4) { - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], res_1); - } else if (w - j > 2) { - xx_storel_32(&dst[i * dst_stride + j], res_0); - xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); } else { - __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; - __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; - *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); - *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 
= _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } } + s[0] = s[1]; + s[1] = s[2]; + + s[3] = s[4]; + s[4] = s[5]; } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + + d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); + d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); + d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); + d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); + d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); + d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); + // Load lines a and b. 
Line a to lower 128, line b to upper 128 + const __m256i src_01a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; + const __m256i src_12a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); + + const __m256i src_23a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); + + const __m256i src_34a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); + + const __m256i src_45a = _mm256_permute2x128_si256( + _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); + const __m256i src_56a = + _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20); + + s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); + s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + + s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); + s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); + s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); + + for (i = 0; i < h; i += 2) { + data = &src_ptr[i * src_stride + j]; + const __m256i src_67a = _mm256_permute2x128_si256( + src6, + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + 0x20); + + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), + src6, 0x20); + + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + + const __m256i res_lo = convolve_lowbd(s, coeffs); - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_lo = _mm256_sra_epi16( + _mm256_add_epi16(res_lo, right_shift_const), 
right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); + + if (w - j > 8) { + const __m256i res_hi = convolve_lowbd(s + 4, coeffs); + + /* rounding code */ + // shift by F - 1 + const __m256i res_16b_hi = _mm256_sra_epi16( + _mm256_add_epi16(res_hi, right_shift_const), right_shift); + // 8 bit conversion and saturation to uint8 + __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); + + __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); + + const __m128i res_0 = _mm256_castsi256_si128(res_a); + const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); + + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); + const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], + res_1); + } else if (w - j > 2) { + xx_storel_32(&dst[i * dst_stride + j], res_0); + xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; + __m128i *const p_1 = + (__m128i *)&dst[i * dst_stride + j + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } } } } @@ -180,26 +266,14 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params) { - int i, j; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_horiz; const int bits = FILTER_BITS - conv_params->round_0; - __m256i filt[4], coeffs[4]; - 
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); - const __m256i round_0_const = _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(bits); - + int i, is_horiz_4tap = 0; (void)filter_params_y; (void)subpel_y_q4; @@ -208,51 +282,101 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst, ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); assert(conv_params->round_0 > 0); - if (w <= 8) { - for (i = 0; i < h; i += 2) { - const __m256i data = _mm256_permute2x128_si256( - _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), - _mm256_castsi128_si256(_mm_loadu_si128( - (__m128i *)(&src_ptr[i * src_stride + src_stride]))), - 0x20); - - __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); - - res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), - round_0_shift); - - res_16b = - _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift); - - /* rounding code */ - // 8 bit conversion and saturation to uint8 - __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); - - const __m128i res_0 = _mm256_castsi256_si128(res_8b); - const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); - if (w > 4) { - _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); - _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); - } else if (w > 2) { - xx_storel_32(&dst[i * dst_stride], res_0); - xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); - } else { - __m128i *const p_0 = 
(__m128i *)&dst[i * dst_stride]; - __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; - *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); - *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + __m256i coeffs[4], filt[4]; + filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; + + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + } + } 
else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } } } } else { - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 16) { - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18 - // 19 20 21 22 23 - const __m256i data = _mm256_inserti128_si256( - _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), - _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), - 1); + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + if (w <= 8) { + for (i = 0; i < h; i += 2) { + const __m256i data = _mm256_permute2x128_si256( + _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), + _mm256_castsi128_si256(_mm_loadu_si128( + (__m128i *)(&src_ptr[i * src_stride + src_stride]))), + 0x20); __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); @@ -266,11 +390,49 @@ void av1_convolve_x_sr_avx2(const 
uint8_t *src, int src_stride, uint8_t *dst, // 8 bit conversion and saturation to uint8 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); - // Store values into the destination buffer - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - res_8b = _mm256_permute4x64_epi64(res_8b, 216); - __m128i res = _mm256_castsi256_si128(res_8b); - _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + const __m128i res_0 = _mm256_castsi256_si128(res_8b); + const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); + if (w > 4) { + _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); + _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); + } else if (w > 2) { + xx_storel_32(&dst[i * dst_stride], res_0); + xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); + } else { + __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; + __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; + *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); + *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); + } + } + } else { + for (i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 + // 18 19 20 21 22 23 + const __m256i data = _mm256_inserti128_si256( + _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), + _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), + 1); + + __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), + round_0_shift); + + res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), + round_shift); + + /* rounding code */ + // 8 bit conversion and saturation to uint8 + __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); + + // Store values into the destination buffer + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + res_8b = _mm256_permute4x64_epi64(res_8b, 216); + __m128i res = _mm256_castsi256_si128(res_8b); + _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); + } } 
} } diff --git a/libaom/av1/common/x86/highbd_convolve_2d_avx2.c b/libaom/av1/common/x86/highbd_convolve_2d_avx2.c index ae68f0b..357df12 100644 --- a/libaom/av1/common/x86/highbd_convolve_2d_avx2.c +++ b/libaom/av1/common/x86/highbd_convolve_2d_avx2.c @@ -238,10 +238,10 @@ void av1_highbd_convolve_2d_copy_sr_avx2( if (w == 2) { do { - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; - memcpy(dst, src, 2 * sizeof(*src)); + memmove(dst, src, 2 * sizeof(*src)); src += src_stride; dst += dst_stride; h -= 2; diff --git a/libaom/av1/common/x86/highbd_convolve_2d_sse4.c b/libaom/av1/common/x86/highbd_convolve_2d_sse4.c index 3f8dafb..3c1d5d1 100644 --- a/libaom/av1/common/x86/highbd_convolve_2d_sse4.c +++ b/libaom/av1/common/x86/highbd_convolve_2d_sse4.c @@ -21,7 +21,7 @@ #include "aom_dsp/x86/convolve_sse4_1.h" #include "av1/common/convolve.h" -void av1_highbd_jnt_convolve_2d_copy_sse4_1( +void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, @@ -37,7 +37,7 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); @@ -75,15 +75,17 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( const __m128i res_unsigned_lo = _mm_add_epi32(res_32b_lo, offset_const); - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = + 
highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero); const __m128i res_unsigned_hi = _mm_add_epi32(res_32b_hi, offset_const); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -132,9 +134,9 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( _mm_add_epi32(res_32b_hi, offset_const); const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -166,7 +168,7 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1( } } -void av1_highbd_jnt_convolve_2d_sse4_1( +void av1_highbd_dist_wtd_convolve_2d_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, @@ -179,7 +181,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1( int im_stride = MAX_SB_SIZE; int i, j; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const uint16_t *const src_ptr = src - 
fo_vert * src_stride - fo_horiz; @@ -359,8 +361,9 @@ void av1_highbd_jnt_convolve_2d_sse4_1( const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0); - const __m128i comp_avg_res = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result = highbd_convolve_rounding_sse2( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -391,10 +394,12 @@ void av1_highbd_jnt_convolve_2d_sse4_1( const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo); const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi); - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const, diff --git a/libaom/av1/common/x86/highbd_inv_txfm_avx2.c b/libaom/av1/common/x86/highbd_inv_txfm_avx2.c index 5418057..fe22465 100644 --- a/libaom/av1/common/x86/highbd_inv_txfm_avx2.c +++ b/libaom/av1/common/x86/highbd_inv_txfm_avx2.c @@ -4309,213 +4309,17 @@ void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input, highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; - default: assert(0); break; - } -} - -void av1_highbd_inv_txfm_add_16x16_avx2(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const 
int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - default: - av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - const int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - // Assembly version doesn't support IDTX, so use C version for it. - case IDTX: - av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - - default: assert(0); - } -} - -void av1_highbd_inv_txfm_add_16x32_avx2(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - case IDTX: - av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: assert(0); - } -} - -void av1_highbd_inv_txfm_add_32x16_avx2(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - 
av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - case IDTX: - av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: assert(0); - } -} -void av1_highbd_inv_txfm_add_8x8_avx2(const tran_low_t *input, uint8_t *dest, - int stride, const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: case IDTX: - av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} -void av1_highbd_inv_txfm_add_8x32_avx2(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - case IDTX: - av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: assert(0); - } -} - -void av1_highbd_inv_txfm_add_32x8_avx2(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - case IDTX: - 
av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: assert(0); - } -} -void av1_highbd_inv_txfm_add_16x8_avx2(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: case H_DCT: - case V_ADST: case H_ADST: - case V_FLIPADST: case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: - av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_8x16_avx2(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
case V_DCT: - case H_DCT: case V_ADST: - case H_ADST: case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: - av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); + av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type, + tx_size, eob, bd); break; + default: assert(0); break; } } void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, @@ -4523,33 +4327,12 @@ void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { - case TX_32X32: - av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param); - break; - case TX_16X16: - av1_highbd_inv_txfm_add_16x16_avx2(input, dest, stride, txfm_param); - break; - case TX_8X8: - av1_highbd_inv_txfm_add_8x8_avx2(input, dest, stride, txfm_param); - break; case TX_4X8: av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); break; case TX_8X4: av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); break; - case TX_8X16: - av1_highbd_inv_txfm_add_8x16_avx2(input, dest, stride, txfm_param); - break; - case TX_16X8: - av1_highbd_inv_txfm_add_16x8_avx2(input, dest, stride, txfm_param); - break; - case TX_16X32: - av1_highbd_inv_txfm_add_16x32_avx2(input, dest, stride, txfm_param); - break; - case TX_32X16: - av1_highbd_inv_txfm_add_32x16_avx2(input, dest, stride, txfm_param); - break; case TX_4X4: av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); break; @@ -4559,21 +4342,10 @@ void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest, case TX_4X16: av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); break; - case TX_8X32: - av1_highbd_inv_txfm_add_8x32_avx2(input, dest, stride, 
txfm_param); - break; - case TX_32X8: - av1_highbd_inv_txfm_add_32x8_avx2(input, dest, stride, txfm_param); - break; - case TX_64X64: - case TX_32X64: - case TX_64X32: - case TX_16X64: - case TX_64X16: + default: av1_highbd_inv_txfm2d_add_universe_avx2( input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, txfm_param->eob, txfm_param->bd); break; - default: assert(0 && "Invalid transform size"); break; } } diff --git a/libaom/av1/common/x86/highbd_inv_txfm_sse4.c b/libaom/av1/common/x86/highbd_inv_txfm_sse4.c index 12c6350..8a8641d 100644 --- a/libaom/av1/common/x86/highbd_inv_txfm_sse4.c +++ b/libaom/av1/common/x86/highbd_inv_txfm_sse4.c @@ -583,7 +583,66 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride, _mm_storel_epi64((__m128i *)(output + 2 * stride), v2); _mm_storel_epi64((__m128i *)(output + 3 * stride), v3); } +static void highbd_clamp_epi32_sse4_1(const __m128i *in, __m128i *out, + const __m128i *clamp_lo, + const __m128i *clamp_hi, int size) { + __m128i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm_max_epi32(in[i], *clamp_lo); + out[i] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 1], *clamp_lo); + out[i + 1] = _mm_min_epi32(a1, *clamp_hi); + + a0 = _mm_max_epi32(in[i + 2], *clamp_lo); + out[i + 2] = _mm_min_epi32(a0, *clamp_hi); + + a1 = _mm_max_epi32(in[i + 3], *clamp_lo); + out[i + 3] = _mm_min_epi32(a1, *clamp_hi); + } +} +static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + (void)out_shift; + __m128i v[4]; + __m128i fact = _mm_set1_epi32(NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0, a1; + + a0 = _mm_mullo_epi32(in[0], fact); + a1 = _mm_mullo_epi32(in[1], fact); + a0 = _mm_add_epi32(a0, offset); + a1 = _mm_add_epi32(a1, offset); + out[0] = _mm_srai_epi32(a0, NewSqrt2Bits); + out[1] = _mm_srai_epi32(a1, NewSqrt2Bits); + + a0 = _mm_mullo_epi32(in[2], fact); + a1 = _mm_mullo_epi32(in[3], 
fact); + a0 = _mm_add_epi32(a0, offset); + a1 = _mm_add_epi32(a1, offset); + out[2] = _mm_srai_epi32(a0, NewSqrt2Bits); + out[3] = _mm_srai_epi32(a1, NewSqrt2Bits); + + if (!do_cols) { + const int log_range = AOMMAX(16, bd + 6); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + + highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4); + } + + // Transpose for 4x4 + v[0] = _mm_unpacklo_epi32(out[0], out[1]); + v[1] = _mm_unpackhi_epi32(out[0], out[1]); + v[2] = _mm_unpacklo_epi32(out[2], out[3]); + v[3] = _mm_unpackhi_epi32(out[2], out[3]); + out[0] = _mm_unpacklo_epi64(v[0], v[2]); + out[1] = _mm_unpackhi_epi64(v[0], v[2]); + out[2] = _mm_unpacklo_epi64(v[1], v[3]); + out[3] = _mm_unpackhi_epi64(v[1], v[3]); +} void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, int stride, TX_TYPE tx_type, int bd) { __m128i in[4]; @@ -646,6 +705,48 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output, iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); break; + case IDTX: + load_buffer_4x4(coeff, in); + iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_DCT: + load_buffer_4x4(coeff, in); + iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_DCT: + load_buffer_4x4(coeff, in); + idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_ADST: + load_buffer_4x4(coeff, 
in); + iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case H_ADST: + load_buffer_4x4(coeff, in); + iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd); + break; + case V_FLIPADST: + load_buffer_4x4(coeff, in); + iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd); + break; + case H_FLIPADST: + load_buffer_4x4(coeff, in); + iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0); + iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd); + break; default: assert(0); } } @@ -1116,6 +1217,61 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, &clamp_hi_out, out_shift); } } +static void shift_sse4_1(const __m128i *in, __m128i *out, + const __m128i *clamp_lo, const __m128i *clamp_hi, + int shift, int size) { + __m128i offset = _mm_set1_epi32((1 << shift) >> 1); + __m128i shift_vec = _mm_cvtsi32_si128(shift); + __m128i a0, a1; + for (int i = 0; i < size; i += 4) { + a0 = _mm_add_epi32(in[i], offset); + a1 = _mm_add_epi32(in[i + 1], offset); + a0 = _mm_sra_epi32(a0, shift_vec); + a1 = _mm_sra_epi32(a1, shift_vec); + a0 = _mm_max_epi32(a0, *clamp_lo); + a1 = _mm_max_epi32(a1, *clamp_lo); + out[i] = _mm_min_epi32(a0, *clamp_hi); + out[i + 1] = _mm_min_epi32(a1, *clamp_hi); + + a0 = _mm_add_epi32(in[i + 2], offset); + a1 = _mm_add_epi32(in[i + 3], offset); + a0 = _mm_sra_epi32(a0, shift_vec); + a1 = _mm_sra_epi32(a1, shift_vec); + a0 = _mm_max_epi32(a0, *clamp_lo); + a1 = _mm_max_epi32(a1, 
*clamp_lo); + out[i + 2] = _mm_min_epi32(a0, *clamp_hi); + out[i + 3] = _mm_min_epi32(a1, *clamp_hi); + } +} + +static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i v[8]; + v[0] = _mm_add_epi32(in[0], in[0]); + v[1] = _mm_add_epi32(in[1], in[1]); + v[2] = _mm_add_epi32(in[2], in[2]); + v[3] = _mm_add_epi32(in[3], in[3]); + v[4] = _mm_add_epi32(in[4], in[4]); + v[5] = _mm_add_epi32(in[5], in[5]); + v[6] = _mm_add_epi32(in[6], in[6]); + v[7] = _mm_add_epi32(in[7], in[7]); + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + + shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 8); + } else { + highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 8); + } +} static void round_shift_8x8(__m128i *in, int shift) { round_shift_4x4(&in[0], shift); @@ -3000,7 +3156,59 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, } } } +static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i v[16]; + __m128i fact = _mm_set1_epi32(2 * NewSqrt2); + __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m128i a0, a1, a2, a3; + + for (int i = 0; i < 16; i += 8) { + a0 = _mm_mullo_epi32(in[i], fact); + a1 = _mm_mullo_epi32(in[i + 1], fact); + a0 = _mm_add_epi32(a0, offset); + a1 = _mm_add_epi32(a1, offset); + v[i] = _mm_srai_epi32(a0, NewSqrt2Bits); + v[i + 1] = _mm_srai_epi32(a1, NewSqrt2Bits); + + a2 = _mm_mullo_epi32(in[i + 2], fact); + a3 = _mm_mullo_epi32(in[i + 3], fact); + a2 = _mm_add_epi32(a2, offset); + a3 = _mm_add_epi32(a3, offset); + v[i + 2] = _mm_srai_epi32(a2, NewSqrt2Bits); + v[i + 3] = _mm_srai_epi32(a3, NewSqrt2Bits); + + a0 = _mm_mullo_epi32(in[i + 4], fact); + a1 = _mm_mullo_epi32(in[i + 5], fact); + a0 = _mm_add_epi32(a0, offset); + a1 = _mm_add_epi32(a1, offset); + v[i + 4] = _mm_srai_epi32(a0, NewSqrt2Bits); + v[i + 5] = _mm_srai_epi32(a1, NewSqrt2Bits); + + a2 = _mm_mullo_epi32(in[i + 6], fact); + a3 = _mm_mullo_epi32(in[i + 7], fact); + a2 = _mm_add_epi32(a2, offset); + a3 = _mm_add_epi32(a3, offset); + v[i + 6] = _mm_srai_epi32(a2, NewSqrt2Bits); + v[i + 7] = _mm_srai_epi32(a3, NewSqrt2Bits); + } + + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 16); + } else { + highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 16); + } +} static INLINE void idct64_stage8_sse4_1( __m128i *u, const __m128i *cospim32, const __m128i *cospi32, const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16, @@ -5020,207 +5228,23 @@ void 
av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest, const TX_TYPE tx_type = txfm_param->tx_type; const int32_t *src = cast_to_int32(input); switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: case IDTX: - av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: case H_DCT: - case V_ADST: case H_ADST: - case V_FLIPADST: case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
case V_DCT: - case H_DCT: case V_ADST: - case H_ADST: case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, txfm_param->tx_size, txfm_param->eob, bd); break; - } -} - -void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input, - uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; default: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - } -} - -void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input, - uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - // Assembly version doesn't support IDTX, so use C version for it. 
- case IDTX: - av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - default: assert(0); - } -} - -void av1_highbd_inv_txfm_add_16x32_sse4_1(const tran_low_t *input, - uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - case IDTX: - av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: assert(0); - } -} - -void av1_highbd_inv_txfm_add_32x16_sse4_1(const tran_low_t *input, - uint8_t *dest, int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - case IDTX: - av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: assert(0); - } -} - -void av1_highbd_inv_txfm_add_8x32_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { - int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - case IDTX: - av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); - break; - default: assert(0); - } -} - -void av1_highbd_inv_txfm_add_32x8_sse4_1(const tran_low_t *input, uint8_t *dest, - int stride, - const TxfmParam *txfm_param) { 
- int bd = txfm_param->bd; - const TX_TYPE tx_type = txfm_param->tx_type; - const int32_t *src = cast_to_int32(input); - switch (tx_type) { - case DCT_DCT: - av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type, - txfm_param->tx_size, - txfm_param->eob, bd); - break; - case IDTX: - av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, - txfm_param->tx_type, txfm_param->bd); + av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, bd); break; - default: assert(0); } } - void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { @@ -5235,53 +5259,271 @@ void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest, av1_highbd_iwht4x4_add(input, dest, stride, eob, bd); return; } - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, bd); - break; - } + av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, + bd); } +static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols, + int bd, int out_shift) { + (void)bit; + const int log_range = AOMMAX(16, bd + (do_cols ? 
6 : 8)); + const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1))); + const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1); + __m128i v[32]; + for (int i = 0; i < 32; i += 16) { + v[i] = _mm_slli_epi32(in[i], 2); + v[i + 1] = _mm_slli_epi32(in[i + 1], 2); + v[i + 2] = _mm_slli_epi32(in[i + 2], 2); + v[i + 3] = _mm_slli_epi32(in[i + 3], 2); + v[i + 4] = _mm_slli_epi32(in[i + 4], 2); + v[i + 5] = _mm_slli_epi32(in[i + 5], 2); + v[i + 6] = _mm_slli_epi32(in[i + 6], 2); + v[i + 7] = _mm_slli_epi32(in[i + 7], 2); + v[i + 8] = _mm_slli_epi32(in[i + 8], 2); + v[i + 9] = _mm_slli_epi32(in[i + 9], 2); + v[i + 10] = _mm_slli_epi32(in[i + 10], 2); + v[i + 11] = _mm_slli_epi32(in[i + 11], 2); + v[i + 12] = _mm_slli_epi32(in[i + 12], 2); + v[i + 13] = _mm_slli_epi32(in[i + 13], 2); + v[i + 14] = _mm_slli_epi32(in[i + 14], 2); + v[i + 15] = _mm_slli_epi32(in[i + 15], 2); + } + if (!do_cols) { + const int log_range_out = AOMMAX(16, bd + 6); + const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX( + -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift)))); + const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN( + (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); + shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 32); + } else { + highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 32); + } +} static const transform_1d_sse4_1 highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { { { idct4x4_sse4_1, NULL, NULL, NULL }, { iadst4x4_sse4_1, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL }, + { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL }, }, { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL }, { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, + { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } }, { { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1, NULL }, { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1, NULL 
}, - { NULL, NULL, NULL, NULL }, + { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL }, }, { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1, idct32x32_sse4_1 }, { NULL, NULL, NULL, NULL }, - { NULL, NULL, NULL, NULL } }, + { iidentity32_sse4_1, NULL, NULL, NULL } }, { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1, idct64x64_sse4_1 }, { NULL, NULL, NULL, NULL }, { NULL, NULL, NULL, NULL } } }; +static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int buf_size_w_div4 = input_stride >> 2; + const int buf_size_h_div8 = (eoby + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < (buf_size_h_div8 << 1); ++i) { + __m128i buf0[16]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < buf_size_w_div4; ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, 
inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + + for (int j = 0; j < buf_size_w_div4; ++j) { + _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0]; + _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1]; + _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2]; + _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3]; + } + } + for (int i = 0; i < buf_size_w_div4; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i, + stride, ud_flip, txfm_size_row, bd); + } +} +static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input, + uint16_t *output, + int stride, TX_TYPE tx_type, + TX_SIZE tx_size, int eob, + const int bd) { + __m128i buf1[64]; + int eobx, eoby; + get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int buf_size_w_div8 = input_stride >> 2; + const int row_max = AOMMIN(32, txfm_size_row); + const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx]; + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + for (int i = 0; i < 
(row_max >> 2); ++i) { + __m128i buf0[16]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + + TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3], + buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1( + buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2); + } + row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + if (lr_flip) { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1], + buf0[4 * j], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2], + _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]); + } + } else { + for (int j = 0; j < buf_size_w_div8; ++j) { + TRANSPOSE_4X4( + buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3], + _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1], + _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]); + } + } + } + for (int i = 0; i < buf_size_w_div8; i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, ud_flip, + txfm_size_row, bd); + } + } +} +static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input, + uint16_t *output, int stride, + TX_TYPE tx_type, TX_SIZE tx_size, + int eob, const int bd) { + (void)eob; + __m128i 
buf1[64 * 4]; + const int8_t *shift = inv_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int txfm_size_col = tx_size_wide[tx_size]; + const int txfm_size_row = tx_size_high[tx_size]; + const int input_stride = AOMMIN(32, txfm_size_col); + const int row_max = AOMMIN(32, txfm_size_row); + const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); + const transform_1d_sse4_1 row_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0]; + const transform_1d_sse4_1 col_txfm = + highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0]; + + for (int i = 0; i < (row_max >> 2); ++i) { + __m128i buf0[32]; + const int32_t *input_row = input + i * input_stride * 4; + for (int j = 0; j < (input_stride >> 2); ++j) { + __m128i *buf0_cur = buf0 + j * 4; + load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4); + } + if (rect_type == 1 || rect_type == -1) { + av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0, + NewInvSqrt2); + } + row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]); + + __m128i *_buf1 = buf1 + i * 4; + for (int j = 0; j < (input_stride >> 2); ++j) { + _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0]; + _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1]; + _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2]; + _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3]; + } + } + for (int i = 0; i < (input_stride >> 2); i++) { + col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, + inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0); + + av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row, + buf1 + i * txfm_size_row, txfm_size_row, + -shift[1]); + } + // write to buffer + { + for (int i = 0; i < (txfm_size_col >> 3); i++) { + highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, + output + 8 * i, stride, 0, txfm_size_row, + bd); + } + } +} static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t 
*input, uint16_t *output, int stride, TX_TYPE tx_type, @@ -5613,6 +5855,24 @@ void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input, input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, bd); break; + case V_DCT: + case V_ADST: + case V_FLIPADST: + highbd_inv_txfm2d_add_h_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case H_DCT: + case H_ADST: + case H_FLIPADST: + highbd_inv_txfm2d_add_v_identity_ssse41( + input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob, + bd); + break; + case IDTX: + highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output), + stride, tx_type, tx_size, eob, bd); + break; default: assert(0); break; } } @@ -5623,26 +5883,9 @@ void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest, int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = txfm_param->tx_size; - const int32_t *src = cast_to_int32(input); int eob = txfm_param->eob; - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
- case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, tx_size, eob, bd); - break; - } + highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); } void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest, @@ -5651,26 +5894,9 @@ void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest, int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = txfm_param->tx_size; - const int32_t *src = cast_to_int32(input); int eob = txfm_param->eob; - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, - tx_type, tx_size, eob, bd); - break; - } + highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); } void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, @@ -5679,26 +5905,9 @@ void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest, int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = txfm_param->tx_size; - const int32_t *src = cast_to_int32(input); int eob = txfm_param->eob; - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. 
- case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), - stride, tx_type, tx_size, eob, bd); - break; - } + highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); } void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest, @@ -5707,26 +5916,9 @@ void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest, int bd = txfm_param->bd; const TX_TYPE tx_type = txfm_param->tx_type; const TX_SIZE tx_size = txfm_param->tx_size; - const int32_t *src = cast_to_int32(input); int eob = txfm_param->eob; - switch (tx_type) { - // Assembly version doesn't support some transform types, so use C version - // for those. - case V_DCT: - case H_DCT: - case V_ADST: - case H_ADST: - case V_FLIPADST: - case H_FLIPADST: - case IDTX: - av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type, - bd); - break; - default: - highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), - stride, tx_type, tx_size, eob, bd); - break; - } + highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride, + tx_type, tx_size, eob, bd); } void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, @@ -5734,57 +5926,16 @@ void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest, assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]); const TX_SIZE tx_size = txfm_param->tx_size; switch (tx_size) { - case TX_32X32: - av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X16: - av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_8X8: - av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param); - break; case TX_4X8: 
av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param); break; case TX_8X4: av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param); break; - case TX_8X16: - av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X8: - av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X32: - av1_highbd_inv_txfm_add_16x32_sse4_1(input, dest, stride, txfm_param); - break; - case TX_32X16: - av1_highbd_inv_txfm_add_32x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_4X4: - av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param); - break; - case TX_16X4: - av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param); - break; - case TX_4X16: - av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param); - break; - case TX_8X32: - av1_highbd_inv_txfm_add_8x32_sse4_1(input, dest, stride, txfm_param); - break; - case TX_32X8: - av1_highbd_inv_txfm_add_32x8_sse4_1(input, dest, stride, txfm_param); - break; - case TX_64X64: - case TX_32X64: - case TX_64X32: - case TX_16X64: - case TX_64X16: - av1_highbd_inv_txfm2d_add_universe_sse4_1( - input, dest, stride, txfm_param->tx_type, txfm_param->tx_size, - txfm_param->eob, txfm_param->bd); + default: + // TODO(http://crbug.com/aomedia/2350): the remaining sse4_1 versions + // cause test vector mismatches. 
+ av1_highbd_inv_txfm_add_c(input, dest, stride, txfm_param); break; - default: assert(0 && "Invalid transform size"); break; } } diff --git a/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c b/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c index e298cf6..c5040c4 100644 --- a/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c +++ b/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c @@ -22,7 +22,7 @@ #include "aom_dsp/aom_filter.h" #include "av1/common/convolve.h" -void av1_highbd_jnt_convolve_2d_copy_avx2( +void av1_highbd_dist_wtd_convolve_2d_copy_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, @@ -38,7 +38,7 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi32(w0); @@ -78,15 +78,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b_lo, offset_const); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); const __m256i res_unsigned_hi = _mm256_add_epi32(res_32b_hi, offset_const); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const 
__m256i round_result_lo = highbd_convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -135,8 +137,9 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b, offset_const); - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -179,15 +182,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( const __m256i res_unsigned_lo = _mm256_add_epi32(res_32b_lo, offset_const); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero); const __m256i res_unsigned_hi = _mm256_add_epi32(res_32b_hi, offset_const); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, @@ -223,7 +228,7 @@ void av1_highbd_jnt_convolve_2d_copy_avx2( } } -void av1_highbd_jnt_convolve_2d_avx2( +void av1_highbd_dist_wtd_convolve_2d_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, @@ -244,7 +249,7 @@ void av1_highbd_jnt_convolve_2d_avx2( __m256i s[8], coeffs_y[4], coeffs_x[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = 
conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; @@ -364,8 +369,9 @@ void av1_highbd_jnt_convolve_2d_avx2( const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -409,10 +415,12 @@ void av1_highbd_jnt_convolve_2d_avx2( const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, @@ -456,7 +464,7 @@ void av1_highbd_jnt_convolve_2d_avx2( } } -void av1_highbd_jnt_convolve_x_avx2( +void av1_highbd_dist_wtd_convolve_x_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, @@ -473,7 +481,7 @@ void av1_highbd_jnt_convolve_x_avx2( __m256i s[4], coeffs_x[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = 
conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m256i wt0 = _mm256_set1_epi32(w0); @@ -548,7 +556,7 @@ void av1_highbd_jnt_convolve_x_avx2( const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -588,10 +596,12 @@ void av1_highbd_jnt_convolve_x_avx2( const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); @@ -623,7 +633,7 @@ void av1_highbd_jnt_convolve_x_avx2( } } -void av1_highbd_jnt_convolve_y_avx2( +void av1_highbd_dist_wtd_convolve_y_avx2( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, @@ -640,7 +650,7 @@ void av1_highbd_jnt_convolve_y_avx2( int i, j; __m256i s[8], coeffs_y[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const 
int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; @@ -753,8 +763,9 @@ void av1_highbd_jnt_convolve_y_avx2( const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero); - const __m256i comp_avg_res = highbd_comp_avg( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res = + highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result = highbd_convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -799,10 +810,12 @@ void av1_highbd_jnt_convolve_y_avx2( const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero); const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero); - const __m256i comp_avg_res_lo = highbd_comp_avg( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m256i comp_avg_res_hi = highbd_comp_avg( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m256i comp_avg_res_lo = + highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, + use_dist_wtd_comp_avg); + const __m256i comp_avg_res_hi = + highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, + use_dist_wtd_comp_avg); const __m256i round_result_lo = highbd_convolve_rounding(&comp_avg_res_lo, &offset_const, diff --git a/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c b/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c index 1a29985..7fea36a 100644 --- a/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c +++ b/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c @@ -17,7 +17,7 @@ #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_sse4_1.h" -void av1_highbd_jnt_convolve_y_sse4_1( +void av1_highbd_dist_wtd_convolve_y_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, @@ -33,7 +33,7 @@ void av1_highbd_jnt_convolve_y_sse4_1( 
assert(bits >= 0); int i, j; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; @@ -121,10 +121,12 @@ void av1_highbd_jnt_convolve_y_sse4_1( const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero); - const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1( - &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_0 = + highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_1 = + highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_0 = highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const, @@ -186,16 +188,16 @@ void av1_highbd_jnt_convolve_y_sse4_1( const __m128i comp_avg_res_lo_0 = highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_lo_1 = highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi_0 = highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i comp_avg_res_hi_1 = highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1, - &wt0, &wt1, use_jnt_comp_avg); + &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo_0 = highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const, @@ -257,7 +259,7 @@ void av1_highbd_jnt_convolve_y_sse4_1( } } -void av1_highbd_jnt_convolve_x_sse4_1( 
+void av1_highbd_dist_wtd_convolve_x_sse4_1( const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, @@ -274,7 +276,7 @@ void av1_highbd_jnt_convolve_x_sse4_1( __m128i s[4], coeffs_x[4]; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int w0 = conv_params->fwd_offset; const int w1 = conv_params->bck_offset; const __m128i wt0 = _mm_set1_epi32(w0); @@ -339,7 +341,7 @@ void av1_highbd_jnt_convolve_x_sse4_1( const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero); const __m128i comp_avg_res = highbd_comp_avg_sse4_1( - &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); + &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg); const __m128i round_result = highbd_convolve_rounding_sse2( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -359,10 +361,12 @@ void av1_highbd_jnt_convolve_x_sse4_1( const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero); const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero); - const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1( - &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg); - const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1( - &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg); + const __m128i comp_avg_res_lo = + highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0, + &wt1, use_dist_wtd_comp_avg); + const __m128i comp_avg_res_hi = + highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0, + &wt1, use_dist_wtd_comp_avg); const __m128i round_result_lo = highbd_convolve_rounding_sse2( &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); diff --git a/libaom/av1/common/x86/highbd_warp_plane_sse4.c b/libaom/av1/common/x86/highbd_warp_plane_sse4.c 
index 4bcab05..3765c5e 100644 --- a/libaom/av1/common/x86/highbd_warp_plane_sse4.c +++ b/libaom/av1/common/x86/highbd_warp_plane_sse4.c @@ -537,7 +537,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p)); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0), _mm_mullo_epi32(res_lo, wt1)); res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS); @@ -570,7 +570,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref, (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4)); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0), _mm_mullo_epi32(res_hi, wt1)); res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS); diff --git a/libaom/av1/common/x86/jnt_convolve_avx2.c b/libaom/av1/common/x86/jnt_convolve_avx2.c index 9f2e2b4..23cd6ab 100644 --- a/libaom/av1/common/x86/jnt_convolve_avx2.c +++ b/libaom/av1/common/x86/jnt_convolve_avx2.c @@ -35,22 +35,20 @@ static INLINE __m256i load_line2_avx2(const void *a, const void *b) { _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20); } -void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int 
dst_stride = conv_params->dst_stride; const int bd = 8; - int i, j; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_horiz; + int i, j, is_horiz_4tap = 0; const int bits = FILTER_BITS - conv_params->round_1; const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -58,18 +56,10 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const int rounding_shift = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); - __m256i filt[4], coeffs[4]; assert(bits >= 0); assert(conv_params->round_0 > 0); - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); - const __m256i round_const = _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); @@ -77,68 +67,136 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, (void)filter_params_y; (void)subpel_y_q4; - for (i = 0; i < h; i += 2) { - const uint8_t *src_data = src_ptr + i * src_stride; - CONV_BUF_TYPE *dst_data = dst + i * dst_stride; - for (j = 0; j < w; j += 8) { - const __m256i data = - load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + __m256i filt[4], coeffs[4]; - __m256i res = convolve_lowbd_x(data, coeffs, filt); + filt[0] = 
_mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs); - res = _mm256_slli_epi16(res, bits); + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_horiz_4tap = 1; - const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + // horz_filt as 4 tap + if (is_horiz_4tap) { + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_horiz; + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); - // Accumulate values into the destination buffer - if (do_average) { - const __m256i data_ref_0 = - load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt); + res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift); + res = _mm256_slli_epi16(res, bits); - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); - const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + 
&comp_avg_res, &offset_const, &rounding_const, rounding_shift); - if (w > 4) { - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } } else { - *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); - *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = - _mm_cvtsi128_si32(res_1); + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); } - } else { - const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + } + } + } else { + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + for (i = 0; i < h; i += 2) { + const uint8_t *src_data = src_ptr + i * src_stride; + CONV_BUF_TYPE *dst_data = dst + i * dst_stride; + for (j = 0; j < w; j += 8) { + const __m256i data = + load_line2_avx2(&src_data[j], &src_data[j + src_stride]); + + __m256i res = convolve_lowbd_x(data, coeffs, filt); + + res = _mm256_sra_epi16(_mm256_add_epi16(res, 
round_const), round_shift); + + res = _mm256_slli_epi16(res, bits); + + const __m256i res_unsigned = _mm256_add_epi16(res, offset_const); + + // Accumulate values into the destination buffer + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]); + const __m256i comp_avg_res = + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + + const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + if (w > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); - const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); + const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } } } } } -void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + 
const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride; + int i, j, is_vert_4tap = 0; // +1 to compensate for dividing the filter coeffs by 2 const int left_shift = FILTER_BITS - conv_params->round_0 + 1; const __m256i round_const = @@ -146,7 +204,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -168,195 +226,389 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, (void)filter_params_x; (void)subpel_x_q4; - for (j = 0; j < w; j += 16) { - const uint8_t *data = &src_ptr[j]; - __m256i src6; - // Load lines a and b. 
Line a to lower 128, line b to upper 128 - { - __m256i src_ab[7]; - __m256i src_a[7]; - src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - for (int kk = 0; kk < 6; ++kk) { - data += src_stride; - src_a[kk + 1] = - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0))) + is_vert_4tap = 1; + + if (is_vert_4tap) { + const int fo_vert = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src4; + // Load lines a and b. Line a to lower 128, line b to upper 128 + { + __m256i src_ab[4]; + __m256i src_a[5]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 4; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); + } + src4 = src_a[4]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + + s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); } - src6 = src_a[6]; - s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); - s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); - s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); - s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); - s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); - s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); - } - for (i = 0; i < h; i += 2) { - data = &src_ptr[(i + 7) * src_stride + j]; - const __m256i src7 = - _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); - const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 5) * src_stride + j]; + const __m256i 
src5 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20); - src6 = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)(data + src_stride))); - const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); + src4 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20); - s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); - s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); + s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); + s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); - __m256i res_lo = convolve_lowbd(s, coeffs); + __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); - res_lo = _mm256_add_epi16(res_lo, offset_const_1); + res_lo = _mm256_add_epi16(res_lo, offset_const_1); - const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); - const __m256i res_lo_0_shift = - _mm256_slli_epi32(res_lo_0_32b, left_shift); - const __m256i res_lo_0_round = _mm256_sra_epi32( - _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); - const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); - const __m256i res_lo_1_shift = - _mm256_slli_epi32(res_lo_1_32b, left_shift); - const __m256i res_lo_1_round = _mm256_sra_epi32( - _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); - const __m256i res_lo_round = - _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); + const 
__m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); - const __m256i res_lo_unsigned = - _mm256_add_epi16(res_lo_round, offset_const_2); + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); - if (w - j < 16) { - if (do_average) { - const __m256i data_ref_0 = load_line2_avx2( - &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg); + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); - const __m256i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); - const __m128i res_0 = _mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - if (w - j > 4) { - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_storel_epi64( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } } else { - *(uint32_t *)(&dst0[i * dst_stride0 + j]) = - _mm_cvtsi128_si32(res_0); - *(uint32_t *)(&dst0[i * 
dst_stride0 + j + dst_stride0]) = - _mm_cvtsi128_si32(res_1); + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); } } else { - const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1); - const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_1); + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); + + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + 
convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } + } + s[0] = s[1]; + s[1] = s[2]; + + s[3] = s[4]; + s[4] = s[5]; + } + } + } else { + const int fo_vert = filter_params_y->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride; + for (j = 0; j < w; j += 16) { + const uint8_t *data = &src_ptr[j]; + __m256i src6; + // Load lines a and b. 
Line a to lower 128, line b to upper 128 + { + __m256i src_ab[7]; + __m256i src_a[7]; + src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + for (int kk = 0; kk < 6; ++kk) { + data += src_stride; + src_a[kk + 1] = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + src_ab[kk] = + _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20); } - } else { - __m256i res_hi = convolve_lowbd(s + 4, coeffs); + src6 = src_a[6]; + s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]); + s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]); + s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]); + s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]); + s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]); + s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]); + } - res_hi = _mm256_add_epi16(res_hi, offset_const_1); + for (i = 0; i < h; i += 2) { + data = &src_ptr[(i + 7) * src_stride + j]; + const __m256i src7 = + _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data)); + const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20); - const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); - const __m256i res_hi_0_shift = - _mm256_slli_epi32(res_hi_0_32b, left_shift); - const __m256i res_hi_0_round = _mm256_sra_epi32( - _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + src6 = _mm256_castsi128_si256( + _mm_loadu_si128((__m128i *)(data + src_stride))); + const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20); - const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); - const __m256i res_hi_1_shift = - _mm256_slli_epi32(res_hi_1_32b, left_shift); - const __m256i res_hi_1_round = _mm256_sra_epi32( - _mm256_add_epi32(res_hi_1_shift, round_const), round_shift); + s[3] = _mm256_unpacklo_epi8(src_67a, src_78a); + s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); - const __m256i res_hi_round = - _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + __m256i res_lo = convolve_lowbd(s, coeffs); - const __m256i res_hi_unsigned = - 
_mm256_add_epi16(res_hi_round, offset_const_2); + res_lo = _mm256_add_epi16(res_lo, offset_const_1); - if (do_average) { - const __m256i data_ref_0_lo = load_line2_avx2( - &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); + const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero); + const __m256i res_lo_0_shift = + _mm256_slli_epi32(res_lo_0_32b, left_shift); + const __m256i res_lo_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_0_shift, round_const), round_shift); - const __m256i data_ref_0_hi = - load_line2_avx2(&dst[i * dst_stride + j + 8], - &dst[i * dst_stride + j + 8 + dst_stride]); + const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero); + const __m256i res_lo_1_shift = + _mm256_slli_epi32(res_lo_1_32b, left_shift); + const __m256i res_lo_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_lo_1_shift, round_const), round_shift); - const __m256i comp_avg_res_lo = - comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg); + const __m256i res_lo_round = + _mm256_packs_epi32(res_lo_0_round, res_lo_1_round); - const __m256i comp_avg_res_hi = - comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg); + const __m256i res_lo_unsigned = + _mm256_add_epi16(res_lo_round, offset_const_2); - const __m256i round_result_lo = convolve_rounding( - &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift); + if (w - j < 16) { + if (do_average) { + const __m256i data_ref_0 = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned, + &wt, use_dist_wtd_comp_avg); - const __m256i round_result_hi = convolve_rounding( - &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift); + const __m256i round_result = convolve_rounding( + &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - const __m256i res_8 = - _mm256_packus_epi16(round_result_lo, round_result_hi); - const __m128i res_0 = 
_mm256_castsi256_si128(res_8); - const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + const __m256i res_8 = + _mm256_packus_epi16(round_result, round_result); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); - _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); - _mm_store_si128( - (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + if (w - j > 4) { + _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_storel_epi64( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), + res_1); + } else { + *(uint32_t *)(&dst0[i * dst_stride0 + j]) = + _mm_cvtsi128_si32(res_0); + *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = + _mm_cvtsi128_si32(res_1); + } + } else { + const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); + const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_1); + } } else { - const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + __m256i res_hi = convolve_lowbd(s + 4, coeffs); - const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), - res_lo_1); + res_hi = _mm256_add_epi16(res_hi, offset_const_1); + + const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero); + const __m256i res_hi_0_shift = + _mm256_slli_epi32(res_hi_0_32b, left_shift); + const __m256i res_hi_0_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_0_shift, round_const), round_shift); + + const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero); + const __m256i res_hi_1_shift = + _mm256_slli_epi32(res_hi_1_32b, left_shift); + const __m256i res_hi_1_round = _mm256_sra_epi32( + _mm256_add_epi32(res_hi_1_shift, round_const), 
round_shift); + + const __m256i res_hi_round = + _mm256_packs_epi32(res_hi_0_round, res_hi_1_round); + + const __m256i res_hi_unsigned = + _mm256_add_epi16(res_hi_round, offset_const_2); + + if (do_average) { + const __m256i data_ref_0_lo = + load_line2_avx2(&dst[i * dst_stride + j], + &dst[i * dst_stride + j + dst_stride]); - const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0); + const __m256i data_ref_0_hi = + load_line2_avx2(&dst[i * dst_stride + j + 8], + &dst[i * dst_stride + j + 8 + dst_stride]); - const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1); - _mm_store_si128( - (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1); + const __m256i comp_avg_res_lo = comp_avg( + &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i comp_avg_res_hi = comp_avg( + &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg); + + const __m256i round_result_lo = + convolve_rounding(&comp_avg_res_lo, &offset_const, + &rounding_const, rounding_shift); + + const __m256i round_result_hi = + convolve_rounding(&comp_avg_res_hi, &offset_const, + &rounding_const, rounding_shift); + + const __m256i res_8 = + _mm256_packus_epi16(round_result_lo, round_result_hi); + const __m128i res_0 = _mm256_castsi256_si128(res_8); + const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); + + _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); + _mm_store_si128( + (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); + + } else { + const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0); + + const __m128i res_lo_1 = + _mm256_extracti128_si256(res_lo_unsigned, 1); + _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), + res_lo_1); + + const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned); + _mm_store_si128((__m128i *)(&dst[i * 
dst_stride + j + 8]), + res_hi_0); + + const __m128i res_hi_1 = + _mm256_extracti128_si256(res_hi_unsigned, 1); + _mm_store_si128( + (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), + res_hi_1); + } } - } - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; + } } } } -void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); - int im_h = h + filter_params_y->taps - 1; + int im_stride = 8; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + int i, is_horiz_4tap = 0, is_vert_4tap = 0; const __m256i wt = unpack_weights_avx2(conv_params); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -364,18 +616,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const int rounding_shift = 2 * FILTER_BITS - 
conv_params->round_0 - conv_params->round_1; const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1); - __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4]; assert(conv_params->round_0 > 0); - filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); - filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); - filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); - - prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x); - prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); - const __m256i round_const_h = _mm256_set1_epi16( ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2))); const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); @@ -385,9 +628,29 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); - for (j = 0; j < w; j += 8) { - /* Horizontal filter */ - { + __m256i filt[4], coeffs_x[4], coeffs_y[4]; + + filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); + + prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x); + prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y); + + // Condition for checking valid horz_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0))) + is_horiz_4tap = 1; + + // Condition for checking valid vert_filt taps + if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0))) + is_vert_4tap = 1; + + if (is_horiz_4tap) { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + for (int j = 0; j < w; j += 8) { + /* Horizontal 
filter */ const uint8_t *src_h = src_ptr + j; for (i = 0; i < im_h; i += 2) { __m256i data = @@ -396,49 +659,59 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, data = _mm256_inserti128_si256( data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); src_h += (src_stride << 1); - __m256i res = convolve_lowbd_x(data, coeffs_x, filt); + __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt); res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); } + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; } + } else if (is_vert_4tap) { + int im_h = h + 3; + const int fo_vert = 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; - /* Vertical filter */ - { + /* Vertical filter */ + __m256i s[6]; __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); - __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); - __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); s[0] = _mm256_unpacklo_epi16(s0, s1); s[1] = _mm256_unpacklo_epi16(s2, s3); - s[2] = _mm256_unpacklo_epi16(s4, s5); - s[4] = _mm256_unpackhi_epi16(s0, s1); - s[5] = _mm256_unpackhi_epi16(s2, s3); - s[6] = _mm256_unpackhi_epi16(s4, s5); + s[3] = _mm256_unpackhi_epi16(s0, s1); + s[4] = _mm256_unpackhi_epi16(s2, s3); for (i = 0; i < h; i += 2) { const int16_t *data = 
&im_block[i * im_stride]; - const __m256i s6 = - _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); - const __m256i s7 = - _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); + const __m256i s4 = + _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); + const __m256i s5 = + _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); - s[3] = _mm256_unpacklo_epi16(s6, s7); - s[7] = _mm256_unpackhi_epi16(s6, s7); + s[2] = _mm256_unpacklo_epi16(s4, s5); + s[5] = _mm256_unpackhi_epi16(s4, s5); - const __m256i res_a = convolve(s, coeffs_y); + const __m256i res_a = convolve_4tap(s, coeffs_y + 1); const __m256i res_a_round = _mm256_sra_epi32( _mm256_add_epi32(res_a, round_const_v), round_shift_v); if (w - j > 4) { - const __m256i res_b = convolve(s + 4, coeffs_y); + const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1); const __m256i res_b_round = _mm256_sra_epi32( _mm256_add_epi32(res_b, round_const_v), round_shift_v); const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); @@ -448,8 +721,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m256i data_ref_0 = load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -479,8 +752,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, load_line2_avx2(&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); - const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, + &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, 
rounding_shift); @@ -504,25 +777,36 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0, res_1); } } - s[0] = s[1]; s[1] = s[2]; - s[2] = s[3]; - + s[3] = s[4]; s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; } } + } else { + int im_h = h + filter_params_y->taps - 1; + const int fo_vert = filter_params_y->taps / 2 - 1; + const int fo_horiz = filter_params_x->taps / 2 - 1; + const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; + + filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); + filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); + + for (int j = 0; j < w; j += 8) { + /* Horizontal filter */ + const uint8_t *src_h = src_ptr + j; + DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP; + + DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP; + } } } -void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_copy_avx2( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; @@ -535,7 +819,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0; const __m128i left_shift = _mm_cvtsi32_si128(bits); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const __m256i wt = unpack_weights_avx2(conv_params); const __m256i zero = _mm256_setzero_si256(); @@ 
-562,7 +846,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j])); const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -600,7 +884,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride, const __m256i data_ref_0 = load_line2_avx2( &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); const __m256i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m256i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); diff --git a/libaom/av1/common/x86/jnt_convolve_sse2.c b/libaom/av1/common/x86/jnt_convolve_sse2.c index 7f5677b..641cd02 100644 --- a/libaom/av1/common/x86/jnt_convolve_sse2.c +++ b/libaom/av1/common/x86/jnt_convolve_sse2.c @@ -16,12 +16,12 @@ #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" -void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = conv_params->dst; const int dst_stride = conv_params->dst_stride; @@ -37,7 +37,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i 
wt1 = _mm_set1_epi16(w1); const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -77,7 +77,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -134,7 +134,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -150,12 +150,12 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, } } -void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { const int bd = 8; CONV_BUF_TYPE *dst = 
conv_params->dst; const int dst_stride = conv_params->dst_stride; @@ -167,7 +167,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset); const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const int offset_0 = bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); @@ -225,7 +225,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -254,7 +254,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -331,7 +331,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -360,7 +360,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, 
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); @@ -384,12 +384,12 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, } } -void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, - int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, + uint8_t *dst0, int dst_stride0, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_q4, const int subpel_y_q4, + ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; @@ -402,7 +402,7 @@ void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; const __m128i zero = _mm_setzero_si128(); @@ -594,7 +594,7 @@ void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, 
&offset_const, &rounding_const, rounding_shift); diff --git a/libaom/av1/common/x86/jnt_convolve_ssse3.c b/libaom/av1/common/x86/jnt_convolve_ssse3.c index 8227727..9aeab29 100644 --- a/libaom/av1/common/x86/jnt_convolve_ssse3.c +++ b/libaom/av1/common/x86/jnt_convolve_ssse3.c @@ -16,12 +16,11 @@ #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" -void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_q4, const int subpel_y_q4, - ConvolveParams *conv_params) { +void av1_dist_wtd_convolve_2d_ssse3( + const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w, + int h, const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, const int subpel_x_q4, + const int subpel_y_q4, ConvolveParams *conv_params) { CONV_BUF_TYPE *dst = conv_params->dst; int dst_stride = conv_params->dst_stride; const int bd = 8; @@ -34,7 +33,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, const int fo_vert = filter_params_y->taps / 2 - 1; const int fo_horiz = filter_params_x->taps / 2 - 1; const int do_average = conv_params->do_average; - const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg; + const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; const __m128i zero = _mm_setzero_si128(); @@ -211,7 +210,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg); + comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); const __m128i round_result = convolve_rounding( &comp_avg_res, &offset_const, &rounding_const, rounding_shift); diff --git a/libaom/av1/common/x86/warp_plane_sse4.c 
b/libaom/av1/common/x86/warp_plane_sse4.c index b810cea..4532d17 100644 --- a/libaom/av1/common/x86/warp_plane_sse4.c +++ b/libaom/av1/common/x86/warp_plane_sse4.c @@ -577,7 +577,7 @@ static INLINE void store_vertical_filter_output( __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j]; const __m128i p_16 = _mm_loadl_epi64(p); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16); const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt); const __m128i shifted_32 = @@ -610,7 +610,7 @@ static INLINE void store_vertical_filter_output( (__m128i *)&pred[(i + k + 4) * p_stride + j + 4]; const __m128i p4_16 = _mm_loadl_epi64(p4); - if (conv_params->use_jnt_comp_avg) { + if (conv_params->use_dist_wtd_comp_avg) { const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16); const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt); const __m128i shifted_32 = diff --git a/libaom/av1/common/x86/wiener_convolve_avx2.c b/libaom/av1/common/x86/wiener_convolve_avx2.c index 1f13e2f..87a6e12 100644 --- a/libaom/av1/common/x86/wiener_convolve_avx2.c +++ b/libaom/av1/common/x86/wiener_convolve_avx2.c @@ -17,7 +17,6 @@ #include "av1/common/convolve.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" -#include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/synonyms.h" #include "aom_dsp/x86/synonyms_avx2.h" @@ -26,207 +25,236 @@ // on the left. // A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be // loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ]. - -// Exploiting the range of wiener filter coefficients, -// horizontal filtering can be done in 16 bit intermediate precision. 
-// The details are as follows : -// Consider the horizontal wiener filter coefficients of the following form : -// [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0] -// Subtracting 2^(FILTER_BITS) from the centre tap we get the following : -// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0] -// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3 -// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit -// precision. Finally, after rounding the above result by round_0, we multiply -// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the -// horizontal filter output. - void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params) { + const int bd = 8; assert(x_step_q4 == 16 && y_step_q4 == 16); assert(!(w & 7)); (void)x_step_q4; (void)y_step_q4; - DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]); - int im_h = h + SUBPEL_TAPS - 2; - int im_stride = 8; - memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE); - int i, j; - const int center_tap = (SUBPEL_TAPS - 1) / 2; + DECLARE_ALIGNED(32, uint16_t, + temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); + int intermediate_height = h + SUBPEL_TAPS - 2; + memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE); + const int center_tap = ((SUBPEL_TAPS - 1) / 2); const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; - __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center; - - assert(conv_params->round_0 > 0); - - filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); - - filt_center = _mm256_load_si256((__m256i 
const *)filt_center_global_avx2); - - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x); - const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - coeffs_h[0] = - _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u)); - // coeffs 2 3 2 3 2 3 2 3 - coeffs_h[1] = - _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u)); - // coeffs 4 5 4 5 4 5 4 5 - coeffs_h[2] = - _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u)); - // coeffs 6 7 6 7 6 7 6 7 - coeffs_h[3] = - _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu)); - - const __m256i round_const_h = - _mm256_set1_epi16((1 << (conv_params->round_0 - 1))); - const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0); + const __m128i zero_128 = _mm_setzero_si128(); + const __m256i zero_256 = _mm256_setzero_si256(); // Add an offset to account for the "add_src" part of the convolve function. - const __m128i zero_128 = _mm_setzero_si128(); - const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); - const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0); - - const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00); - // coeffs 2 3 2 3 2 3 2 3 - coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55); - // coeffs 4 5 4 5 4 5 4 5 - coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa); - // coeffs 6 7 6 7 6 7 6 7 - coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff); - - const __m256i round_const_v = - _mm256_set1_epi32((1 << (conv_params->round_1 - 1))); - const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1); - - for (j = 0; j < w; j += 8) { - for (i = 0; i < im_h; i += 2) { - __m256i data = _mm256_castsi128_si256( - _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); - - // Load the next line - if (i + 1 < im_h) - data = 
_mm256_inserti128_si256( - data, - _mm_loadu_si128( - (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), - 1); - - __m256i res = convolve_lowbd_x(data, coeffs_h, filt); - - res = - _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); - - __m256i data_0 = _mm256_shuffle_epi8(data, filt_center); - - // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to - // the result - data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0); - res = _mm256_add_epi16(res, data_0); - - _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); + const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3); + + const __m256i clamp_low = zero_256; + const __m256i clamp_high = + _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); + + /* Horizontal filter */ + { + // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ] + const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset); + + // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); + // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ] + 
const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = _mm256_set1_epi32( + (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); + + for (int i = 0; i < intermediate_height; ++i) { + for (int j = 0; j < w; j += 16) { + const uint8_t *data_ij = src_ptr + i * src_stride + j; + + // Load 8-bit src data + const __m128i data_0 = xx_loadu_128(data_ij + 0); + const __m128i data_1 = xx_loadu_128(data_ij + 1); + const __m128i data_2 = xx_loadu_128(data_ij + 2); + const __m128i data_3 = xx_loadu_128(data_ij + 3); + const __m128i data_4 = xx_loadu_128(data_ij + 4); + const __m128i data_5 = xx_loadu_128(data_ij + 5); + const __m128i data_6 = xx_loadu_128(data_ij + 6); + const __m128i data_7 = xx_loadu_128(data_ij + 7); + + // (Zero-)Extend 8-bit data to 16-bit data + const __m256i src_0 = _mm256_cvtepu8_epi16(data_0); + const __m256i src_1 = _mm256_cvtepu8_epi16(data_1); + const __m256i src_2 = _mm256_cvtepu8_epi16(data_2); + const __m256i src_3 = _mm256_cvtepu8_epi16(data_3); + const __m256i src_4 = _mm256_cvtepu8_epi16(data_4); + const __m256i src_5 = _mm256_cvtepu8_epi16(data_5); + const __m256i src_6 = _mm256_cvtepu8_epi16(data_6); + const __m256i src_7 = _mm256_cvtepu8_epi16(data_7); + + // Multiply src data by filter coeffs and sum pairs + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + // Calculate scalar product for even- and odd-indices 
separately, + // increasing to 32-bit precision + const __m256i res_even_sum = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6)); + const __m256i res_odd_sum = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7)); + + const __m256i res_even = _mm256_srai_epi32( + _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0); + const __m256i res_odd = _mm256_srai_epi32( + _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0); + + // Reduce to 16-bit precision and pack even- and odd-index results + // back into one register. The _mm256_packs_epi32 intrinsic returns + // a register with the pixels ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i res = _mm256_packs_epi32(res_even, res_odd); + const __m256i res_clamped = + _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high); + + // Store in a temporary array + yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped); + } } + } - /* Vertical filter */ - { - __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); - __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); - __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); - __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); - __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); - __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); - - __m256i s[8]; - s[0] = _mm256_unpacklo_epi16(src_0, src_1); - s[1] = _mm256_unpacklo_epi16(src_2, src_3); - s[2] = _mm256_unpacklo_epi16(src_4, src_5); - - s[4] = _mm256_unpackhi_epi16(src_0, src_1); - s[5] = _mm256_unpackhi_epi16(src_2, src_3); - s[6] = _mm256_unpackhi_epi16(src_4, src_5); - - for (i = 0; i < h - 1; i += 2) { - const int16_t *data = &im_block[i * im_stride]; - - const __m256i s6 = - _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); - const __m256i s7 = - 
_mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); - - s[3] = _mm256_unpacklo_epi16(s6, s7); - s[7] = _mm256_unpackhi_epi16(s6, s7); - - __m256i res_a = convolve(s, coeffs_v); - __m256i res_b = convolve(s + 4, coeffs_v); - - const __m256i res_a_round = _mm256_sra_epi32( - _mm256_add_epi32(res_a, round_const_v), round_shift_v); - const __m256i res_b_round = _mm256_sra_epi32( - _mm256_add_epi32(res_b, round_const_v), round_shift_v); - - /* rounding code */ - // 16 bit conversion - const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); - // 8 bit conversion and saturation to uint8 - const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); - - const __m128i res_0 = _mm256_castsi256_si128(res_8b); - const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); - - // Store values into the destination buffer - __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; - __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; - - _mm_storel_epi64(p_0, res_0); - _mm_storel_epi64(p_1, res_1); - - s[0] = s[1]; - s[1] = s[2]; - s[2] = s[3]; - - s[4] = s[5]; - s[5] = s[6]; - s[6] = s[7]; - } - if (h - i) { - s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20); - s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20); - s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20); - - const int16_t *data = &im_block[i * im_stride]; - const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride)); - const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride)); - - __m128i s3 = _mm_unpacklo_epi16(s6_, s7_); - __m128i s7 = _mm_unpackhi_epi16(s6_, s7_); - - s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1); - __m256i convolveres = convolve(s, coeffs_v); - - const __m256i res_round = _mm256_sra_epi32( - _mm256_add_epi32(convolveres, round_const_v), round_shift_v); - - /* rounding code */ - // 16 bit conversion - __m128i reslo = _mm256_castsi256_si128(res_round); - __m128i reshi = _mm256_extracti128_si256(res_round, 1); - 
const __m128i res_16bit = _mm_packus_epi32(reslo, reshi); - - // 8 bit conversion and saturation to uint8 - const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit); - __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; - _mm_storel_epi64(p_0, res_8b); + /* Vertical filter */ + { + // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ] + const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset); + + // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ] + const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); + // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ] + const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); + + // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ] + const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123); + // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ] + const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123); + // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ] + const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567); + // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ] + const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567); + + // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ] + const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128); + // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ] + const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128); + // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ] + const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128); + // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ] + const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128); + + const __m256i round_const = + _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) - + (1 << (bd + conv_params->round_1 - 1))); + + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; j += 16) { + const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j; + + // Load 16-bit data from the output of the horizontal filter in + // which the 
pixels are ordered as follows: + // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ] + const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE); + const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE); + const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE); + const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE); + const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE); + const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE); + const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE); + const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE); + + // Filter the even-indices, increasing to 32-bit precision + const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1); + const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3); + const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5); + const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7); + + const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01); + const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23); + const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45); + const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67); + + const __m256i res_even = _mm256_add_epi32( + _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6)); + + // Filter the odd-indices, increasing to 32-bit precision + const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1); + const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3); + const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5); + const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7); + + const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01); + const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23); + const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45); + const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67); + + const __m256i res_odd = _mm256_add_epi32( + _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7)); + + // Pixels are currently in the 
following order: + // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ] + // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ] + // + // Rearrange the pixels into the following order: + // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ] + // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ] + const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd); + const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd); + + const __m256i res_lo_round = _mm256_srai_epi32( + _mm256_add_epi32(res_lo, round_const), conv_params->round_1); + const __m256i res_hi_round = _mm256_srai_epi32( + _mm256_add_epi32(res_hi, round_const), conv_params->round_1); + + // Reduce to 16-bit precision and pack into the correct order: + // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ] + const __m256i res_16bit = + _mm256_packs_epi32(res_lo_round, res_hi_round); + + // Reduce to 8-bit precision. This messes up the order: + // [ - - - - - - - - 15 14 13 12 11 10 9 8 ] + // [ - - - - - - - - 7 6 5 4 3 2 1 0 ] + const __m256i res_8bit = + _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */); + + // Swap the two central 32-bit values to get the order: + // [ - - - - - - - - - - - - - - - - ] + // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ] + const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8); + + // Store the lower 128-bit lane in the dst array + xx_storeu_128(dst + i * dst_stride + j, + _mm256_castsi256_si128(res_8bit2)); } } } diff --git a/libaom/av1/decoder/decodeframe.c b/libaom/av1/decoder/decodeframe.c index a30b267..b7fc370 100644 --- a/libaom/av1/decoder/decodeframe.c +++ b/libaom/av1/decoder/decodeframe.c @@ -64,6 +64,9 @@ #define ACCT_STR __func__ +#define AOM_MIN_THREADS_PER_TILE 1 +#define AOM_MAX_THREADS_PER_TILE 2 + // This is needed by ext_tile related unit tests. 
#define EXT_TILE_DEBUG 1 #define MC_TEMP_BUF_PELS \ @@ -153,13 +156,10 @@ static void inverse_transform_block(MACROBLOCKD *xd, int plane, const TX_SIZE tx_size, uint8_t *dst, int stride, int reduced_tx_set) { struct macroblockd_plane *const pd = &xd->plane[plane]; - tran_low_t *const dqcoeff = pd->dqcoeff; + tran_low_t *const dqcoeff = pd->dqcoeff_block + xd->cb_offset[plane]; eob_info *eob_data = pd->eob_data + xd->txb_offset[plane]; uint16_t scan_line = eob_data->max_scan_line; uint16_t eob = eob_data->eob; - - memcpy(dqcoeff, pd->dqcoeff_block + xd->cb_offset[plane], - (scan_line + 1) * sizeof(dqcoeff[0])); av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, stride, eob, reduced_tx_set); memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0])); @@ -696,27 +696,28 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm, assert(bw < 8 || bh < 8); ConvolveParams conv_params = get_conv_params_no_round( 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd); - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; struct buf_2d *const dst_buf = &pd->dst; uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; ref = 0; - const RefBuffer *ref_buf = - &cm->current_frame - .frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME]; + const RefCntBuffer *ref_buf = + get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *ref_scale_factors = + get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]); - pd->pre[ref].buf0 = (plane == 1) ? ref_buf->buf->buf.u_buffer - : ref_buf->buf->buf.v_buffer; + pd->pre[ref].buf0 = + (plane == 1) ? 
ref_buf->buf.u_buffer : ref_buf->buf.v_buffer; pd->pre[ref].buf = - pd->pre[ref].buf0 + - scaled_buffer_offset(pre_x, pre_y, ref_buf->buf->buf.uv_stride, - &ref_buf->sf); - pd->pre[ref].width = ref_buf->buf->buf.uv_crop_width; - pd->pre[ref].height = ref_buf->buf->buf.uv_crop_height; - pd->pre[ref].stride = ref_buf->buf->buf.uv_stride; + pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y, + ref_buf->buf.uv_stride, + ref_scale_factors); + pd->pre[ref].width = ref_buf->buf.uv_crop_width; + pd->pre[ref].height = ref_buf->buf.uv_crop_height; + pd->pre[ref].stride = ref_buf->buf.uv_stride; const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &ref_buf->sf; + is_intrabc ? &cm->sf_identity : ref_scale_factors; struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; const MV mv = this_mbmi->mv[ref].as_mv; @@ -736,7 +737,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm, &scaled_mv, &subpel_x_mv, &subpel_y_mv); pre = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0; src_stride = pre_buf->stride; - highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + highbd = is_cur_buf_hbd(xd); extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv, subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref], &pre, &src_stride); @@ -769,7 +770,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm, int src_stride[2]; for (ref = 0; ref < 1 + is_compound; ++ref) { const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf; + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref]; struct buf_2d *const pre_buf = is_intrabc ? 
dst_buf : &pd->pre[ref]; const MV mv = mi->mv[ref].as_mv; PadBlock block; @@ -780,9 +781,9 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm, dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &subpel_params[ref], bw, bh, &block, mi_x, mi_y, &scaled_mv, &subpel_x_mv, &subpel_y_mv); - pre[ref] = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0; + pre[ref] = pre_buf->buf0 + (int64_t)block.y0 * pre_buf->stride + block.x0; src_stride[ref] = pre_buf->stride; - highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + highbd = is_cur_buf_hbd(xd); WarpTypesAllowed warp_types; warp_types.global_warp_allowed = is_global[ref]; @@ -800,13 +801,13 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm, ConvolveParams conv_params = get_conv_params_no_round( 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); - av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset, - &conv_params.bck_offset, - &conv_params.use_jnt_comp_avg, is_compound); + av1_dist_wtd_comp_weight_assign( + cm, mi, 0, &conv_params.fwd_offset, &conv_params.bck_offset, + &conv_params.use_dist_wtd_comp_avg, is_compound); for (ref = 0; ref < 1 + is_compound; ++ref) { const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf; + is_intrabc ? 
&cm->sf_identity : xd->block_ref_scale_factors[ref]; WarpTypesAllowed warp_types; warp_types.global_warp_allowed = is_global[ref]; warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL; @@ -855,7 +856,7 @@ static void dec_build_inter_predictors_for_planes(const AV1_COMMON *cm, static void dec_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, - int mi_col, BUFFER_SET *ctx, + int mi_col, const BUFFER_SET *ctx, BLOCK_SIZE bsize) { dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0); @@ -870,7 +871,7 @@ static void dec_build_inter_predictors_sby(const AV1_COMMON *cm, static void dec_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, - int mi_col, BUFFER_SET *ctx, + int mi_col, const BUFFER_SET *ctx, BLOCK_SIZE bsize) { dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1, MAX_MB_PLANE - 1); @@ -1015,7 +1016,7 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { int len = sizeof(uint16_t); dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); dst_buf1[1] = @@ -1063,11 +1064,13 @@ static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd, assert(frame == INTRA_FRAME); assert(ref == 0); } else { - RefBuffer *ref_buf = &cm->current_frame.frame_refs[frame - LAST_FRAME]; + const RefCntBuffer *ref_buf = get_ref_frame_buf(cm, frame); + const struct scale_factors *ref_scale_factors = + get_ref_scale_factors_const(cm, frame); - xd->block_refs[ref] = ref_buf; - av1_setup_pre_planes(xd, ref, &ref_buf->buf->buf, mi_row, mi_col, - &ref_buf->sf, num_planes); + xd->block_ref_scale_factors[ref] = ref_scale_factors; + av1_setup_pre_planes(xd, ref, &ref_buf->buf, mi_row, mi_col, + ref_scale_factors, num_planes); } } 
@@ -2238,7 +2241,6 @@ static void setup_quantization(AV1_COMMON *const cm, cm->v_dc_delta_q = 0; cm->v_ac_delta_q = 0; } - cm->dequant_bit_depth = seq_params->bit_depth; cm->using_qmatrix = aom_rb_read_bit(rb); if (cm->using_qmatrix) { cm->qm_y = aom_rb_read_literal(rb, QM_LEVEL_BITS); @@ -2374,7 +2376,7 @@ static void setup_buffer_pool(AV1_COMMON *cm) { if (aom_realloc_frame_buffer( &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment, + AOM_DEC_BORDER_IN_PIXELS, cm->byte_alignment, &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { unlock_buffer_pool(pool); aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, @@ -2438,17 +2440,28 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm, int width, height; int found = 0; int has_valid_ref_frame = 0; - for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { if (aom_rb_read_bit(rb)) { - YV12_BUFFER_CONFIG *const buf = &cm->current_frame.frame_refs[i].buf->buf; - width = buf->y_crop_width; - height = buf->y_crop_height; - cm->render_width = buf->render_width; - cm->render_height = buf->render_height; - setup_superres(cm, rb, &width, &height); - resize_context_buffers(cm, width, height); - found = 1; - break; + const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i); + // This will never be NULL in a normal stream, as streams are required to + // have a shown keyframe before any inter frames, which would refresh all + // the reference buffers. However, it might be null if we're starting in + // the middle of a stream, and static analysis will error if we don't do + // a null check here. 
+ if (ref_buf == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Invalid condition: invalid reference buffer"); + } else { + const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf; + width = buf->y_crop_width; + height = buf->y_crop_height; + cm->render_width = buf->render_width; + cm->render_height = buf->render_height; + setup_superres(cm, rb, &width, &height); + resize_context_buffers(cm, width, height); + found = 1; + break; + } } } @@ -2469,20 +2482,20 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm, // Check to make sure at least one of frames that this frame references // has valid dimensions. - for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - RefBuffer *const ref_frame = &cm->current_frame.frame_refs[i]; + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); has_valid_ref_frame |= - valid_ref_frame_size(ref_frame->buf->buf.y_crop_width, - ref_frame->buf->buf.y_crop_height, width, height); + valid_ref_frame_size(ref_frame->buf.y_crop_width, + ref_frame->buf.y_crop_height, width, height); } if (!has_valid_ref_frame) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Referenced frame has invalid size"); - for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - RefBuffer *const ref_frame = &cm->current_frame.frame_refs[i]; + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i); if (!valid_ref_frame_img_fmt( - ref_frame->buf->buf.bit_depth, ref_frame->buf->buf.subsampling_x, - ref_frame->buf->buf.subsampling_y, seq_params->bit_depth, + ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x, + ref_frame->buf.subsampling_y, seq_params->bit_depth, seq_params->subsampling_x, seq_params->subsampling_y)) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Referenced frame has incompatible color format"); @@ -2716,9 +2729,10 @@ static const uint8_t *get_ls_tile_buffers( const int tile_col_size_bytes = 
pbi->tile_col_size_bytes; const int tile_size_bytes = pbi->tile_size_bytes; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); const int tile_copy_mode = - ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) ? 1 - : 0; + ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 1 : 0; // Read tile column sizes for all columns (we need the last tile buffer) for (int c = 0; c < tile_cols; ++c) { const int is_last = c == tile_cols - 1; @@ -3206,7 +3220,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, continue; td->bit_reader = &tile_data->bit_reader; - av1_zero(td->dqcoeff); + av1_zero(td->cb_buffer_base.dqcoeff); av1_tile_init(&td->xd.tile, cm, row, col); td->xd.current_qindex = cm->base_qindex; setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size, @@ -3220,7 +3234,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, td->bit_reader->accounting = NULL; } #endif - av1_init_macroblockd(cm, &td->xd, td->dqcoeff); + av1_init_macroblockd(cm, &td->xd, NULL); av1_init_above_context(cm, &td->xd, row); // Initialise the tile context from the frame context @@ -3277,7 +3291,7 @@ static void tile_worker_hook_init(AV1Decoder *const pbi, int tile_col = tile_data->tile_info.tile_col; td->bit_reader = &tile_data->bit_reader; - av1_zero(td->dqcoeff); + av1_zero(td->cb_buffer_base.dqcoeff); av1_tile_init(&td->xd.tile, cm, tile_row, tile_col); td->xd.current_qindex = cm->base_qindex; setup_bool_decoder(tile_buffer->data, thread_data->data_end, @@ -3292,7 +3306,7 @@ static void tile_worker_hook_init(AV1Decoder *const pbi, td->bit_reader->accounting = NULL; } #endif - av1_init_macroblockd(cm, &td->xd, td->dqcoeff); + av1_init_macroblockd(cm, &td->xd, NULL); td->xd.error_info = &thread_data->error_info; av1_init_above_context(cm, &td->xd, tile_row); @@ -3350,6 +3364,20 @@ static int tile_worker_hook(void *arg1, void *arg2) { return !td->xd.corrupted; } +static INLINE 
int get_max_row_mt_workers_per_tile(AV1_COMMON *cm, + TileInfo tile) { + // NOTE: Currently value of max workers is calculated based + // on the parse and decode time. As per the theoretical estimate + // when percentage of parse time is equal to percentage of decode + // time, number of workers needed to parse + decode a tile can not + // exceed more than 2. + // TODO(any): Modify this value if parsing is optimized in future. + int sb_rows = av1_get_sb_rows_in_tile(cm, tile); + int max_workers = + sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE; + return max_workers; +} + // The caller must hold pbi->row_mt_mutex_ when calling this function. // Returns 1 if either the next job is stored in *next_job_info or 1 is stored // in *end_of_frame. @@ -3380,8 +3408,8 @@ static int get_next_job_info(AV1Decoder *const pbi, int min_threads_working = INT_MAX; int max_mis_to_decode = 0; int tile_row_idx, tile_col_idx; - int tile_row = 0; - int tile_col = 0; + int tile_row = -1; + int tile_col = -1; memset(next_job_info, 0, sizeof(*next_job_info)); @@ -3429,7 +3457,9 @@ static int get_next_job_info(AV1Decoder *const pbi, max_mis_to_decode = 0; } if (num_threads_working == min_threads_working && - num_mis_to_decode > max_mis_to_decode) { + num_mis_to_decode > max_mis_to_decode && + num_threads_working < + get_max_row_mt_workers_per_tile(cm, tile_data->tile_info)) { max_mis_to_decode = num_mis_to_decode; tile_row = tile_row_idx; tile_col = tile_col_idx; @@ -3437,6 +3467,8 @@ static int get_next_job_info(AV1Decoder *const pbi, } } } + // No job found to process + if (tile_row == -1 || tile_col == -1) return 0; tile_data = pbi->tile_data + tile_row * cm->tile_cols + tile_col; tile_info = tile_data->tile_info; @@ -3565,9 +3597,22 @@ static int row_mt_worker_hook(void *arg1, void *arg2) { TileDataDec *const tile_data = cur_job_info->tile_data; tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data, allow_update_cdf); - +#if CONFIG_MULTITHREAD + 
pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + tile_data->dec_row_mt_sync.num_threads_working++; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif // decode tile parse_tile_row_mt(pbi, td, tile_data); +#if CONFIG_MULTITHREAD + pthread_mutex_lock(pbi->row_mt_mutex_); +#endif + tile_data->dec_row_mt_sync.num_threads_working--; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif } else { break; } @@ -3616,7 +3661,7 @@ static int row_mt_worker_hook(void *arg1, void *arg2) { TileInfo tile_info = tile_data->tile_info; av1_tile_init(&td->xd.tile, cm, tile_row, tile_col); - av1_init_macroblockd(cm, &td->xd, td->dqcoeff); + av1_init_macroblockd(cm, &td->xd, NULL); td->xd.error_info = &thread_data->error_info; decode_tile_sb_row(pbi, td, tile_info, mi_row); @@ -3825,7 +3870,7 @@ static void decode_mt_init(AV1Decoder *pbi) { thread_data->error_info.setjmp = 0; } } - const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0; + const int use_highbd = cm->seq_params.use_highbitdepth; const int buf_size = MC_TEMP_BUF_PELS << use_highbd; for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) { DecWorkerData *const thread_data = pbi->thread_data + worker_idx; @@ -3956,6 +4001,7 @@ static void dec_alloc_cb_buf(AV1Decoder *pbi) { av1_dec_free_cb_buf(pbi); CHECK_MEM_ERROR(cm, pbi->cb_buffer_base, aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size)); + memset(pbi->cb_buffer_base, 0, sizeof(*pbi->cb_buffer_base) * size); pbi->cb_buffer_alloc_size = size; } } @@ -4043,7 +4089,8 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, int tile_cols_start; int tile_cols_end; int tile_count_tg; - int num_workers; + int num_workers = 0; + int max_threads; const uint8_t *raw_data_end = NULL; int max_sb_rows = 0; @@ -4059,7 +4106,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, tile_cols_end = tile_cols; } tile_count_tg = end_tile - start_tile + 1; - 
num_workers = pbi->max_threads; + max_threads = pbi->max_threads; // No tiles to decode. if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start || @@ -4072,7 +4119,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, assert(tile_rows <= MAX_TILE_ROWS); assert(tile_cols <= MAX_TILE_COLS); assert(tile_count_tg > 0); - assert(num_workers > 0); + assert(max_threads > 0); assert(start_tile <= end_tile); assert(start_tile >= 0 && end_tile < n_tiles); @@ -4104,8 +4151,10 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data, max_sb_rows = AOMMAX(max_sb_rows, av1_get_sb_rows_in_tile(cm, tile_data->tile_info)); + num_workers += get_max_row_mt_workers_per_tile(cm, tile_data->tile_info); } } + num_workers = AOMMIN(num_workers, max_threads); if (pbi->allocated_row_mt_sync_rows != max_sb_rows) { for (int i = 0; i < n_tiles; ++i) { @@ -4190,20 +4239,38 @@ void av1_read_film_grain_params(AV1_COMMON *cm, if (!pars->update_parameters) { // inherit parameters from a previous reference frame - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3); - int buf_idx = cm->ref_frame_map[film_grain_params_ref_idx]; - if (buf_idx == INVALID_IDX) { + // Section 6.8.20: It is a requirement of bitstream conformance that + // film_grain_params_ref_idx is equal to ref_frame_idx[ j ] for some value + // of j in the range 0 to REFS_PER_FRAME - 1. + int found = 0; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + if (film_grain_params_ref_idx == cm->remapped_ref_idx[i]) { + found = 1; + break; + } + } + if (!found) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Invalid film grain reference idx %d. 
ref_frame_idx = " + "{%d, %d, %d, %d, %d, %d, %d}", + film_grain_params_ref_idx, cm->remapped_ref_idx[0], + cm->remapped_ref_idx[1], cm->remapped_ref_idx[2], + cm->remapped_ref_idx[3], cm->remapped_ref_idx[4], + cm->remapped_ref_idx[5], cm->remapped_ref_idx[6]); + } + RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx]; + if (buf == NULL) { aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Invalid Film grain reference idx"); } - if (!frame_bufs[buf_idx].film_grain_params_present) { + if (!buf->film_grain_params_present) { aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Film grain reference parameters not available"); } uint16_t random_seed = pars->random_seed; - *pars = frame_bufs[buf_idx].film_grain_params; // inherit paramaters - pars->random_seed = random_seed; // with new random seed + *pars = buf->film_grain_params; // inherit paramaters + pars->random_seed = random_seed; // with new random seed return; } @@ -4420,13 +4487,13 @@ void av1_read_timing_info_header(AV1_COMMON *cm, cm->timing_info.equal_picture_interval = aom_rb_read_bit(rb); // Equal picture interval bit if (cm->timing_info.equal_picture_interval) { - cm->timing_info.num_ticks_per_picture = - aom_rb_read_uvlc(rb) + 1; // ticks per picture - if (cm->timing_info.num_ticks_per_picture == 0) { + const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb); + if (num_ticks_per_picture_minus_1 == UINT32_MAX) { aom_internal_error( &cm->error, AOM_CODEC_UNSUP_BITSTREAM, "num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1."); } + cm->timing_info.num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1; } } @@ -4505,7 +4572,7 @@ void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, seq_params->enable_warped_motion = 0; seq_params->enable_dual_filter = 0; seq_params->order_hint_info.enable_order_hint = 0; - seq_params->order_hint_info.enable_jnt_comp = 0; + seq_params->order_hint_info.enable_dist_wtd_comp = 0; 
seq_params->order_hint_info.enable_ref_frame_mvs = 0; seq_params->force_screen_content_tools = 2; // SELECT_SCREEN_CONTENT_TOOLS seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV @@ -4517,7 +4584,7 @@ void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb, seq_params->enable_dual_filter = aom_rb_read_bit(rb); seq_params->order_hint_info.enable_order_hint = aom_rb_read_bit(rb); - seq_params->order_hint_info.enable_jnt_comp = + seq_params->order_hint_info.enable_dist_wtd_comp = seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0; seq_params->order_hint_info.enable_ref_frame_mvs = seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0; @@ -4663,62 +4730,71 @@ static void read_global_motion(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { } // Release the references to the frame buffers in cm->ref_frame_map and reset -// all elements of cm->ref_frame_map to -1. +// all elements of cm->ref_frame_map to NULL. static void reset_ref_frame_map(AV1_COMMON *const cm) { BufferPool *const pool = cm->buffer_pool; - RefCntBuffer *const frame_bufs = pool->frame_bufs; for (int i = 0; i < REF_FRAMES; i++) { - decrease_ref_count(cm->ref_frame_map[i], frame_bufs, pool); + decrease_ref_count(cm->ref_frame_map[i], pool); + cm->ref_frame_map[i] = NULL; } - memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); } // Generate next_ref_frame_map. static void generate_next_ref_frame_map(AV1Decoder *const pbi) { AV1_COMMON *const cm = &pbi->common; BufferPool *const pool = cm->buffer_pool; - RefCntBuffer *const frame_bufs = pool->frame_bufs; lock_buffer_pool(pool); // cm->next_ref_frame_map holds references to frame buffers. After storing a // frame buffer index in cm->next_ref_frame_map, we need to increase the // frame buffer's ref_count. 
int ref_index = 0; - for (int mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + for (int mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) { if (mask & 1) { - cm->next_ref_frame_map[ref_index] = cm->new_fb_idx; + cm->next_ref_frame_map[ref_index] = cm->cur_frame; } else { cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index]; } - if (cm->next_ref_frame_map[ref_index] >= 0) - ++frame_bufs[cm->next_ref_frame_map[ref_index]].ref_count; + if (cm->next_ref_frame_map[ref_index] != NULL) + ++cm->next_ref_frame_map[ref_index]->ref_count; ++ref_index; } for (; ref_index < REF_FRAMES; ++ref_index) { cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index]; - if (cm->next_ref_frame_map[ref_index] >= 0) - ++frame_bufs[cm->next_ref_frame_map[ref_index]].ref_count; + if (cm->next_ref_frame_map[ref_index] != NULL) + ++cm->next_ref_frame_map[ref_index]->ref_count; } unlock_buffer_pool(pool); pbi->hold_ref_buf = 1; } +// If the refresh_frame_flags bitmask is set, update reference frame id values +// and mark frames as valid for reference. 
+static void update_ref_frame_id(AV1_COMMON *const cm, int frame_id) { + assert(cm->seq_params.frame_id_numbers_present_flag); + int refresh_frame_flags = cm->current_frame.refresh_frame_flags; + for (int i = 0; i < REF_FRAMES; i++) { + if ((refresh_frame_flags >> i) & 1) { + cm->ref_frame_id[i] = frame_id; + cm->valid_for_referencing[i] = 1; + } + } +} + static void show_existing_frame_reset(AV1Decoder *const pbi, int existing_frame_idx) { AV1_COMMON *const cm = &pbi->common; - BufferPool *const pool = cm->buffer_pool; - RefCntBuffer *const frame_bufs = pool->frame_bufs; assert(cm->show_existing_frame); cm->current_frame.frame_type = KEY_FRAME; - pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1; + cm->current_frame.refresh_frame_flags = (1 << REF_FRAMES) - 1; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - cm->current_frame.frame_refs[i].buf = NULL; + cm->remapped_ref_idx[i] = INVALID_IDX; } if (pbi->need_resync) { @@ -4726,22 +4802,10 @@ static void show_existing_frame_reset(AV1Decoder *const pbi, pbi->need_resync = 0; } - cm->cur_frame->intra_only = 1; - + // Note that the displayed frame must be valid for referencing in order to + // have been selected. if (cm->seq_params.frame_id_numbers_present_flag) { - /* If bitmask is set, update reference frame id values and - mark frames as valid for reference. - Note that the displayed frame be valid for referencing - in order to have been selected. 
- */ - int refresh_frame_flags = pbi->refresh_frame_flags; - int display_frame_id = cm->ref_frame_id[existing_frame_idx]; - for (int i = 0; i < REF_FRAMES; i++) { - if ((refresh_frame_flags >> i) & 1) { - cm->ref_frame_id[i] = display_frame_id; - cm->valid_for_referencing[i] = 1; - } - } + update_ref_frame_id(cm, cm->ref_frame_id[existing_frame_idx]); } cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; @@ -4749,8 +4813,7 @@ static void show_existing_frame_reset(AV1Decoder *const pbi, generate_next_ref_frame_map(pbi); // Reload the adapted CDFs from when we originally coded this keyframe - *cm->fc = - frame_bufs[cm->next_ref_frame_map[existing_frame_idx]].frame_context; + *cm->fc = cm->next_ref_frame_map[existing_frame_idx]->frame_context; } static INLINE void reset_frame_buffers(AV1_COMMON *cm) { @@ -4758,16 +4821,18 @@ static INLINE void reset_frame_buffers(AV1_COMMON *cm) { int i; // We have not stored any references to frame buffers in - // cm->next_ref_frame_map, so we can directly reset it to all -1. - memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map)); + // cm->next_ref_frame_map, so we can directly reset it to all NULL. + for (i = 0; i < REF_FRAMES; ++i) { + cm->next_ref_frame_map[i] = NULL; + } lock_buffer_pool(cm->buffer_pool); reset_ref_frame_map(cm); assert(cm->cur_frame->ref_count == 1); for (i = 0; i < FRAME_BUFFERS; ++i) { - // Reset all unreferenced frame buffers. We can also reset cm->new_fb_idx - // because we are the sole owner of cm->new_fb_idx. - if (frame_bufs[i].ref_count > 0 && i != cm->new_fb_idx) { + // Reset all unreferenced frame buffers. We can also reset cm->cur_frame + // because we are the sole owner of cm->cur_frame. 
+ if (frame_bufs[i].ref_count > 0 && &frame_bufs[i] != cm->cur_frame) { continue; } frame_bufs[i].order_hint = 0; @@ -4794,10 +4859,6 @@ static int read_uncompressed_header(AV1Decoder *pbi, } cm->last_frame_type = current_frame->frame_type; - cm->last_intra_only = current_frame->intra_only; - - // NOTE: By default all coded frames to be used as a reference - cm->is_reference_frame = 1; if (seq_params->reduced_still_picture_hdr) { cm->show_existing_frame = 0; @@ -4812,7 +4873,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->error_resilient_mode = 1; } else { cm->show_existing_frame = aom_rb_read_bit(rb); - cm->reset_decoder_state = 0; + pbi->reset_decoder_state = 0; if (cm->show_existing_frame) { if (pbi->sequence_header_changed) { @@ -4822,7 +4883,11 @@ static int read_uncompressed_header(AV1Decoder *pbi, } // Show an existing frame directly. const int existing_frame_idx = aom_rb_read_literal(rb, 3); - const int frame_to_show = cm->ref_frame_map[existing_frame_idx]; + RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx]; + if (frame_to_show == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer does not contain a decoded frame"); + } if (seq_params->decoder_model_info_present_flag && cm->timing_info.equal_picture_interval == 0) { av1_read_temporal_point_info(cm, rb); @@ -4838,42 +4903,36 @@ static int read_uncompressed_header(AV1Decoder *pbi, "Reference buffer frame ID mismatch"); } lock_buffer_pool(pool); - if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { - unlock_buffer_pool(pool); - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Buffer %d does not contain a decoded frame", - frame_to_show); - } + assert(frame_to_show->ref_count > 0); // cm->cur_frame should be the buffer referenced by the return value // of the get_free_fb() call in av1_receive_compressed_data(), and // generate_next_ref_frame_map() has not been called, so ref_count // should still be 1. 
assert(cm->cur_frame->ref_count == 1); - // ref_cnt_fb() decrements ref_count directly rather than call - // decrease_ref_count(). If cm->cur_frame->raw_frame_buffer - // has already been allocated, it will not be released by ref_cnt_fb()! + // assign_frame_buffer_p() decrements ref_count directly rather than + // call decrease_ref_count(). If cm->cur_frame->raw_frame_buffer has + // already been allocated, it will not be released by + // assign_frame_buffer_p()! assert(!cm->cur_frame->raw_frame_buffer.data); - assign_frame_buffer(frame_bufs, &cm->new_fb_idx, frame_to_show); - cm->cur_frame = &cm->buffer_pool->frame_bufs[cm->new_fb_idx]; - cm->reset_decoder_state = - frame_bufs[frame_to_show].frame_type == KEY_FRAME; + assign_frame_buffer_p(&cm->cur_frame, frame_to_show); + pbi->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME; unlock_buffer_pool(pool); cm->lf.filter_level[0] = 0; cm->lf.filter_level[1] = 0; cm->show_frame = 1; - if (!frame_bufs[frame_to_show].showable_frame) { + if (!frame_to_show->showable_frame) { aom_merge_corrupted_flag(&xd->corrupted, 1); } - if (cm->reset_decoder_state) frame_bufs[frame_to_show].showable_frame = 0; + if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0; - cm->film_grain_params = frame_bufs[frame_to_show].film_grain_params; + cm->film_grain_params = frame_to_show->film_grain_params; - if (cm->reset_decoder_state) { + if (pbi->reset_decoder_state) { show_existing_frame_reset(pbi, existing_frame_idx); } else { - pbi->refresh_frame_flags = 0; + current_frame->refresh_frame_flags = 0; } return 0; @@ -4908,7 +4967,6 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->showable_frame = aom_rb_read_bit(rb); } cm->cur_frame->showable_frame = cm->showable_frame; - current_frame->intra_only = current_frame->frame_type == INTRA_ONLY_FRAME; cm->error_resilient_mode = frame_is_sframe(cm) || (current_frame->frame_type == KEY_FRAME && cm->show_frame) @@ -4933,7 +4991,6 @@ static int 
read_uncompressed_header(AV1Decoder *pbi, cm->cur_frame_force_integer_mv = 0; } - cm->frame_refs_short_signaling = 0; int frame_size_override_flag = 0; cm->allow_intrabc = 0; cm->primary_ref_frame = PRIMARY_REF_NONE; @@ -5020,22 +5077,23 @@ static int read_uncompressed_header(AV1Decoder *pbi, } } if (current_frame->frame_type == KEY_FRAME) { - if (!cm->show_frame) // unshown keyframe (forward keyframe) - pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); - else // shown keyframe - pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1; + if (!cm->show_frame) { // unshown keyframe (forward keyframe) + current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); + } else { // shown keyframe + current_frame->refresh_frame_flags = (1 << REF_FRAMES) - 1; + } for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - cm->current_frame.frame_refs[i].buf = NULL; + cm->remapped_ref_idx[i] = INVALID_IDX; } if (pbi->need_resync) { reset_ref_frame_map(cm); pbi->need_resync = 0; } } else { - if (current_frame->intra_only) { - pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); - if (pbi->refresh_frame_flags == 0xFF) { + if (current_frame->frame_type == INTRA_ONLY_FRAME) { + current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES); + if (current_frame->refresh_frame_flags == 0xFF) { aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Intra only frames cannot have refresh flags 0xFF"); } @@ -5044,17 +5102,12 @@ static int read_uncompressed_header(AV1Decoder *pbi, pbi->need_resync = 0; } } else if (pbi->need_resync != 1) { /* Skip if need resync */ - pbi->refresh_frame_flags = + current_frame->refresh_frame_flags = frame_is_sframe(cm) ? 
0xFF : aom_rb_read_literal(rb, REF_FRAMES); - if (!pbi->refresh_frame_flags) { - // NOTE: "pbi->refresh_frame_flags == 0" indicates that the coded frame - // will not be used as a reference - cm->is_reference_frame = 0; - } } } - if (!frame_is_intra_only(cm) || pbi->refresh_frame_flags != 0xFF) { + if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xFF) { // Read all ref frame order hints if error_resilient_mode == 1 if (cm->error_resilient_mode && seq_params->order_hint_info.enable_order_hint) { @@ -5062,40 +5115,39 @@ static int read_uncompressed_header(AV1Decoder *pbi, // Read order hint from bit stream unsigned int order_hint = aom_rb_read_literal( rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); - // Get buffer index - int buf_idx = cm->ref_frame_map[ref_idx]; - assert(buf_idx < FRAME_BUFFERS); - if (buf_idx == -1 || order_hint != frame_bufs[buf_idx].order_hint) { - if (buf_idx >= 0) { + // Get buffer + RefCntBuffer *buf = cm->ref_frame_map[ref_idx]; + if (buf == NULL || order_hint != buf->order_hint) { + if (buf != NULL) { lock_buffer_pool(pool); - decrease_ref_count(buf_idx, frame_bufs, pool); + decrease_ref_count(buf, pool); unlock_buffer_pool(pool); } // If no corresponding buffer exists, allocate a new buffer with all // pixels set to neutral grey. 
- buf_idx = get_free_fb(cm); + int buf_idx = get_free_fb(cm); if (buf_idx == INVALID_IDX) { aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Unable to find free frame buffer"); } + buf = &frame_bufs[buf_idx]; lock_buffer_pool(pool); if (aom_realloc_frame_buffer( - &frame_bufs[buf_idx].buf, seq_params->max_frame_width, + &buf->buf, seq_params->max_frame_width, seq_params->max_frame_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, - &pool->frame_bufs[buf_idx].raw_frame_buffer, pool->get_fb_cb, - pool->cb_priv)) { - decrease_ref_count(buf_idx, frame_bufs, pool); + &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { + decrease_ref_count(buf, pool); unlock_buffer_pool(pool); aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } unlock_buffer_pool(pool); - set_planes_to_neutral_grey(seq_params, &frame_bufs[buf_idx].buf, 0); + set_planes_to_neutral_grey(seq_params, &buf->buf, 0); - cm->ref_frame_map[ref_idx] = buf_idx; - frame_bufs[buf_idx].order_hint = order_hint; + cm->ref_frame_map[ref_idx] = buf; + buf->order_hint = order_hint; } } } @@ -5111,7 +5163,7 @@ static int read_uncompressed_header(AV1Decoder *pbi, } else { cm->allow_ref_frame_mvs = 0; - if (current_frame->intra_only) { + if (current_frame->frame_type == INTRA_ONLY_FRAME) { cm->cur_frame->film_grain_params_present = seq_params->film_grain_params_present; setup_frame_size(cm, frame_size_override_flag, rb); @@ -5119,57 +5171,53 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->allow_intrabc = aom_rb_read_bit(rb); } else if (pbi->need_resync != 1) { /* Skip if need resync */ - + int frame_refs_short_signaling = 0; // Frame refs short signaling is off when error resilient mode is on. 
if (seq_params->order_hint_info.enable_order_hint) - cm->frame_refs_short_signaling = aom_rb_read_bit(rb); + frame_refs_short_signaling = aom_rb_read_bit(rb); - if (cm->frame_refs_short_signaling) { + if (frame_refs_short_signaling) { // == LAST_FRAME == const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); - const int lst_idx = cm->ref_frame_map[lst_ref]; + const RefCntBuffer *const lst_buf = cm->ref_frame_map[lst_ref]; // == GOLDEN_FRAME == const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); - const int gld_idx = cm->ref_frame_map[gld_ref]; + const RefCntBuffer *const gld_buf = cm->ref_frame_map[gld_ref]; // Most of the time, streams start with a keyframe. In that case, // ref_frame_map will have been filled in at that point and will not - // contain any -1's. However, streams are explicitly allowed to start + // contain any NULLs. However, streams are explicitly allowed to start // with an intra-only frame, so long as they don't then signal a // reference to a slot that hasn't been set yet. That's what we are // checking here. - if (lst_idx == -1) + if (lst_buf == NULL) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); - if (gld_idx == -1) + if (gld_buf == NULL) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); - av1_set_frame_refs(cm, lst_ref, gld_ref); + av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref); } for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { int ref = 0; - if (!cm->frame_refs_short_signaling) { + if (!frame_refs_short_signaling) { ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2); - const int idx = cm->ref_frame_map[ref]; // Most of the time, streams start with a keyframe. In that case, // ref_frame_map will have been filled in at that point and will not - // contain any -1's. However, streams are explicitly allowed to start + // contain any NULLs. 
However, streams are explicitly allowed to start // with an intra-only frame, so long as they don't then signal a // reference to a slot that hasn't been set yet. That's what we are // checking here. - if (idx == -1) + if (cm->ref_frame_map[ref] == NULL) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Inter frame requests nonexistent reference"); - - RefBuffer *const ref_frame = &cm->current_frame.frame_refs[i]; - ref_frame->buf = &frame_bufs[idx]; - ref_frame->map_idx = ref; + cm->remapped_ref_idx[i] = ref; } else { - ref = cm->current_frame.frame_refs[i].map_idx; + ref = cm->remapped_ref_idx[i]; } cm->ref_frame_sign_bias[LAST_FRAME + i] = 0; @@ -5206,26 +5254,29 @@ static int read_uncompressed_header(AV1Decoder *pbi, cm->switchable_motion_mode = aom_rb_read_bit(rb); } - cm->prev_frame = get_prev_frame(cm); + cm->prev_frame = get_primary_ref_frame_buf(cm); if (cm->primary_ref_frame != PRIMARY_REF_NONE && - cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) { + get_primary_ref_frame_buf(cm) == NULL) { aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Reference frame containing this frame's initial " "frame context is unavailable."); } - if (!current_frame->intra_only && pbi->need_resync != 1) { + if (!(current_frame->frame_type == INTRA_ONLY_FRAME) && + pbi->need_resync != 1) { if (frame_might_allow_ref_frame_mvs(cm)) cm->allow_ref_frame_mvs = aom_rb_read_bit(rb); else cm->allow_ref_frame_mvs = 0; - for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { - RefBuffer *const ref_buf = &cm->current_frame.frame_refs[i]; + for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { + const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i); + struct scale_factors *const ref_scale_factors = + get_ref_scale_factors(cm, i); av1_setup_scale_factors_for_frame( - &ref_buf->sf, ref_buf->buf->buf.y_crop_width, - ref_buf->buf->buf.y_crop_height, cm->width, cm->height); - if ((!av1_is_valid_scale(&ref_buf->sf))) + ref_scale_factors, ref_buf->buf.y_crop_width, + 
ref_buf->buf.y_crop_height, cm->width, cm->height); + if ((!av1_is_valid_scale(ref_scale_factors))) aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); } @@ -5236,20 +5287,10 @@ static int read_uncompressed_header(AV1Decoder *pbi, av1_setup_frame_sign_bias(cm); - cm->cur_frame->intra_only = - current_frame->frame_type == KEY_FRAME || current_frame->intra_only; cm->cur_frame->frame_type = current_frame->frame_type; if (seq_params->frame_id_numbers_present_flag) { - /* If bitmask is set, update reference frame id values and - mark frames as valid for reference */ - int refresh_frame_flags = pbi->refresh_frame_flags; - for (int i = 0; i < REF_FRAMES; i++) { - if ((refresh_frame_flags >> i) & 1) { - cm->ref_frame_id[i] = cm->current_frame_id; - cm->valid_for_referencing[i] = 1; - } - } + update_ref_frame_id(cm, cm->current_frame_id); } const int might_bwd_adapt = @@ -5297,6 +5338,11 @@ static int read_uncompressed_header(AV1Decoder *pbi, } read_tile_info(pbi, rb); + if (!is_min_tile_width_satisfied(cm)) { + aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, + "Minimum tile width requirement not satisfied"); + } + setup_quantization(cm, rb); xd->bd = (int)seq_params->bit_depth; @@ -5486,7 +5532,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, if (cm->show_existing_frame) { // showing a frame directly *p_data_end = data + uncomp_hdr_size; - if (cm->reset_decoder_state) { + if (pbi->reset_decoder_state) { // Use the default frame context values. 
*cm->fc = *cm->default_frame_context; if (!cm->fc->initialized) @@ -5498,8 +5544,6 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, cm->setup_mi(cm); - cm->current_frame_seg_map = cm->cur_frame->seg_map; - av1_setup_motion_field(cm); av1_setup_block_planes(xd, cm->seq_params.subsampling_x, @@ -5508,8 +5552,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi, // use the default frame context values *cm->fc = *cm->default_frame_context; } else { - *cm->fc = - cm->current_frame.frame_refs[cm->primary_ref_frame].buf->frame_context; + *cm->fc = get_primary_ref_frame_buf(cm)->frame_context; } if (!cm->fc->initialized) aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, @@ -5528,7 +5571,7 @@ static void setup_frame_info(AV1Decoder *pbi) { cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { av1_alloc_restoration_buffers(cm); } - const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0; + const int use_highbd = cm->seq_params.use_highbitdepth; const int buf_size = MC_TEMP_BUF_PELS << use_highbd; if (pbi->td.mc_buf_size != buf_size) { av1_free_mc_tmp_buf(&pbi->td); diff --git a/libaom/av1/decoder/decodemv.c b/libaom/av1/decoder/decodemv.c index 7a94717..2791f3a 100644 --- a/libaom/av1/decoder/decodemv.c +++ b/libaom/av1/decoder/decodemv.c @@ -299,7 +299,7 @@ static void set_segment_id(AV1_COMMON *cm, int mi_offset, int x_mis, int y_mis, for (int y = 0; y < y_mis; y++) for (int x = 0; x < x_mis; x++) - cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id; + cm->cur_frame->seg_map[mi_offset + y * cm->mi_cols + x] = segment_id; } static int read_intra_segment_id(AV1_COMMON *const cm, @@ -355,7 +355,7 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd, if (!seg->enabled) return 0; // Default for disabled segmentation if (!seg->update_map) { - copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map, + copy_segment_id(cm, cm->last_frame_seg_map, cm->cur_frame->seg_map, 
mi_offset, x_mis, y_mis); return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis); } @@ -364,7 +364,6 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd, if (preskip) { if (!seg->segid_preskip) return 0; } else { - if (seg->segid_preskip) return mbmi->segment_id; if (mbmi->skip) { if (seg->temporal_update) { mbmi->seg_id_predicted = 0; @@ -679,11 +678,10 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES]; int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES]; - int_mv global_mvs[REF_FRAMES]; av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count, - xd->ref_mv_stack, ref_mvs, global_mvs, mi_row, mi_col, - inter_mode_ctx); + xd->ref_mv_stack, ref_mvs, /*global_mvs=*/NULL, mi_row, + mi_col, inter_mode_ctx); int_mv nearestmv, nearmv; @@ -700,7 +698,8 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd, mi_col, bsize, r); if (!valid_dv) { // Intra bc motion vectors are not valid - signal corrupt frame - aom_merge_corrupted_flag(&xd->corrupted, 1); + aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME, + "Invalid intrabc dv"); } } } @@ -1271,9 +1270,9 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, const int is_compound = has_second_ref(mbmi); MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame); - int_mv global_mvs[REF_FRAMES]; av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack, - ref_mvs, global_mvs, mi_row, mi_col, inter_mode_ctx); + ref_mvs, /*global_mvs=*/NULL, mi_row, mi_col, + inter_mode_ctx); int mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame); mbmi->ref_mv_idx = 0; @@ -1388,9 +1387,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) { const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref]; - RefBuffer *ref_buf = &cm->current_frame.frame_refs[frame - LAST_FRAME]; - - 
xd->block_refs[ref] = ref_buf; + xd->block_ref_scale_factors[ref] = get_ref_scale_factors_const(cm, frame); } mbmi->motion_mode = SIMPLE_TRANSLATION; @@ -1419,13 +1416,16 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, } if (mbmi->comp_group_idx == 0) { - if (cm->seq_params.order_hint_info.enable_jnt_comp) { + if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) { const int comp_index_ctx = get_comp_index_context(cm, xd); mbmi->compound_idx = aom_read_symbol( r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR); + mbmi->interinter_comp.type = + mbmi->compound_idx ? COMPOUND_AVERAGE : COMPOUND_DISTWTD; } else { // Distance-weighted compound is disabled, so always use average mbmi->compound_idx = 1; + mbmi->interinter_comp.type = COMPOUND_AVERAGE; } } else { assert(cm->current_frame.reference_mode != SINGLE_REFERENCE && @@ -1436,8 +1436,9 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi, // compound_diffwtd, wedge if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) mbmi->interinter_comp.type = - 1 + aom_read_symbol(r, ec_ctx->compound_type_cdf[bsize], - COMPOUND_TYPES - 1, ACCT_STR); + COMPOUND_WEDGE + aom_read_symbol(r, + ec_ctx->compound_type_cdf[bsize], + MASKED_COMPOUND_TYPES, ACCT_STR); else mbmi->interinter_comp.type = COMPOUND_DIFFWTD; @@ -1502,7 +1503,8 @@ static void read_inter_frame_mode_info(AV1Decoder *const pbi, else mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r); - mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 0, r); + if (!cm->seg.segid_preskip) + mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 0, r); read_cdef(cm, r, xd, mi_col, mi_row); diff --git a/libaom/av1/decoder/decoder.c b/libaom/av1/decoder/decoder.c index 773305d..bff4b7a 100644 --- a/libaom/av1/decoder/decoder.c +++ b/libaom/av1/decoder/decoder.c @@ -100,15 +100,16 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) { aom_once(initialize_dec); // Initialize the references to not point to any frame 
buffers. - memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); - memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map)); + for (int i = 0; i < REF_FRAMES; i++) { + cm->ref_frame_map[i] = NULL; + cm->next_ref_frame_map[i] = NULL; + } cm->current_frame.frame_number = 0; pbi->decoding_first_frame = 1; pbi->common.buffer_pool = pool; cm->seq_params.bit_depth = AOM_BITS_8; - cm->dequant_bit_depth = AOM_BITS_8; cm->alloc_mi = av1_dec_alloc_mi; cm->free_mi = dec_free_mi; @@ -321,26 +322,26 @@ aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm, static void release_frame_buffers(AV1Decoder *pbi) { AV1_COMMON *const cm = &pbi->common; BufferPool *const pool = cm->buffer_pool; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + cm->cur_frame->buf.corrupted = 1; lock_buffer_pool(pool); // Release all the reference buffers in cm->next_ref_frame_map if the worker // thread is holding them. if (pbi->hold_ref_buf) { - int ref_index; - for (ref_index = 0; ref_index < REF_FRAMES; ++ref_index) { - const int new_idx = cm->next_ref_frame_map[ref_index]; - decrease_ref_count(new_idx, frame_bufs, pool); + for (int ref_index = 0; ref_index < REF_FRAMES; ++ref_index) { + decrease_ref_count(cm->next_ref_frame_map[ref_index], pool); + cm->next_ref_frame_map[ref_index] = NULL; } pbi->hold_ref_buf = 0; } // Release current frame. - decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); + decrease_ref_count(cm->cur_frame, pool); unlock_buffer_pool(pool); + cm->cur_frame = NULL; } // If any buffer updating is signaled it should be done here. -// Consumes a reference to cm->new_fb_idx. +// Consumes a reference to cm->cur_frame. // // This functions returns void. It reports failure by setting // cm->error.error_code. 
@@ -348,7 +349,6 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) { int ref_index = 0, mask; AV1_COMMON *const cm = &pbi->common; BufferPool *const pool = cm->buffer_pool; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; if (frame_decoded) { lock_buffer_pool(pool); @@ -358,58 +358,55 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) { if (!pbi->camera_frame_header_ready) { // If we are not holding reference buffers in cm->next_ref_frame_map, // assert that the following two for loops are no-ops. - assert(IMPLIES(!pbi->hold_ref_buf, pbi->refresh_frame_flags == 0)); assert(IMPLIES(!pbi->hold_ref_buf, - cm->show_existing_frame && !cm->reset_decoder_state)); + cm->current_frame.refresh_frame_flags == 0)); + assert(IMPLIES(!pbi->hold_ref_buf, + cm->show_existing_frame && !pbi->reset_decoder_state)); // The following two for loops need to release the reference stored in // cm->ref_frame_map[ref_index] before transferring the reference stored // in cm->next_ref_frame_map[ref_index] to cm->ref_frame_map[ref_index]. 
- for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - const int old_idx = cm->ref_frame_map[ref_index]; - decrease_ref_count(old_idx, frame_bufs, pool); + for (mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) { + decrease_ref_count(cm->ref_frame_map[ref_index], pool); cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; + cm->next_ref_frame_map[ref_index] = NULL; ++ref_index; } const int check_on_show_existing_frame = - !cm->show_existing_frame || cm->reset_decoder_state; + !cm->show_existing_frame || pbi->reset_decoder_state; for (; ref_index < REF_FRAMES && check_on_show_existing_frame; ++ref_index) { - const int old_idx = cm->ref_frame_map[ref_index]; - decrease_ref_count(old_idx, frame_bufs, pool); + decrease_ref_count(cm->ref_frame_map[ref_index], pool); cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; + cm->next_ref_frame_map[ref_index] = NULL; } } if (cm->show_existing_frame || cm->show_frame) { - YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf; if (pbi->output_all_layers) { // Append this frame to the output queue if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) { // We can't store the new frame anywhere, so drop it and return an // error - decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); - cm->cur_frame = NULL; + cm->cur_frame->buf.corrupted = 1; + decrease_ref_count(cm->cur_frame, pool); cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; } else { - pbi->output_frames[pbi->num_output_frames] = cur_frame; - pbi->output_frame_index[pbi->num_output_frames] = cm->new_fb_idx; + pbi->output_frames[pbi->num_output_frames] = cm->cur_frame; pbi->num_output_frames++; } } else { // Replace any existing output frame assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1); if (pbi->num_output_frames > 0) { - decrease_ref_count(pbi->output_frame_index[0], frame_bufs, pool); + decrease_ref_count(pbi->output_frames[0], pool); } - pbi->output_frames[0] = cur_frame; - pbi->output_frame_index[0] = 
cm->new_fb_idx; + pbi->output_frames[0] = cm->cur_frame; pbi->num_output_frames = 1; } } else { - decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); - cm->cur_frame = NULL; + decrease_ref_count(cm->cur_frame, pool); } unlock_buffer_pool(pool); @@ -420,17 +417,17 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) { assert(IMPLIES(!pbi->camera_frame_header_ready, !pbi->hold_ref_buf)); // Nothing was decoded, so just drop this frame buffer lock_buffer_pool(pool); - decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); - cm->cur_frame = NULL; + decrease_ref_count(cm->cur_frame, pool); unlock_buffer_pool(pool); } + cm->cur_frame = NULL; if (!pbi->camera_frame_header_ready) { pbi->hold_ref_buf = 0; // Invalidate these references until the next frame starts. for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) { - cm->current_frame.frame_refs[ref_index].buf = NULL; + cm->remapped_ref_idx[ref_index] = INVALID_IDX; } } } @@ -438,7 +435,6 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) { int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, const uint8_t **psource) { AV1_COMMON *volatile const cm = &pbi->common; - BufferPool *volatile const pool = cm->buffer_pool; const uint8_t *source = *psource; cm->error.error_code = AOM_CODEC_OK; cm->error.has_detail = 0; @@ -452,24 +448,15 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, // TODO(jkoleszar): Error concealment is undefined and non-normative // at this point, but if it becomes so, [0] may not always be the correct // thing to do here. - if (cm->current_frame.frame_refs[0].buf != NULL) { - cm->current_frame.frame_refs[0].buf->buf.corrupted = 1; - } + RefCntBuffer *ref_buf = get_ref_frame_buf(cm, LAST_FRAME); + if (ref_buf != NULL) ref_buf->buf.corrupted = 1; } - // Find a free buffer for the new frame, releasing the reference previously - // held. - - // Find a free frame buffer. Return error if can not find any. 
- cm->new_fb_idx = get_free_fb(cm); - if (cm->new_fb_idx == INVALID_IDX) { + if (assign_cur_frame_new_fb(cm) == NULL) { cm->error.error_code = AOM_CODEC_MEM_ERROR; return 1; } - // Assign a MV array to the frame buffer. - cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; - if (!pbi->camera_frame_header_ready) pbi->hold_ref_buf = 0; // The jmp_buf is valid only for the duration of the function that calls @@ -514,7 +501,7 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, cm->txb_count = 0; #endif - // Note: At this point, this function holds a reference to cm->new_fb_idx + // Note: At this point, this function holds a reference to cm->cur_frame // in the buffer pool. This reference is consumed by swap_frame_buffers(). swap_frame_buffers(pbi, frame_decoded); @@ -541,10 +528,6 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, } // Update progress in frame parallel decode. - cm->last_width = cm->width; - cm->last_height = cm->height; - cm->last_tile_cols = cm->tile_cols; - cm->last_tile_rows = cm->tile_rows; cm->error.setjmp = 0; return 0; @@ -553,11 +536,9 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size, // Get the frame at a particular index in the output queue int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, aom_film_grain_t **grain_params) { - RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs; - if (index >= pbi->num_output_frames) return -1; - *sd = pbi->output_frames[index]; - *grain_params = &frame_bufs[pbi->output_frame_index[index]].film_grain_params; + *sd = &pbi->output_frames[index]->buf; + *grain_params = &pbi->output_frames[index]->film_grain_params; aom_clear_system_state(); return 0; } @@ -567,6 +548,6 @@ int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd, int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) { if (pbi->num_output_frames == 0) return -1; - *frame = *pbi->output_frames[pbi->num_output_frames - 1]; + 
*frame = pbi->output_frames[pbi->num_output_frames - 1]->buf; return 0; } diff --git a/libaom/av1/decoder/decoder.h b/libaom/av1/decoder/decoder.h index 6ca28e7..685c931 100644 --- a/libaom/av1/decoder/decoder.h +++ b/libaom/av1/decoder/decoder.h @@ -48,11 +48,9 @@ typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm, MACROBLOCKD *const xd); typedef struct ThreadData { - aom_reader *bit_reader; DECLARE_ALIGNED(32, MACROBLOCKD, xd); - /* dqcoeff are shared by all the planes. So planes must be decoded serially */ - DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_TX_SQUARE]); CB_BUFFER cb_buffer_base; + aom_reader *bit_reader; uint8_t *mc_buf[2]; int32_t mc_buf_size; int mc_buf_use_highbd; // Boolean: whether the byte pointers stored in @@ -163,8 +161,6 @@ typedef struct AV1Decoder { DECLARE_ALIGNED(32, AV1_COMMON, common); - int refresh_frame_flags; - AVxWorker lf_worker; AV1LfSync lf_row_sync; AV1LrSync lr_row_sync; @@ -190,8 +186,7 @@ typedef struct AV1Decoder { // Note: The saved buffers are released at the start of the next time the // application calls aom_codec_decode(). int output_all_layers; - YV12_BUFFER_CONFIG *output_frames[MAX_NUM_SPATIAL_LAYERS]; - int output_frame_index[MAX_NUM_SPATIAL_LAYERS]; // Buffer pool indices + RefCntBuffer *output_frames[MAX_NUM_SPATIAL_LAYERS]; size_t num_output_frames; // How many frames are queued up so far? // In order to properly support random-access decoding, we need @@ -205,6 +200,7 @@ typedef struct AV1Decoder { int need_resync; // wait for key/intra-only frame. int hold_ref_buf; // Boolean: whether we are holding reference buffers in // common.next_ref_frame_map. 
+ int reset_decoder_state; int tile_size_bytes; int tile_col_size_bytes; @@ -283,23 +279,22 @@ void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync); void av1_dec_free_cb_buf(AV1Decoder *pbi); -static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, +static INLINE void decrease_ref_count(RefCntBuffer *const buf, BufferPool *const pool) { - if (idx >= 0) { - --frame_bufs[idx].ref_count; + if (buf != NULL) { + --buf->ref_count; // Reference counts should never become negative. If this assertion fails, // there is a bug in our reference count management. - assert(frame_bufs[idx].ref_count >= 0); + assert(buf->ref_count >= 0); // A worker may only get a free framebuffer index when calling get_free_fb. // But the raw frame buffer is not set up until we finish decoding header. // So if any error happens during decoding header, frame_bufs[idx] will not // have a valid raw frame buffer. - if (frame_bufs[idx].ref_count == 0 && - frame_bufs[idx].raw_frame_buffer.data) { - pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer); - frame_bufs[idx].raw_frame_buffer.data = NULL; - frame_bufs[idx].raw_frame_buffer.size = 0; - frame_bufs[idx].raw_frame_buffer.priv = NULL; + if (buf->ref_count == 0 && buf->raw_frame_buffer.data) { + pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer); + buf->raw_frame_buffer.data = NULL; + buf->raw_frame_buffer.size = 0; + buf->raw_frame_buffer.priv = NULL; } } } diff --git a/libaom/av1/decoder/decodetxb.c b/libaom/av1/decoder/decodetxb.c index f3ef2d5..223e32e 100644 --- a/libaom/av1/decoder/decodetxb.c +++ b/libaom/av1/decoder/decodetxb.c @@ -136,6 +136,15 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, uint16_t *const max_scan_line = &(eob_data->max_scan_line); *max_scan_line = 0; *eob = 0; + +#if CONFIG_INSPECTION + if (plane == 0) { + const int txk_type_idx = + av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col); + mbmi->tx_skip[txk_type_idx] = all_zero; 
+ } +#endif + if (all_zero) { *max_scan_line = 0; if (plane == 0) { @@ -146,9 +155,6 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, return 0; } - memset(levels_buf, 0, - sizeof(*levels_buf) * - ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END)); if (plane == AOM_PLANE_Y) { // only y plane's tx_type is transmitted av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r); @@ -214,23 +220,30 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, break; } - if (k_eob_offset_bits[eob_pt] > 0) { + const int eob_offset_bits = k_eob_offset_bits[eob_pt]; + if (eob_offset_bits > 0) { const int eob_ctx = eob_pt - 3; int bit = aom_read_symbol( r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR); if (bit) { - eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1)); + eob_extra += (1 << (eob_offset_bits - 1)); } - for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) { + for (int i = 1; i < eob_offset_bits; i++) { bit = aom_read_bit(r, ACCT_STR); if (bit) { - eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1 - i)); + eob_extra += (1 << (eob_offset_bits - 1 - i)); } } } *eob = rec_eob_pos(eob_pt, eob_extra); + if (*eob > 1) { + memset(levels_buf, 0, + sizeof(*levels_buf) * + ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END)); + } + { // Read the non-zero coefficient with scan index eob-1 // TODO(angiebird): Put this into a function @@ -242,12 +255,10 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx]; int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1; if (level > NUM_BASE_LEVELS) { - const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + const int br_ctx = get_br_ctx_eob(pos, bwl, tx_class); + cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx]; for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { - const int k = aom_read_symbol( - r, - 
ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx], - BR_CDF_SIZE, ACCT_STR); + const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR); level += k; if (k < BR_CDF_SIZE - 1) break; } @@ -269,13 +280,6 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd, } } - int16_t num_zero_coeffs = 0; - for (int c = 0; c < *eob; ++c) { - const int pos = scan[c]; - num_zero_coeffs = AOMMAX(num_zero_coeffs, pos); - } - memset(tcoeffs, 0, (num_zero_coeffs + 1) * sizeof(tcoeffs[0])); - for (int c = 0; c < *eob; ++c) { const int pos = scan[c]; uint8_t sign; diff --git a/libaom/av1/decoder/inspection.c b/libaom/av1/decoder/inspection.c index 17a9f98..eeed1d3 100644 --- a/libaom/av1/decoder/inspection.c +++ b/libaom/av1/decoder/inspection.c @@ -33,7 +33,7 @@ void ifd_clear(insp_frame_data *fd) { /* TODO(negge) This function may be called by more than one thread when using a multi-threaded decoder and this may cause a data race. */ -int ifd_inspect(insp_frame_data *fd, void *decoder) { +int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform) { struct AV1Decoder *pbi = (struct AV1Decoder *)decoder; AV1_COMMON *const cm = &pbi->common; @@ -82,6 +82,9 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) { mi->ref_frame[1] = mbmi->ref_frame[1]; // Prediction Mode mi->mode = mbmi->mode; + mi->intrabc = (int16_t)mbmi->use_intrabc; + mi->palette = (int16_t)mbmi->palette_mode_info.palette_size[0]; + mi->uv_palette = (int16_t)mbmi->palette_mode_info.palette_size[1]; // Prediction Mode for Chromatic planes if (mi->mode < INTRA_MODES) { mi->uv_mode = mbmi->uv_mode; @@ -111,13 +114,19 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) { else mi->tx_size = mbmi->tx_size; + if (skip_not_transform && mi->skip) mi->tx_size = -1; + mi->tx_type = (mi->skip ? 
0 : mbmi->txk_type[av1_get_txk_type_index(bsize, r, c)]); + if (skip_not_transform && + (mi->skip || mbmi->tx_skip[av1_get_txk_type_index(bsize, r, c)])) + mi->tx_type = -1; mi->cdef_level = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] / CDEF_SEC_STRENGTHS; mi->cdef_strength = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] % CDEF_SEC_STRENGTHS; + mi->cdef_strength += mi->cdef_strength == 3; if (mbmi->uv_mode == UV_CFL_PRED) { mi->cfl_alpha_idx = mbmi->cfl_alpha_idx; diff --git a/libaom/av1/decoder/inspection.h b/libaom/av1/decoder/inspection.h index 0c6f3ad..b963f6a 100644 --- a/libaom/av1/decoder/inspection.h +++ b/libaom/av1/decoder/inspection.h @@ -52,6 +52,9 @@ struct insp_mi_data { int16_t current_qindex; int16_t compound_type; int16_t motion_mode; + int16_t intrabc; + int16_t palette; + int16_t uv_palette; }; typedef struct insp_frame_data insp_frame_data; @@ -80,7 +83,7 @@ struct insp_frame_data { void ifd_init(insp_frame_data *fd, int frame_width, int frame_height); void ifd_clear(insp_frame_data *fd); -int ifd_inspect(insp_frame_data *fd, void *decoder); +int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform); #ifdef __cplusplus } // extern "C" diff --git a/libaom/av1/decoder/obu.c b/libaom/av1/decoder/obu.c index d892dc4..aaea572 100644 --- a/libaom/av1/decoder/obu.c +++ b/libaom/av1/decoder/obu.c @@ -26,7 +26,7 @@ #include "av1/decoder/obu.h" // Picture prediction structures (0-12 are predefined) in scalability metadata. 
-typedef enum { +enum { SCALABILITY_L1T2 = 0, SCALABILITY_L1T3 = 1, SCALABILITY_L2T1 = 2, @@ -42,7 +42,7 @@ typedef enum { SCALABILITY_S2T2h = 12, SCALABILITY_S2T3h = 13, SCALABILITY_SS = 14 -} SCALABILITY_STRUCTURES; +} UENUM1BYTE(SCALABILITY_STRUCTURES); aom_codec_err_t aom_get_num_layers_from_operating_point_idc( int operating_point_idc, unsigned int *number_spatial_layers, @@ -98,12 +98,10 @@ static int byte_alignment(AV1_COMMON *const cm, static uint32_t read_temporal_delimiter_obu() { return 0; } // Returns a boolean that indicates success. -static int read_bitstream_level(BitstreamLevel *bl, +static int read_bitstream_level(AV1_LEVEL *seq_level_idx, struct aom_read_bit_buffer *rb) { - const uint8_t seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); - if (!is_valid_seq_level_idx(seq_level_idx)) return 0; - bl->major = (seq_level_idx >> LEVEL_MINOR_BITS) + LEVEL_MAJOR_MIN; - bl->minor = seq_level_idx & ((1 << LEVEL_MINOR_BITS) - 1); + *seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); + if (!is_valid_seq_level_idx(*seq_level_idx)) return 0; return 1; } @@ -151,7 +149,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, seq_params->display_model_info_present_flag = 0; seq_params->operating_points_cnt_minus_1 = 0; seq_params->operating_point_idc[0] = 0; - if (!read_bitstream_level(&seq_params->level[0], rb)) { + if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) { cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } @@ -175,13 +173,13 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) { seq_params->operating_point_idc[i] = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS); - if (!read_bitstream_level(&seq_params->level[i], rb)) { + if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) { cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM; return 0; } // This is the seq_level_idx[i] > 7 check in the spec. 
seq_level_idx 7 // is equivalent to level 3.3. - if (seq_params->level[i].major > 3) + if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0) seq_params->tier[i] = aom_rb_read_bit(rb); else seq_params->tier[i] = 0; @@ -195,10 +193,9 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi, if (cm->timing_info_present && (cm->timing_info.equal_picture_interval || cm->op_params[i].decoder_model_param_present_flag)) { - cm->op_params[i].bitrate = max_level_bitrate( - seq_params->profile, - major_minor_to_seq_level_idx(seq_params->level[i]), - seq_params->tier[i]); + cm->op_params[i].bitrate = + max_level_bitrate(seq_params->profile, seq_params->seq_level_idx[i], + seq_params->tier[i]); // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass // the check if (cm->op_params[i].bitrate == 0) @@ -364,8 +361,10 @@ static void alloc_tile_list_buffer(AV1Decoder *pbi) { // image format 4:2:0, the output frame of U plane and V plane is 1/4 of the // output frame. AV1_COMMON *const cm = &pbi->common; - const int tile_width_in_pixels = cm->tile_width * MI_SIZE; - const int tile_height_in_pixels = cm->tile_height * MI_SIZE; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_width_in_pixels = tile_width * MI_SIZE; + const int tile_height_in_pixels = tile_height * MI_SIZE; const int output_frame_width = (pbi->output_frame_width_in_tiles_minus_1 + 1) * tile_width_in_pixels; const int output_frame_height = @@ -415,8 +414,10 @@ static void yv12_tile_copy(const YV12_BUFFER_CONFIG *src, int hstart1, static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, int tile_idx) { AV1_COMMON *const cm = &pbi->common; - const int tile_width_in_pixels = cm->tile_width * MI_SIZE; - const int tile_height_in_pixels = cm->tile_height * MI_SIZE; + int tile_width, tile_height; + av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + const int tile_width_in_pixels = tile_width * MI_SIZE; + const int tile_height_in_pixels 
= tile_height * MI_SIZE; const int ssy = cm->seq_params.subsampling_y; const int ssx = cm->seq_params.subsampling_x; const int num_planes = av1_num_planes(cm); diff --git a/libaom/av1/encoder/aq_cyclicrefresh.c b/libaom/av1/encoder/aq_cyclicrefresh.c index 8d96b23..bfb2a90 100644 --- a/libaom/av1/encoder/aq_cyclicrefresh.c +++ b/libaom/av1/encoder/aq_cyclicrefresh.c @@ -31,9 +31,9 @@ struct CYCLIC_REFRESH { // excess of the cycle time, i.e., in the case of all zero motion, block // will be refreshed every (100/percent_refresh + time_for_refresh) frames. int time_for_refresh; - // Target number of (8x8) blocks that are set for delta-q. + // Target number of (4x4) blocks that are set for delta-q. int target_num_seg_blocks; - // Actual number of (8x8) blocks that were applied delta-q. + // Actual number of (4x4) blocks that were applied delta-q. int actual_num_seg1_blocks; int actual_num_seg2_blocks; // RD mult. parameters for segment 1. @@ -55,6 +55,8 @@ struct CYCLIC_REFRESH { int rate_boost_fac; double low_content_avg; int qindex_delta[3]; + double weight_segment; + int apply_cyclic_refresh; }; CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) { @@ -87,27 +89,6 @@ void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) { } } -// Check if we should turn off cyclic refresh based on bitrate condition. -static int apply_cyclic_refresh_bitrate(const AV1_COMMON *cm, - const RATE_CONTROL *rc) { - // Turn off cyclic refresh if bits available per frame is not sufficiently - // larger than bit cost of segmentation. Segment map bit cost should scale - // with number of seg blocks, so compare available bits to number of blocks. - // Average bits available per frame = avg_frame_bandwidth - // Number of (8x8) blocks in frame = mi_rows * mi_cols; - const float factor = 0.25; - const int number_blocks = cm->mi_rows * cm->mi_cols; - // The condition below corresponds to turning off at target bitrates: - // (at 30fps), ~12kbps for CIF, 36kbps for VGA, 100kps for HD/720p. 
- // Also turn off at very small frame sizes, to avoid too large fraction of - // superblocks to be refreshed per frame. Threshold below is less than QCIF. - if (rc->avg_frame_bandwidth < factor * number_blocks || - number_blocks / 64 < 5) - return 0; - else - return 1; -} - // Check if this coding block, of size bsize, should be considered for refresh // (lower-qp coding). Decision can be based on various factors, such as // size of the coding block (i.e., below min_block size rejected), coding @@ -158,11 +139,11 @@ int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi, const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int estimated_bits; int mbs = cm->MBs; - int num8x8bl = mbs << 2; + int num4x4bl = mbs << 4; // Weight for non-base segments: use actual number of blocks refreshed in - // previous/just encoded frame. Note number of blocks here is in 8x8 units. - double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl; - double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl; + // previous/just encoded frame. Note number of blocks here is in 4x4 units. + double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl; + double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl; // Take segment weighted average for estimated bits. estimated_bits = (int)((1.0 - weight_segment1 - weight_segment2) * @@ -190,14 +171,14 @@ int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i, const AV1_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int bits_per_mb; - int num8x8bl = cm->MBs << 2; + int num4x4bl = cm->MBs << 4; // Weight for segment prior to encoding: take the average of the target // number for the frame to be encoded and the actual from the previous frame. double weight_segment = (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) >> 1) / - num8x8bl; + num4x4bl; // Compute delta-q corresponding to qindex i. 
int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); // Take segment weighted average for bits per mb. @@ -264,21 +245,6 @@ void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi, int map_offset = block_index + y * cm->mi_cols + x; cr->map[map_offset] = new_map_value; cpi->segmentation_map[map_offset] = mbmi->segment_id; - // Inter skip blocks were clearly not coded at the current qindex, so - // don't update the map for them. For cases where motion is non-zero or - // the reference frame isn't the previous frame, the previous value in - // the map for this spatial location is not entirely correct. - if ((!is_inter_block(mbmi) || !skip) && - mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) { - cr->last_coded_q_map[map_offset] = clamp( - cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ); - } else if (is_inter_block(mbmi) && skip && - mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) { - cr->last_coded_q_map[map_offset] = - AOMMIN(clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id], - 0, MAXQ), - cr->last_coded_q_map[map_offset]); - } } } @@ -315,73 +281,6 @@ void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) { rc->baseline_gf_interval = 40; } -// Update some encoding stats (from the just encoded frame). If this frame's -// background has high motion, refresh the golden frame. Otherwise, if the -// golden reference is to be updated check if we should NOT update the golden -// ref. 
-void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) { - AV1_COMMON *const cm = &cpi->common; - CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; - int mi_row, mi_col; - double fraction_low = 0.0; - int low_content_frame = 0; - - MB_MODE_INFO **mi; - RATE_CONTROL *const rc = &cpi->rc; - const int rows = cm->mi_rows, cols = cm->mi_cols; - int cnt1 = 0, cnt2 = 0; - int force_gf_refresh = 0; - - for (mi_row = 0; mi_row < rows; mi_row++) { - mi = cm->mi_grid_visible + mi_row * cm->mi_stride; - - for (mi_col = 0; mi_col < cols; mi_col++) { - int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0 - ? mi[0]->mv[0].as_mv.row - : -1 * mi[0]->mv[0].as_mv.row; - int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0 - ? mi[0]->mv[0].as_mv.col - : -1 * mi[0]->mv[0].as_mv.col; - - // Calculate the motion of the background. - if (abs_mvr <= 16 && abs_mvc <= 16) { - cnt1++; - if (abs_mvr == 0 && abs_mvc == 0) cnt2++; - } - mi++; - - // Accumulate low_content_frame. - if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++; - } - } - - // For video conference clips, if the background has high motion in current - // frame because of the camera movement, set this frame as the golden frame. - // Use 70% and 5% as the thresholds for golden frame refreshing. - if (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1) { - av1_cyclic_refresh_set_golden_update(cpi); - rc->frames_till_gf_update_due = rc->baseline_gf_interval; - - if (rc->frames_till_gf_update_due > rc->frames_to_key) - rc->frames_till_gf_update_due = rc->frames_to_key; - cpi->refresh_golden_frame = 1; - force_gf_refresh = 1; - } - - fraction_low = (double)low_content_frame / (rows * cols); - // Update average. 
- cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4; - if (!force_gf_refresh && cpi->refresh_golden_frame == 1) { - // Don't update golden reference if the amount of low_content for the - // current encoded frame is small, or if the recursive average of the - // low_content over the update interval window falls below threshold. - if (fraction_low < 0.8 || cr->low_content_avg < 0.7) - cpi->refresh_golden_frame = 0; - // Reset for next internal. - cr->low_content_avg = fraction_low; - } -} - // Update the segmentation map, and related quantities: cyclic refresh map, // refresh sb_index, and target number of blocks to be refreshed. // The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to @@ -458,26 +357,70 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) { // Set cyclic refresh parameters. void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) { + // TODO(marpan): Parameters need to be tuned. const RATE_CONTROL *const rc = &cpi->rc; const AV1_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + int num4x4bl = cm->MBs << 4; + int target_refresh = 0; + double weight_segment_target = 0; + double weight_segment = 0; + int qp_thresh = AOMMIN(20, rc->best_quality << 1); + cr->apply_cyclic_refresh = 1; + if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf) || + rc->avg_frame_qindex[INTER_FRAME] < qp_thresh) { + cr->apply_cyclic_refresh = 0; + return; + } cr->percent_refresh = 10; - cr->max_qdelta_perc = 50; + cr->max_qdelta_perc = 60; cr->time_for_refresh = 0; + cr->motion_thresh = 32; + cr->rate_boost_fac = 15; // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4) // periods of the refresh cycle, after a key frame. - if (rc->frames_since_key < 4 * cr->percent_refresh) + // Account for larger interval on base layer for temporal layers. 
+ if (cr->percent_refresh > 0 && + rc->frames_since_key < 400 / cr->percent_refresh) { cr->rate_ratio_qdelta = 3.0; - else + } else { cr->rate_ratio_qdelta = 2.0; - // Adjust some parameters for low resolutions at low bitrates. - if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) { - cr->motion_thresh = 4; + } + // Adjust some parameters for low resolutions. + if (cm->width <= 352 && cm->height <= 288) { + if (rc->avg_frame_bandwidth < 3000) { + cr->motion_thresh = 16; + cr->rate_boost_fac = 13; + } else { + cr->max_qdelta_perc = 70; + cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.5); + } + } + if (cpi->oxcf.rc_mode == AOM_VBR) { + // To be adjusted for VBR mode, e.g., based on gf period and boost. + // For now use smaller qp-delta (than CBR), no second boosted seg, and + // turn-off (no refresh) on golden refresh (since it's already boosted). + cr->percent_refresh = 10; + cr->rate_ratio_qdelta = 1.5; cr->rate_boost_fac = 10; - } else { - cr->motion_thresh = 32; - cr->rate_boost_fac = 17; + if (cpi->refresh_golden_frame == 1) { + cr->percent_refresh = 0; + cr->rate_ratio_qdelta = 1.0; + } } + // Weight for segment prior to encoding: take the average of the target + // number for the frame to be encoded and the actual from the previous frame. + // Use the target if its less. To be used for setting the base qp for the + // frame in vp9_rc_regulate_q. + target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + weight_segment_target = (double)(target_refresh) / num4x4bl; + weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks + + cr->actual_num_seg2_blocks) >> + 1) / + num4x4bl; + if (weight_segment_target < 7 * weight_segment / 8) + weight_segment = weight_segment_target; + cr->weight_segment = weight_segment; } // Setup cyclic background refresh: set delta q and segmentation map. 
@@ -486,7 +429,6 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; struct segmentation *const seg = &cm->seg; - const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc); int resolution_change = cm->prev_frame && (cm->width != cm->prev_frame->width || cm->height != cm->prev_frame->height); @@ -498,8 +440,7 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) { return; } if (cm->current_frame.frame_number == 0) cr->low_content_avg = 0.0; - // Don't apply refresh on key frame or enhancement layer frames. - if (!apply_cyclic_refresh || cm->current_frame.frame_type == KEY_FRAME) { + if (!cr->apply_cyclic_refresh) { // Set segmentation map to 0 and disable. unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); diff --git a/libaom/av1/encoder/aq_cyclicrefresh.h b/libaom/av1/encoder/aq_cyclicrefresh.h index b457819..ddabae6 100644 --- a/libaom/av1/encoder/aq_cyclicrefresh.h +++ b/libaom/av1/encoder/aq_cyclicrefresh.h @@ -54,19 +54,12 @@ void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi, int mi_col, BLOCK_SIZE bsize, int64_t rate, int64_t dist, int skip); -// Update the segmentation map, and related quantities: cyclic refresh map, -// refresh sb_index, and target number of blocks to be refreshed. -void av1_cyclic_refresh_update__map(struct AV1_COMP *const cpi); - // Update the actual number of blocks that were applied the segment delta q. void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi); // Set golden frame update interval, for 1 pass CBR mode. void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi); -// Check if we should not update golden reference, based on past refresh stats. -void av1_cyclic_refresh_check_golden_update(struct AV1_COMP *const cpi); - // Set/update global/frame level refresh parameters. 
void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi); diff --git a/libaom/av1/encoder/aq_variance.c b/libaom/av1/encoder/aq_variance.c index cfd7610..d572948 100644 --- a/libaom/av1/encoder/aq_variance.c +++ b/libaom/av1/encoder/aq_variance.c @@ -121,7 +121,7 @@ int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { for (i = 0; i < bh; i += 4) { for (j = 0; j < bw; j += 4) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { var += log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf( x->plane[0].src.buf + i * x->plane[0].src.stride + j, @@ -153,7 +153,7 @@ static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) { uint8_t *buf = x->plane[0].src.buf; const int bw = MI_SIZE * mi_size_wide[bs]; const int bh = MI_SIZE * mi_size_high[bs]; - int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + const int hbd = is_cur_buf_hbd(xd); int var = 0; for (int r = 0; r < bh; r += 8) diff --git a/libaom/av1/encoder/av1_multi_thread.c b/libaom/av1/encoder/av1_multi_thread.c index a0c556e..1260c7a 100644 --- a/libaom/av1/encoder/av1_multi_thread.c +++ b/libaom/av1/encoder/av1_multi_thread.c @@ -35,6 +35,14 @@ void av1_row_mt_mem_alloc(AV1_COMP *cpi, int max_sb_rows) { &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + tile_col]; av1_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_sb_rows); + if (cpi->oxcf.cdf_update_mode) + CHECK_MEM_ERROR( + cm, this_tile->row_ctx, + (FRAME_CONTEXT *)aom_memalign( + 16, + AOMMAX(1, (av1_get_sb_cols_in_tile(cm, this_tile->tile_info) - + 1)) * + sizeof(*this_tile->row_ctx))); } } } @@ -53,6 +61,7 @@ void av1_row_mt_mem_dealloc(AV1_COMP *cpi) { &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + tile_col]; av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); + if (cpi->oxcf.cdf_update_mode) aom_free(this_tile->row_ctx); } } multi_thread_ctxt->allocated_sb_rows = 0; diff --git a/libaom/av1/encoder/av1_quantize.c b/libaom/av1/encoder/av1_quantize.c 
index 21ab4db..ff1342c 100644 --- a/libaom/av1/encoder/av1_quantize.c +++ b/libaom/av1/encoder/av1_quantize.c @@ -41,47 +41,37 @@ static void quantize_fp_helper_c( const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, int log_scale) { int i, eob = -1; + const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale), + ROUND_POWER_OF_TWO(round_ptr[1], log_scale) }; // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; + (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); if (qm_ptr == NULL && iqm_ptr == NULL) { - const int rounding0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale); - { // rc == 0 - const int coeff = coeff_ptr[0]; - const int coeff_sign = (coeff >> 31); - int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if ((abs_coeff << (1 + log_scale)) >= (int32_t)(dequant_ptr[0])) { - abs_coeff = clamp64(abs_coeff + rounding0, INT16_MIN, INT16_MAX); - const int tmp32 = (int)((abs_coeff * quant_ptr[0]) >> (16 - log_scale)); - if (tmp32) { - qcoeff_ptr[0] = (tmp32 ^ coeff_sign) - coeff_sign; - const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[0]) >> log_scale; - dqcoeff_ptr[0] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; - eob = 0; - } - } - } - const int rounding1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale); - const int32_t thresh1 = (int32_t)(dequant_ptr[1]); - for (i = 1; i < n_coeffs; i++) { - const int coeff = coeff_ptr[i]; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]); + const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - if ((abs_coeff << (1 + log_scale)) >= thresh1) { - abs_coeff = clamp64(abs_coeff + rounding1, INT16_MIN, INT16_MAX); - const int tmp32 = (int)((abs_coeff * quant_ptr[1]) >> (16 - 
log_scale)); + int tmp32 = 0; + if ((abs_coeff << (1 + log_scale)) >= thresh) { + abs_coeff = + clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX); + tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale)); if (tmp32) { - qcoeff_ptr[i] = (tmp32 ^ coeff_sign) - coeff_sign; - const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[1]) >> log_scale; - dqcoeff_ptr[i] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; - eob = AOMMAX(iscan[i], eob); + qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign; + const tran_low_t abs_dqcoeff = + (tmp32 * dequant_ptr[rc != 0]) >> log_scale; + dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign; } } + if (tmp32) eob = i; } } else { // Quantization pass: All coefficients with index >= zero_flag are @@ -99,7 +89,7 @@ static void quantize_fp_helper_c( int tmp32 = 0; if (abs_coeff * wt >= (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) { - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale); + abs_coeff += rounding[rc != 0]; abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX); tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >> (16 - log_scale + AOM_QM_BITS)); @@ -275,32 +265,65 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const SCAN_ORDER *sc, const QUANT_PARAM *qparam) { const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; - if (qm_ptr != NULL && iqm_ptr != NULL) { - quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, - dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, - sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + if (qparam->use_quant_b_adapt) { + // TODO(sarahparker) These quantize_b optimizations need SIMD + // implementations + if (qm_ptr != NULL && iqm_ptr != NULL) { + quantize_b_adaptive_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, 
qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX, + p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, + p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); + break; + case 1: + aom_quantize_b_32x32_adaptive( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_quantize_b_64x64_adaptive_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } } else { - switch (qparam->log_scale) { - case 0: - aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, - dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, - sc->iscan); - break; - case 1: - aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, - dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, - sc->iscan); - break; - case 2: - aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, - dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, - sc->iscan); - break; - default: assert(0); + if (qm_ptr != NULL && iqm_ptr != NULL) { + quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 1: + aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, 
p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + case 2: + aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan); + break; + default: assert(0); + } } } } @@ -391,41 +414,81 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr, const QUANT_PARAM *qparam) { const qm_val_t *qm_ptr = qparam->qmatrix; const qm_val_t *iqm_ptr = qparam->iqmatrix; - if (qm_ptr != NULL && iqm_ptr != NULL) { - highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, - dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, - sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + if (qparam->use_quant_b_adapt) { + if (qm_ptr != NULL && iqm_ptr != NULL) { + highbd_quantize_b_adaptive_helper_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, + sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + if (LIKELY(n_coeffs >= 8)) { + aom_highbd_quantize_b_adaptive_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + } else { + // TODO(luoyi): Need SIMD (e.g. 
sse2) for smaller block size + // quantization + aom_highbd_quantize_b_adaptive_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + } + break; + case 1: + aom_highbd_quantize_b_32x32_adaptive_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_highbd_quantize_b_64x64_adaptive_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } + } } else { - switch (qparam->log_scale) { - case 0: - if (LIKELY(n_coeffs >= 8)) { - aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, - p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, - dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, - sc->iscan); - } else { - // TODO(luoyi): Need SIMD (e.g. 
sse2) for smaller block size - // quantization - aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX, + if (qm_ptr != NULL && iqm_ptr != NULL) { + highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, + p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, + dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, + sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale); + } else { + switch (qparam->log_scale) { + case 0: + if (LIKELY(n_coeffs >= 8)) { + aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan, sc->iscan); - } - break; - case 1: - aom_highbd_quantize_b_32x32( - coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, - p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, - eob_ptr, sc->scan, sc->iscan); - break; - case 2: - aom_highbd_quantize_b_64x64( - coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, - p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, - eob_ptr, sc->scan, sc->iscan); - break; - default: assert(0); + } else { + // TODO(luoyi): Need SIMD (e.g. 
sse2) for smaller block size + // quantization + aom_highbd_quantize_b_c( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + } + break; + case 1: + aom_highbd_quantize_b_32x32( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + case 2: + aom_highbd_quantize_b_64x64( + coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX, + p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, + eob_ptr, sc->scan, sc->iscan); + break; + default: assert(0); + } } } } diff --git a/libaom/av1/encoder/av1_quantize.h b/libaom/av1/encoder/av1_quantize.h index fb53881..6419265 100644 --- a/libaom/av1/encoder/av1_quantize.h +++ b/libaom/av1/encoder/av1_quantize.h @@ -22,11 +22,15 @@ extern "C" { #endif +#define EOB_FACTOR 325 +#define SKIP_EOB_FACTOR_ADJUST 200 + typedef struct QUANT_PARAM { int log_scale; TX_SIZE tx_size; const qm_val_t *qmatrix; const qm_val_t *iqmatrix; + int use_quant_b_adapt; } QUANT_PARAM; typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, diff --git a/libaom/av1/encoder/bitstream.c b/libaom/av1/encoder/bitstream.c index df79b79..cbac2b2 100644 --- a/libaom/av1/encoder/bitstream.c +++ b/libaom/av1/encoder/bitstream.c @@ -145,7 +145,7 @@ static void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w, static void write_tx_size_vartx(MACROBLOCKD *xd, const MB_MODE_INFO *mbmi, TX_SIZE tx_size, int depth, int blk_row, int blk_col, aom_writer *w) { - FRAME_CONTEXT *ec_ctx = xd->tile_ctx; + FRAME_CONTEXT *const ec_ctx = xd->tile_ctx; const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0); const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0); @@ -369,10 +369,18 @@ static void pack_txb_tokens(aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x, blk_col)]; if (tx_size == 
plane_tx_size || plane) { - tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); - const uint16_t eob = x->mbmi_ext->eobs[plane][block]; - TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block], - x->mbmi_ext->dc_sign_ctx[plane][block] }; + const int txb_offset = + x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + tran_low_t *tcoeff_txb = + x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset; + uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset; + uint8_t *txb_skip_ctx_txb = + x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset; + int *dc_sign_ctx_txb = + x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset; + tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block); + const uint16_t eob = eob_txb[block]; + TXB_CTX txb_ctx = { txb_skip_ctx_txb[block], dc_sign_ctx_txb[block] }; av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, eob, &txb_ctx); #if CONFIG_RD_DEBUG @@ -460,7 +468,7 @@ static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi, // changing from lossless to lossy. 
assert(is_inter_block(mbmi) || !cpi->has_lossless_segment); - set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row, + set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type, mi_row, mi_col, pred); set_spatial_segment_id(cm, cpi->segmentation_map, mbmi->sb_type, mi_row, mi_col, pred); @@ -473,7 +481,7 @@ static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi, av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1); aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num]; aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS); - set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row, + set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type, mi_row, mi_col, mbmi->segment_id); } @@ -627,7 +635,7 @@ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd, av1_extract_interp_filter(mbmi->interp_filters, dir); aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx], SWITCHABLE_FILTERS); - ++cpi->interp_filter_selected[0][filter]; + ++cm->cur_frame->interp_filter_selected[filter]; if (cm->seq_params.enable_dual_filter == 0) return; } } @@ -867,14 +875,7 @@ static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx, static void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, aom_writer *w, int skip, int mi_col, int mi_row) { - if (cm->coded_lossless || cm->allow_intrabc) { - // Initialize to indicate no CDEF for safety. 
- cm->cdef_info.cdef_bits = 0; - cm->cdef_info.cdef_strengths[0] = 0; - cm->cdef_info.nb_cdef_strengths = 1; - cm->cdef_info.cdef_uv_strengths[0] = 0; - return; - } + if (cm->coded_lossless || cm->allow_intrabc) return; const int m = ~((1 << (6 - MI_SIZE_LOG2)) - 1); const MB_MODE_INFO *mbmi = @@ -903,7 +904,7 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w, int mi_row, int mi_col, int skip, int preskip) { MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - const MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO *const mbmi = xd->mi[0]; AV1_COMMON *const cm = &cpi->common; if (seg->update_map) { @@ -913,7 +914,7 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w, if (seg->segid_preskip) return; if (skip) { write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 1); - if (seg->temporal_update) ((MB_MODE_INFO *)mbmi)->seg_id_predicted = 0; + if (seg->temporal_update) mbmi->seg_id_predicted = 0; return; } } @@ -925,7 +926,7 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w, write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0); } if (pred_flag) { - set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, + set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type, mi_row, mi_col, mbmi->segment_id); } } else { @@ -1134,7 +1135,7 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w); // First write idx to indicate current compound inter prediction mode group - // Group A (0): jnt_comp, compound_average + // Group A (0): dist_wtd_comp, compound_average // Group B (1): interintra, compound_diffwtd, wedge if (has_second_ref(mbmi)) { const int masked_compound_used = is_any_masked_compound_used(bsize) && @@ -1152,7 +1153,7 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, if (mbmi->compound_idx) assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE); - if 
(cm->seq_params.order_hint_info.enable_jnt_comp) { + if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) { const int comp_index_ctx = get_comp_index_context(cm, xd); aom_write_symbol(w, mbmi->compound_idx, ec_ctx->compound_index_cdf[comp_index_ctx], 2); @@ -1169,9 +1170,9 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, mbmi->interinter_comp.type == COMPOUND_DIFFWTD); if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) - aom_write_symbol(w, mbmi->interinter_comp.type - 1, + aom_write_symbol(w, mbmi->interinter_comp.type - COMPOUND_WEDGE, ec_ctx->compound_type_cdf[bsize], - COMPOUND_TYPES - 1); + MASKED_COMPOUND_TYPES); if (mbmi->interinter_comp.type == COMPOUND_WEDGE) { assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize)); @@ -1185,7 +1186,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row, } } } - write_mb_interp_filter(cpi, xd, w); } } @@ -1237,13 +1237,14 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd, } #if CONFIG_RD_DEBUG -static void dump_mode_info(MODE_INFO *mi) { +static void dump_mode_info(MB_MODE_INFO *mi) { printf("\nmi->mi_row == %d\n", mi->mi_row); printf("&& mi->mi_col == %d\n", mi->mi_col); printf("&& mi->sb_type == %d\n", mi->sb_type); printf("&& mi->tx_size == %d\n", mi->tx_size); printf("&& mi->mode == %d\n", mi->mode); } + static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, int plane) { if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) { @@ -1274,30 +1275,28 @@ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats, #if ENC_MISMATCH_DEBUG static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) { AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); - const MB_MODE_INFO *const *mbmi = xd->mi[0]; + const MB_MODE_INFO *const *mbmi = + *(cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col)); + const MB_MODE_INFO_EXT 
*const *mbmi_ext = + cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); if (is_inter_block(mbmi)) { #define FRAME_TO_CHECK 11 if (cm->current_frame.frame_number == FRAME_TO_CHECK && cm->show_frame == 1) { const BLOCK_SIZE bsize = mbmi->sb_type; - int_mv mv[2]; - int is_comp_ref = has_second_ref(mbmi); - int ref; + int_mv mv[2] = { 0 }; + const int is_comp_ref = has_second_ref(mbmi); - for (ref = 0; ref < 1 + is_comp_ref; ++ref) + for (int ref = 0; ref < 1 + is_comp_ref; ++ref) mv[ref].as_mv = mbmi->mv[ref].as_mv; if (!is_comp_ref) { mv[1].as_int = 0; } - MACROBLOCK *const x = &cpi->td.mb; - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const int16_t mode_ctx = - is_comp_ref ? mbmi_ext->compound_mode_context[mbmi->ref_frame[0]] + is_comp_ref ? 0 : av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame); @@ -1479,14 +1478,16 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile, row, col, &block[plane], plane); } } + } #if CONFIG_RD_DEBUG + for (plane = 0; plane < num_planes && is_inter_block(mbmi); ++plane) { if (mbmi->sb_type >= BLOCK_8X8 && rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) { - dump_mode_info(m); + dump_mode_info(mbmi); assert(0); } -#endif // CONFIG_RD_DEBUG } +#endif // CONFIG_RD_DEBUG } } } @@ -1875,8 +1876,8 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm, assert(!cm->all_lossless); const int wiener_win = (plane > 0) ? 
WIENER_WIN_CHROMA : WIENER_WIN; - WienerInfo *wiener_info = xd->wiener_info + plane; - SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane; + WienerInfo *ref_wiener_info = &xd->wiener_info[plane]; + SgrprojInfo *ref_sgrproj_info = &xd->sgrproj_info[plane]; RestorationType unit_rtype = rui->restoration_type; if (frame_rtype == RESTORE_SWITCHABLE) { @@ -1887,10 +1888,10 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm, #endif switch (unit_rtype) { case RESTORE_WIENER: - write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w); + write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); break; case RESTORE_SGRPROJ: - write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w); + write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); break; default: assert(unit_rtype == RESTORE_NONE); break; } @@ -1901,7 +1902,7 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm, ++counts->wiener_restore[unit_rtype != RESTORE_NONE]; #endif if (unit_rtype != RESTORE_NONE) { - write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w); + write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w); } } else if (frame_rtype == RESTORE_SGRPROJ) { aom_write_symbol(w, unit_rtype != RESTORE_NONE, @@ -1910,7 +1911,7 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm, ++counts->sgrproj_restore[unit_rtype != RESTORE_NONE]; #endif if (unit_rtype != RESTORE_NONE) { - write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w); + write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w); } } } @@ -1941,13 +1942,9 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { aom_wb_write_bit(wb, lf->mode_ref_delta_update); if (lf->mode_ref_delta_update) { - const int prime_idx = cm->primary_ref_frame; - const RefCntBuffer *const buf = - prime_idx == PRIMARY_REF_NONE - ? 
NULL - : cm->current_frame.frame_refs[prime_idx].buf; + const RefCntBuffer *buf = get_primary_ref_frame_buf(cm); int8_t last_ref_deltas[REF_FRAMES]; - if (prime_idx == PRIMARY_REF_NONE || buf == NULL) { + if (buf == NULL) { av1_set_default_ref_deltas(last_ref_deltas); } else { memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES); @@ -1960,7 +1957,7 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { } int8_t last_mode_deltas[MAX_MODE_LF_DELTAS]; - if (prime_idx == PRIMARY_REF_NONE || buf == NULL) { + if (buf == NULL) { av1_set_default_mode_deltas(last_mode_deltas); } else { memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS); @@ -2076,15 +2073,6 @@ static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd, } } -static void write_tx_mode(AV1_COMMON *cm, TX_MODE *mode, - struct aom_write_bit_buffer *wb) { - if (cm->coded_lossless) { - *mode = ONLY_4X4; - return; - } - aom_wb_write_bit(wb, *mode == TX_MODE_SELECT); -} - static void write_frame_interp_filter(InterpFilter filter, struct aom_write_bit_buffer *wb) { aom_wb_write_bit(wb, filter == SWITCHABLE); @@ -2092,29 +2080,6 @@ static void write_frame_interp_filter(InterpFilter filter, aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS); } -static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) { - if (cm->interp_filter == SWITCHABLE) { - // Check to see if only one of the filters is actually used - int count[SWITCHABLE_FILTERS]; - int i, j, c = 0; - for (i = 0; i < SWITCHABLE_FILTERS; ++i) { - count[i] = 0; - for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) - count[i] += counts->switchable_interp[j][i]; - c += (count[i] > 0); - } - if (c == 1) { - // Only one filter is used. 
So set the filter at frame level - for (i = 0; i < SWITCHABLE_FILTERS; ++i) { - if (count[i]) { - if (i == EIGHTTAP_REGULAR) cm->interp_filter = i; - break; - } - } - } - } -} - // Same function as write_uniform but writing to uncompresses header wb static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) { const int l = get_unsigned_bits(n); @@ -2212,63 +2177,12 @@ static void write_ext_tile_info(const AV1_COMMON *const cm, } } -static int get_refresh_mask(AV1_COMP *cpi) { - if ((cpi->common.current_frame.frame_type == KEY_FRAME && - cpi->common.show_frame) || - frame_is_sframe(&cpi->common)) - return 0xFF; - - int refresh_mask = 0; - - // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be - // notified to get LAST3_FRAME refreshed and then the virtual indexes for all - // the 3 LAST reference frames will be updated accordingly, i.e.: - // (1) The original virtual index for LAST3_FRAME will become the new virtual - // index for LAST_FRAME; and - // (2) The original virtual indexes for LAST_FRAME and LAST2_FRAME will be - // shifted and become the new virtual indexes for LAST2_FRAME and - // LAST3_FRAME. - refresh_mask |= - (cpi->refresh_last_frame << get_ref_frame_map_idx(cpi, LAST3_FRAME)); - -#if USE_SYMM_MULTI_LAYER - const int bwd_ref_frame = - (cpi->new_bwdref_update_rule == 1) ? EXTREF_FRAME : BWDREF_FRAME; -#else - const int bwd_ref_frame = BWDREF_FRAME; -#endif - refresh_mask |= - (cpi->refresh_bwd_ref_frame << get_ref_frame_map_idx(cpi, bwd_ref_frame)); - - refresh_mask |= (cpi->refresh_alt2_ref_frame - << get_ref_frame_map_idx(cpi, ALTREF2_FRAME)); - - if (av1_preserve_existing_gf(cpi)) { - // We have decided to preserve the previously existing golden frame as our - // new ARF frame. However, in the short term we leave it in the GF slot and, - // if we're updating the GF with the current decoded frame, we save it - // instead to the ARF slot. 
- // Later, in the function av1_encoder.c:av1_update_reference_frames() we - // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it - // there so that it can be done outside of the recode loop. - // Note: This is highly specific to the use of ARF as a forward reference, - // and this needs to be generalized as other uses are implemented - // (like RTC/temporal scalability). - - if (cpi->preserve_arf_as_gld) { - return refresh_mask; - } else { - return refresh_mask | (cpi->refresh_golden_frame - << get_ref_frame_map_idx(cpi, ALTREF_FRAME)); - } - } else { - const int arf_idx = get_ref_frame_map_idx(cpi, ALTREF_FRAME); - return refresh_mask | - (cpi->refresh_golden_frame - << get_ref_frame_map_idx(cpi, GOLDEN_FRAME)) | - (cpi->refresh_alt_ref_frame << arf_idx); - } -} +// Stores the location and size of a tile's data in the bitstream. Used for +// later identifying identical tiles +typedef struct TileBufferEnc { + uint8_t *data; + size_t size; +} TileBufferEnc; static INLINE int find_identical_tile( const int tile_row, const int tile_col, @@ -2289,18 +2203,18 @@ static INLINE int find_identical_tile( int col_offset = candidate_offset[0].col; int row = tile_row - row_offset; int col = tile_col - col_offset; - uint8_t tile_hdr; const uint8_t *tile_data; TileBufferEnc *candidate; if (row < 0 || col < 0) continue; - tile_hdr = *(tile_buffers[row][col].data); + const uint32_t tile_hdr = mem_get_le32(tile_buffers[row][col].data); - // Read out tcm bit - if ((tile_hdr >> 7) == 1) { - // The candidate is a copy tile itself - row_offset += tile_hdr & 0x7f; + // Read out tile-copy-mode bit: + if ((tile_hdr >> 31) == 1) { + // The candidate is a copy tile itself: the offset is stored in bits + // 30 through 24 inclusive. 
+ row_offset += (tile_hdr >> 24) & 0x7f; row = tile_row - row_offset; } @@ -2370,14 +2284,13 @@ static void write_frame_size(const AV1_COMMON *cm, int frame_size_override, write_render_size(cm, wb); } -static void write_frame_size_with_refs(AV1_COMP *cpi, +static void write_frame_size_with_refs(const AV1_COMMON *const cm, struct aom_write_bit_buffer *wb) { - AV1_COMMON *const cm = &cpi->common; int found = 0; MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame); + const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame); if (cfg != NULL) { found = cm->superres_upscaled_width == cfg->y_crop_width && @@ -2539,34 +2452,27 @@ static void write_tu_pts_info(AV1_COMMON *const cm, cm->buffer_model.frame_presentation_time_length); } -static void write_film_grain_params(AV1_COMP *cpi, +static void write_film_grain_params(const AV1_COMP *const cpi, struct aom_write_bit_buffer *wb) { - AV1_COMMON *const cm = &cpi->common; - aom_film_grain_t *pars = &cm->film_grain_params; - - cm->cur_frame->film_grain_params = *pars; + const AV1_COMMON *const cm = &cpi->common; + const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params; aom_wb_write_bit(wb, pars->apply_grain); if (!pars->apply_grain) return; aom_wb_write_literal(wb, pars->random_seed, 16); - pars->random_seed += 3381; // Changing random seed for film grain - if (!pars->random_seed) // Random seed should not be zero - pars->random_seed += 7391; if (cm->current_frame.frame_type == INTER_FRAME) aom_wb_write_bit(wb, pars->update_parameters); - else - pars->update_parameters = 1; + if (!pars->update_parameters) { - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - int ref_frame, ref_idx, buf_idx; + int ref_frame, ref_idx; for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) { - ref_idx = get_ref_frame_map_idx(cpi, ref_frame); + ref_idx = get_ref_frame_map_idx(cm, 
ref_frame); assert(ref_idx != INVALID_IDX); - buf_idx = cm->ref_frame_map[ref_idx]; - if (frame_bufs[buf_idx].film_grain_params_present && - memcmp(pars, &frame_bufs[buf_idx].film_grain_params, sizeof(*pars))) { + const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx]; + if (buf->film_grain_params_present && + av1_check_grain_params_equiv(pars, &buf->film_grain_params)) { break; } } @@ -2582,16 +2488,16 @@ static void write_film_grain_params(AV1_COMP *cpi, aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8); } - if (!cm->seq_params.monochrome) + if (!cm->seq_params.monochrome) { aom_wb_write_bit(wb, pars->chroma_scaling_from_luma); - else - pars->chroma_scaling_from_luma = 0; // for monochrome override to 0 + } else { + assert(!pars->chroma_scaling_from_luma); + } if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma || ((cm->seq_params.subsampling_x == 1) && (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) { - pars->num_cb_points = 0; - pars->num_cr_points = 0; + assert(pars->num_cb_points == 0 && pars->num_cr_points == 0); } else { aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10 for (int i = 0; i < pars->num_cb_points; i++) { @@ -2651,7 +2557,7 @@ static void write_film_grain_params(AV1_COMP *cpi, aom_wb_write_bit(wb, pars->clip_to_restricted_range); } -static void write_sb_size(SequenceHeader *seq_params, +static void write_sb_size(const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { (void)seq_params; (void)wb; @@ -2662,41 +2568,16 @@ static void write_sb_size(SequenceHeader *seq_params, aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0); } -static void write_sequence_header(AV1_COMP *cpi, +static void write_sequence_header(const SequenceHeader *const seq_params, struct aom_write_bit_buffer *wb) { - AV1_COMMON *const cm = &cpi->common; - SequenceHeader *seq_params = &cm->seq_params; - - int max_frame_width = cpi->oxcf.forced_max_frame_width - ? 
cpi->oxcf.forced_max_frame_width - : cpi->oxcf.width; - int max_frame_height = cpi->oxcf.forced_max_frame_height - ? cpi->oxcf.forced_max_frame_height - : cpi->oxcf.height; - // max((int)ceil(log2(max_frame_width)), 1) - const int num_bits_width = - (max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1; - // max((int)ceil(log2(max_frame_height)), 1) - const int num_bits_height = - (max_frame_height > 1) ? get_msb(max_frame_height - 1) + 1 : 1; - assert(num_bits_width <= 16); - assert(num_bits_height <= 16); - - seq_params->num_bits_width = num_bits_width; - seq_params->num_bits_height = num_bits_height; - seq_params->max_frame_width = max_frame_width; - seq_params->max_frame_height = max_frame_height; - - aom_wb_write_literal(wb, num_bits_width - 1, 4); - aom_wb_write_literal(wb, num_bits_height - 1, 4); - aom_wb_write_literal(wb, max_frame_width - 1, num_bits_width); - aom_wb_write_literal(wb, max_frame_height - 1, num_bits_height); - - /* Placeholder for actually writing to the bitstream */ - if (!seq_params->reduced_still_picture_hdr) { - seq_params->frame_id_length = FRAME_ID_LENGTH; - seq_params->delta_frame_id_length = DELTA_FRAME_ID_LENGTH; + aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4); + aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4); + aom_wb_write_literal(wb, seq_params->max_frame_width - 1, + seq_params->num_bits_width); + aom_wb_write_literal(wb, seq_params->max_frame_height - 1, + seq_params->num_bits_height); + if (!seq_params->reduced_still_picture_hdr) { aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag); if (seq_params->frame_id_numbers_present_flag) { // We must always have delta_frame_id_length < frame_id_length, @@ -2724,7 +2605,7 @@ static void write_sequence_header(AV1_COMP *cpi, aom_wb_write_bit(wb, seq_params->order_hint_info.enable_order_hint); if (seq_params->order_hint_info.enable_order_hint) { - aom_wb_write_bit(wb, seq_params->order_hint_info.enable_jnt_comp); + aom_wb_write_bit(wb, 
seq_params->order_hint_info.enable_dist_wtd_comp); aom_wb_write_bit(wb, seq_params->order_hint_info.enable_ref_frame_mvs); } if (seq_params->force_screen_content_tools == 2) { @@ -2821,7 +2702,7 @@ static void write_global_motion(AV1_COMP *cpi, // does not work currently and causes mismatches when resize is on. // Fix it before turning the optimization back on. /* - YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_buffer(cpi, frame); + YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cpi, frame); if (cpi->source->y_crop_width == ref_buf->y_crop_width && cpi->source->y_crop_height == ref_buf->y_crop_height) { write_global_motion_params(&cm->global_motion[frame], @@ -2842,78 +2723,72 @@ static void write_global_motion(AV1_COMP *cpi, } } -static void check_frame_refs_short_signaling(AV1_COMP *const cpi) { - AV1_COMMON *const cm = &cpi->common; - if (!cm->frame_refs_short_signaling) return; - +static int check_frame_refs_short_signaling(AV1_COMMON *const cm) { // Check whether all references are distinct frames. - int buf_markers[FRAME_BUFFERS] = { 0 }; + const RefCntBuffer *seen_bufs[FRAME_BUFFERS] = { NULL }; + int num_refs = 0; for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); - if (buf_idx != INVALID_IDX) { - assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS); - buf_markers[buf_idx] = 1; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + int seen = 0; + for (int i = 0; i < num_refs; i++) { + if (seen_bufs[i] == buf) { + seen = 1; + break; + } + } + if (!seen) seen_bufs[num_refs++] = buf; } } - int num_refs = 0; - for (int buf_idx = 0; buf_idx < FRAME_BUFFERS; ++buf_idx) { - num_refs += buf_markers[buf_idx]; - } - // We only turn on frame_refs_short_signaling when all references are // distinct. if (num_refs < INTER_REFS_PER_FRAME) { // It indicates that there exist more than one reference frame pointing to // the same reference buffer, i.e. 
two or more references are duplicate. - cm->frame_refs_short_signaling = 0; - return; + return 0; } // Check whether the encoder side ref frame choices are aligned with that to // be derived at the decoder side. - RefBuffer frame_refs_copy[INTER_REFS_PER_FRAME]; + int remapped_ref_idx_decoder[REF_FRAMES]; - // Backup the frame refs info - memcpy(frame_refs_copy, cm->current_frame.frame_refs, - INTER_REFS_PER_FRAME * sizeof(RefBuffer)); - - const int lst_map_idx = get_ref_frame_map_idx(cpi, LAST_FRAME); - const int gld_map_idx = get_ref_frame_map_idx(cpi, GOLDEN_FRAME); + const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); // Set up the frame refs mapping indexes according to the // frame_refs_short_signaling policy. - av1_set_frame_refs(cm, lst_map_idx, gld_map_idx); + av1_set_frame_refs(cm, remapped_ref_idx_decoder, lst_map_idx, gld_map_idx); // We only turn on frame_refs_short_signaling when the encoder side decision // on ref frames is identical to that at the decoder side. + int frame_refs_short_signaling = 1; for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) { // Compare the buffer index between two reference frames indexed // respectively by the encoder and the decoder side decisions. 
- if (cm->current_frame.frame_refs[ref_idx].buf != - frame_refs_copy[ref_idx].buf) { - cm->frame_refs_short_signaling = 0; + RefCntBuffer *ref_frame_buf_new = NULL; + if (remapped_ref_idx_decoder[ref_idx] != INVALID_IDX) { + ref_frame_buf_new = cm->ref_frame_map[remapped_ref_idx_decoder[ref_idx]]; + } + if (get_ref_frame_buf(cm, LAST_FRAME + ref_idx) != ref_frame_buf_new) { + frame_refs_short_signaling = 0; break; } } #if 0 // For debug printf("\nFrame=%d: \n", cm->current_frame.frame_number); - printf("***frame_refs_short_signaling=%d\n", cm->frame_refs_short_signaling); + printf("***frame_refs_short_signaling=%d\n", frame_refs_short_signaling); for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - printf("enc_ref(map_idx=%d, buf_idx=%d)=%d, vs. " + printf("enc_ref(map_idx=%d)=%d, vs. " "dec_ref(map_idx=%d)=%d\n", - get_ref_frame_map_idx(cpi, ref_frame), - get_ref_frame_buf_idx(cpi, ref_frame), ref_frame, - cm->current_frame.frame_refs[ref_frame - LAST_FRAME].map_idx, + get_ref_frame_map_idx(cm, ref_frame), ref_frame, + cm->remapped_ref_idx[ref_frame - LAST_FRAME], ref_frame); } #endif // 0 - // Restore the frame refs info if frame_refs_short_signaling is off. - if (!cm->frame_refs_short_signaling) - memcpy(cm->current_frame.frame_refs, frame_refs_copy, - INTER_REFS_PER_FRAME * sizeof(RefBuffer)); + return frame_refs_short_signaling; } // New function based on HLS R18 @@ -2925,10 +2800,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; CurrentFrame *const current_frame = &cm->current_frame; - // NOTE: By default all coded frames to be used as a reference - cm->is_reference_frame = 1; - current_frame->frame_type = - current_frame->intra_only ? 
INTRA_ONLY_FRAME : current_frame->frame_type; + current_frame->frame_refs_short_signaling = 0; if (seq_params->still_picture) { assert(cm->show_existing_frame == 0); @@ -2937,17 +2809,6 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, } if (!seq_params->reduced_still_picture_hdr) { if (encode_show_existing_frame(cm)) { - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show]; - - if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { - aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "Buffer %d does not contain a reconstructed frame", - frame_to_show); - } - assign_frame_buffer(frame_bufs, &cm->new_fb_idx, frame_to_show); - cm->cur_frame = &frame_bufs[cm->new_fb_idx]; - aom_wb_write_bit(wb, 1); // show_existing_frame aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3); @@ -2960,14 +2821,6 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show]; aom_wb_write_literal(wb, display_frame_id, frame_id_len); } - - if (cm->reset_decoder_state && - frame_bufs[frame_to_show].frame_type != KEY_FRAME) { - aom_internal_error( - &cm->error, AOM_CODEC_UNSUP_BITSTREAM, - "show_existing_frame to reset state on KEY_FRAME only"); - } - return; } else { aom_wb_write_bit(wb, 0); // show_existing_frame @@ -3008,29 +2861,28 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, assert(cm->cur_frame_force_integer_mv == 0); } - cm->invalid_delta_frame_id_minus_1 = 0; int frame_size_override_flag = 0; - cm->frame_refs_short_signaling = 0; if (seq_params->reduced_still_picture_hdr) { - assert(cm->width == seq_params->max_frame_width && - cm->height == seq_params->max_frame_height); + assert(cm->superres_upscaled_width == seq_params->max_frame_width && + cm->superres_upscaled_height == seq_params->max_frame_height); } else { if (seq_params->frame_id_numbers_present_flag) { int frame_id_len = 
seq_params->frame_id_length; aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len); } - if (cm->width > seq_params->max_frame_width || - cm->height > seq_params->max_frame_height) { + if (cm->superres_upscaled_width > seq_params->max_frame_width || + cm->superres_upscaled_height > seq_params->max_frame_height) { aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, "Frame dimensions are larger than the maximum values"); } frame_size_override_flag = - frame_is_sframe(cm) ? 1 - : (cm->width != seq_params->max_frame_width || - cm->height != seq_params->max_frame_height); + frame_is_sframe(cm) + ? 1 + : (cm->superres_upscaled_width != seq_params->max_frame_width || + cm->superres_upscaled_height != seq_params->max_frame_height); if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag); if (seq_params->order_hint_info.enable_order_hint) @@ -3069,70 +2921,21 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, } } } - cpi->refresh_frame_mask = get_refresh_mask(cpi); - if (current_frame->frame_type == KEY_FRAME) { - if (!cm->show_frame) { // unshown keyframe (forward keyframe) - aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); - } else { - assert(cpi->refresh_frame_mask == 0xFF); - } - } else { - if (current_frame->frame_type == INTRA_ONLY_FRAME) { - assert(cpi->refresh_frame_mask != 0xFF); - int updated_fb = -1; - for (int i = 0; i < REF_FRAMES; i++) { - // If more than one frame is refreshed, it doesn't matter which one - // we pick, so pick the first. 
- if (cpi->refresh_frame_mask & (1 << i)) { - updated_fb = i; - break; - } - } - assert(updated_fb >= 0); - cm->fb_of_context_type[cm->frame_context_idx] = updated_fb; - aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); - } else if (current_frame->frame_type == INTER_FRAME || - frame_is_sframe(cm)) { - if (current_frame->frame_type == INTER_FRAME) { - aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES); - } else { - assert(frame_is_sframe(cm) && cpi->refresh_frame_mask == 0xFF); - } - int updated_fb = -1; - for (int i = 0; i < REF_FRAMES; i++) { - // If more than one frame is refreshed, it doesn't matter which one - // we pick, so pick the first. - if (cpi->refresh_frame_mask & (1 << i)) { - updated_fb = i; - break; - } - } - // large scale tile sometimes won't refresh any fbs - if (updated_fb >= 0) { - cm->fb_of_context_type[cm->frame_context_idx] = updated_fb; - } - if (!cpi->refresh_frame_mask) { - // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame - // will not be used as a reference - cm->is_reference_frame = 0; - } - } - } + // Shown keyframes and switch-frames automatically refreshes all reference + // frames. For all other frame types, we need to write refresh_frame_flags. 
+ if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) || + current_frame->frame_type == INTER_FRAME || + current_frame->frame_type == INTRA_ONLY_FRAME) + aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES); - if (!frame_is_intra_only(cm) || cpi->refresh_frame_mask != 0xFF) { + if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) { // Write all ref frame order hints if error_resilient_mode == 1 if (cm->error_resilient_mode && seq_params->order_hint_info.enable_order_hint) { - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) { - // Get buffer index - const int buf_idx = cm->ref_frame_map[ref_idx]; - assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS); - - // Write order hint to bit stream aom_wb_write_literal( - wb, frame_bufs[buf_idx].order_hint, + wb, cm->ref_frame_map[ref_idx]->order_hint, seq_params->order_hint_info.order_hint_bits_minus_1 + 1); } } @@ -3143,8 +2946,6 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, assert(!av1_superres_scaled(cm) || !cm->allow_intrabc); if (cm->allow_screen_content_tools && !av1_superres_scaled(cm)) aom_wb_write_bit(wb, cm->allow_intrabc); - // all eight fbs are refreshed, pick one that will live long enough - cm->fb_of_context_type[REGULAR_FRAME] = 0; } else { if (current_frame->frame_type == INTRA_ONLY_FRAME) { write_frame_size(cm, frame_size_override_flag, wb); @@ -3159,36 +2960,37 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, // automatically. 
#define FRAME_REFS_SHORT_SIGNALING 0 #if FRAME_REFS_SHORT_SIGNALING - cm->frame_refs_short_signaling = + current_frame->frame_refs_short_signaling = seq_params->order_hint_info.enable_order_hint; #endif // FRAME_REFS_SHORT_SIGNALING - if (cm->frame_refs_short_signaling) { + if (current_frame->frame_refs_short_signaling) { // NOTE(zoeliu@google.com): // An example solution for encoder-side implementation on frame refs // short signaling, which is only turned on when the encoder side // decision on ref frames is identical to that at the decoder side. - check_frame_refs_short_signaling(cpi); + current_frame->frame_refs_short_signaling = + check_frame_refs_short_signaling(cm); } if (seq_params->order_hint_info.enable_order_hint) - aom_wb_write_bit(wb, cm->frame_refs_short_signaling); + aom_wb_write_bit(wb, current_frame->frame_refs_short_signaling); - if (cm->frame_refs_short_signaling) { - const int lst_ref = get_ref_frame_map_idx(cpi, LAST_FRAME); + if (current_frame->frame_refs_short_signaling) { + const int lst_ref = get_ref_frame_map_idx(cm, LAST_FRAME); aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2); - const int gld_ref = get_ref_frame_map_idx(cpi, GOLDEN_FRAME); + const int gld_ref = get_ref_frame_map_idx(cm, GOLDEN_FRAME); aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2); } for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX); - if (!cm->frame_refs_short_signaling) - aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame), + assert(get_ref_frame_map_idx(cm, ref_frame) != INVALID_IDX); + if (!current_frame->frame_refs_short_signaling) + aom_wb_write_literal(wb, get_ref_frame_map_idx(cm, ref_frame), REF_FRAMES_LOG2); if (seq_params->frame_id_numbers_present_flag) { - int i = get_ref_frame_map_idx(cpi, ref_frame); + int i = get_ref_frame_map_idx(cm, ref_frame); int frame_id_len = seq_params->frame_id_length; int diff_len = seq_params->delta_frame_id_length; int 
delta_frame_id_minus_1 = @@ -3197,24 +2999,22 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, (1 << frame_id_len)) - 1; if (delta_frame_id_minus_1 < 0 || - delta_frame_id_minus_1 >= (1 << diff_len)) - cm->invalid_delta_frame_id_minus_1 = 1; + delta_frame_id_minus_1 >= (1 << diff_len)) { + aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, + "Invalid delta_frame_id_minus_1"); + } aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len); } } if (!cm->error_resilient_mode && frame_size_override_flag) { - write_frame_size_with_refs(cpi, wb); + write_frame_size_with_refs(cm, wb); } else { write_frame_size(cm, frame_size_override_flag, wb); } - if (cm->cur_frame_force_integer_mv) { - cm->allow_high_precision_mv = 0; - } else { + if (!cm->cur_frame_force_integer_mv) aom_wb_write_bit(wb, cm->allow_high_precision_mv); - } - fix_interp_filter(cm, cpi->td.counts); write_frame_interp_filter(cm->interp_filter, wb); aom_wb_write_bit(wb, cm->switchable_motion_mode); if (frame_might_allow_ref_frame_mvs(cm)) { @@ -3228,7 +3028,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, const int might_bwd_adapt = !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update); if (cm->large_scale_tile) - cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; + assert(cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED); if (might_bwd_adapt) { aom_wb_write_bit( @@ -3268,9 +3068,13 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, encode_restoration_mode(cm, wb); } - write_tx_mode(cm, &cm->tx_mode, wb); + // Write TX mode + if (cm->coded_lossless) + assert(cm->tx_mode == ONLY_4X4); + else + aom_wb_write_bit(wb, cm->tx_mode == TX_MODE_SELECT); - if (cpi->allow_comp_inter_inter) { + if (!frame_is_intra_only(cm)) { const int use_hybrid_pred = current_frame->reference_mode == REFERENCE_MODE_SELECT; @@ -3290,19 +3094,9 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi, if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb); if 
(seq_params->film_grain_params_present && - (cm->show_frame || cm->showable_frame)) { - int flip_back_update_parameters_flag = 0; - if (current_frame->frame_type != INTER_FRAME && - cm->film_grain_params.update_parameters == 0) { - cm->film_grain_params.update_parameters = 1; - flip_back_update_parameters_flag = 1; - } + (cm->show_frame || cm->showable_frame)) write_film_grain_params(cpi, wb); - if (flip_back_update_parameters_flag) - cm->film_grain_params.update_parameters = 0; - } - if (cm->large_scale_tile) write_ext_tile_info(cm, saved_wb, wb); } @@ -3440,8 +3234,12 @@ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst, return wpos; } -uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, - uint8_t *const dst) { +uint32_t av1_write_obu_header(AV1_COMP *const cpi, OBU_TYPE obu_type, + int obu_extension, uint8_t *const dst) { + if (cpi->keep_level_stats && + (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER)) + ++cpi->frame_header_count; + struct aom_write_bit_buffer wb = { dst, 0 }; uint32_t size = 0; @@ -3493,9 +3291,8 @@ static void add_trailing_bits(struct aom_write_bit_buffer *wb) { } } -static void write_bitstream_level(BitstreamLevel bl, +static void write_bitstream_level(AV1_LEVEL seq_level_idx, struct aom_write_bit_buffer *wb) { - uint8_t seq_level_idx = major_minor_to_seq_level_idx(bl); assert(is_valid_seq_level_idx(seq_level_idx)); aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS); } @@ -3518,7 +3315,7 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { assert(cm->timing_info_present == 0); assert(cm->seq_params.decoder_model_info_present_flag == 0); assert(cm->seq_params.display_model_info_present_flag == 0); - write_bitstream_level(cm->seq_params.level[0], &wb); + write_bitstream_level(cm->seq_params.seq_level_idx[0], &wb); } else { aom_wb_write_bit(&wb, cm->timing_info_present); // timing info present flag @@ -3537,8 +3334,8 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) 
{ for (i = 0; i < cm->seq_params.operating_points_cnt_minus_1 + 1; i++) { aom_wb_write_literal(&wb, cm->seq_params.operating_point_idc[i], OP_POINTS_IDC_BITS); - write_bitstream_level(cm->seq_params.level[i], &wb); - if (cm->seq_params.level[i].major > 3) + write_bitstream_level(cm->seq_params.seq_level_idx[i], &wb); + if (cm->seq_params.seq_level_idx[i] >= SEQ_LEVEL_4_0) aom_wb_write_bit(&wb, cm->seq_params.tier[i]); if (cm->seq_params.decoder_model_info_present_flag) { aom_wb_write_bit(&wb, @@ -3557,7 +3354,7 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) { } } } - write_sequence_header(cpi, &wb); + write_sequence_header(&cm->seq_params, &wb); write_color_config(&cm->seq_params, &wb); @@ -3607,11 +3404,13 @@ typedef struct { static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, struct aom_write_bit_buffer *saved_wb, uint8_t obu_extension_header, - const FrameHeaderInfo *fh_info) { + const FrameHeaderInfo *fh_info, + int *const largest_tile_id) { AV1_COMMON *const cm = &cpi->common; aom_writer mode_bc; int tile_row, tile_col; - TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers; + // Store the location and size of each tile's data in the bitstream: + TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; uint32_t total_size = 0; const int tile_cols = cm->tile_cols; const int tile_rows = cm->tile_rows; @@ -3632,13 +3431,13 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, const int have_tiles = tile_cols * tile_rows > 1; int first_tg = 1; - cm->largest_tile_id = 0; + *largest_tile_id = 0; if (cm->large_scale_tile) { // For large_scale_tile case, we always have only one tile group, so it can // be written as an OBU_FRAME. 
const OBU_TYPE obu_type = OBU_FRAME; - const uint32_t tg_hdr_size = write_obu_header(obu_type, 0, data); + const uint32_t tg_hdr_size = av1_write_obu_header(cpi, obu_type, 0, data); data += tg_hdr_size; const uint32_t frame_header_size = @@ -3685,8 +3484,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, // Is CONFIG_EXT_TILE = 1, every tile in the row has a header, // even for the last one, unless no tiling is used at all. total_size += data_offset; - // Initialise tile context from the frame context - this_tile->tctx = *cm->fc; cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; mode_bc.allow_update_cdf = !cm->large_scale_tile; mode_bc.allow_update_cdf = @@ -3700,7 +3497,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, // Record the maximum tile size we see, so we can compact headers later. if (tile_size > max_tile_size) { max_tile_size = tile_size; - cm->largest_tile_id = tile_cols * tile_row + tile_col; + *largest_tile_id = tile_cols * tile_row + tile_col; } if (have_tiles) { @@ -3718,6 +3515,9 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, const int identical_tile_offset = find_identical_tile(tile_row, tile_col, tile_buffers); + // Indicate a copy-tile by setting the most significant bit. + // The row-offset to copy from is stored in the highest byte. + // remux_tiles will move these around later if (identical_tile_offset > 0) { tile_size = 0; tile_header = identical_tile_offset | 0x80; @@ -3792,7 +3592,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, const OBU_TYPE obu_type = (num_tg_hdrs == 1) ? 
OBU_FRAME : OBU_TILE_GROUP; curr_tg_data_size = - write_obu_header(obu_type, obu_extension_header, data); + av1_write_obu_header(cpi, obu_type, obu_extension_header, data); obu_header_size = curr_tg_data_size; if (num_tg_hdrs == 1) { @@ -3823,8 +3623,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, // The last tile of the tile group does not have a header. if (!is_last_tile_in_tg) total_size += 4; - // Initialise tile context from the frame context - this_tile->tctx = *cm->fc; cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; mode_bc.allow_update_cdf = 1; mode_bc.allow_update_cdf = @@ -3841,7 +3639,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 0 : 4)); buf->size = tile_size; if (tile_size > max_tile_size) { - cm->largest_tile_id = tile_cols * tile_row + tile_col; + *largest_tile_id = tile_cols * tile_row + tile_col; max_tile_size = tile_size; } @@ -3876,12 +3674,13 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, // Force context update tile to be the first tile in error // resiliant mode as the duplicate frame headers will have // context_update_tile_id set to 0 - cm->largest_tile_id = 0; + *largest_tile_id = 0; // Rewrite the OBU header to change the OBU type to Redundant Frame // Header. - write_obu_header(OBU_REDUNDANT_FRAME_HEADER, obu_extension_header, - &data[fh_info->obu_header_byte_offset]); + av1_write_obu_header(cpi, OBU_REDUNDANT_FRAME_HEADER, + obu_extension_header, + &data[fh_info->obu_header_byte_offset]); data += fh_info->total_length; @@ -3899,7 +3698,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, // Fill in context_update_tile_id indicating the tile to use for the // cdf update. 
The encoder currently sets it to the largest tile // (but is up to the encoder) - aom_wb_overwrite_literal(saved_wb, cm->largest_tile_id, + aom_wb_overwrite_literal(saved_wb, *largest_tile_id, cm->log2_tile_cols + cm->log2_tile_rows); // If more than one tile group. tile_size_bytes takes the default value 4 // and does not need to be set. For a single tile group it is set in the @@ -3945,7 +3744,8 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst, return total_size; } -int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { +int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, + int *const largest_tile_id) { uint8_t *data = dst; uint32_t data_size; AV1_COMMON *const cm = &cpi->common; @@ -3959,11 +3759,13 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { bitstream_queue_reset_write(); #endif + cpi->frame_header_count = 0; + // The TD is now written outside the frame encode loop // write sequence header obu if KEY_FRAME, preceded by 4-byte size if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) { - obu_header_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data); + obu_header_size = av1_write_obu_header(cpi, OBU_SEQUENCE_HEADER, 0, data); obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size); const size_t length_field_size = @@ -3983,7 +3785,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { // Write Frame Header OBU. 
fh_info.frame_header = data; obu_header_size = - write_obu_header(OBU_FRAME_HEADER, obu_extension_header, data); + av1_write_obu_header(cpi, OBU_FRAME_HEADER, obu_extension_header, data); obu_payload_size = write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1); @@ -4009,8 +3811,8 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) { } else { // Each tile group obu will be preceded by 4-byte size of the tile group // obu - data_size = write_tiles_in_tg_obus(cpi, data, &saved_wb, - obu_extension_header, &fh_info); + data_size = write_tiles_in_tg_obus( + cpi, data, &saved_wb, obu_extension_header, &fh_info, largest_tile_id); } data += data_size; *size = data - dst; diff --git a/libaom/av1/encoder/bitstream.h b/libaom/av1/encoder/bitstream.h index 465ccae..b05d0d5 100644 --- a/libaom/av1/encoder/bitstream.h +++ b/libaom/av1/encoder/bitstream.h @@ -27,18 +27,14 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst); // Writes the OBU header byte, and the OBU header extension byte when // 'obu_extension' is non-zero. Returns number of bytes written to 'dst'. 
-uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension, - uint8_t *const dst); +uint32_t av1_write_obu_header(AV1_COMP *const cpi, OBU_TYPE obu_type, + int obu_extension, uint8_t *const dst); int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size, uint8_t *dest); -int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size); - -static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) { - // Do not swap gf and arf indices for internal overlay frames - return cpi->rc.is_src_frame_alt_ref && !cpi->rc.is_src_frame_ext_arf; -} +int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size, + int *const largest_tile_id); void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd, int blk_row, int blk_col, int plane, TX_SIZE tx_size, diff --git a/libaom/av1/encoder/block.h b/libaom/av1/encoder/block.h index 1b04519..96b0991 100644 --- a/libaom/av1/encoder/block.h +++ b/libaom/av1/encoder/block.h @@ -54,10 +54,10 @@ typedef struct macroblock_plane { typedef struct { int txb_skip_cost[TXB_SKIP_CONTEXTS][2]; int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3]; - int base_cost[SIG_COEF_CONTEXTS][4]; + int base_cost[SIG_COEF_CONTEXTS][8]; int eob_extra_cost[EOB_COEF_CONTEXTS][2]; int dc_sign_cost[DC_SIGN_CONTEXTS][2]; - int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1]; + int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1]; } LV_MAP_COEFF_COST; typedef struct { @@ -74,16 +74,13 @@ typedef struct { } CB_COEFF_BUFFER; typedef struct { - int16_t mode_context[MODE_CTX_REF_FRAMES]; // TODO(angiebird): Reduce the buffer size according to sb_type - tran_low_t *tcoeff[MAX_MB_PLANE]; - uint16_t *eobs[MAX_MB_PLANE]; - uint8_t *txb_skip_ctx[MAX_MB_PLANE]; - int *dc_sign_ctx[MAX_MB_PLANE]; - uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; + CB_COEFF_BUFFER *cb_coef_buff; CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE]; int_mv global_mvs[REF_FRAMES]; - int16_t 
compound_mode_context[MODE_CTX_REF_FRAMES]; + int cb_offset; + int16_t mode_context[MODE_CTX_REF_FRAMES]; + uint8_t ref_mv_count[MODE_CTX_REF_FRAMES]; } MB_MODE_INFO_EXT; typedef struct { @@ -156,7 +153,7 @@ typedef struct { // Region size for mode decision sampling in the first pass of partition // search(two_pass_partition_search speed feature), in units of mi size(4). -// Used by the mode_pruning_based_on_two_pass_partition_search speed feature. +// Used by the mode pruning in two_pass_partition_search feature. #define FIRST_PARTITION_PASS_SAMPLE_REGION 8 #define FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2 3 #define FIRST_PARTITION_PASS_STATS_TABLES \ @@ -177,6 +174,8 @@ typedef struct { uint8_t ref0_counts[REF_FRAMES]; // Counters for ref_frame[0]. uint8_t ref1_counts[REF_FRAMES]; // Counters for ref_frame[1]. int sample_counts; // Number of samples collected. + uint8_t interintra_motion_mode_count[REF_FRAMES]; // Counter for interintra + // motion mode } FIRST_PARTITION_PASS_STATS; #define MAX_INTERP_FILTER_STATS 64 @@ -185,11 +184,26 @@ typedef struct { int_mv mv[2]; int8_t ref_frames[2]; COMPOUND_TYPE comp_type; + int64_t rd; + int skip_txfm_sb; + int64_t skip_sse_sb; + unsigned int pred_sse; } INTERPOLATION_FILTER_STATS; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS +#define MAX_COMP_RD_STATS 64 +typedef struct { + int32_t rate[COMPOUND_TYPES]; + int64_t dist[COMPOUND_TYPES]; + int64_t comp_model_rd[COMPOUND_TYPES]; + int_mv mv[2]; + MV_REFERENCE_FRAME ref_frames[2]; + PREDICTION_MODE mode; + InterpFilters filter; + int ref_mv_idx; + int is_global[2]; +} COMP_RD_STATS; + struct inter_modes_info; -#endif typedef struct macroblock MACROBLOCK; struct macroblock { struct macroblock_plane plane[MAX_MB_PLANE]; @@ -251,6 +265,9 @@ struct macroblock { int *ex_search_count_ptr; unsigned int txb_split_count; +#if CONFIG_SPEED_STATS + unsigned int tx_search_count; +#endif // CONFIG_SPEED_STATS // These are set to their default values at the beginning, and then adjusted // 
further in the encoding process. @@ -259,6 +276,7 @@ struct macroblock { unsigned int max_mv_context[REF_FRAMES]; unsigned int source_variance; + unsigned int simple_motion_pred_sse; unsigned int pred_sse[REF_FRAMES]; int pred_mv_sad[REF_FRAMES]; @@ -277,7 +295,7 @@ struct macroblock { CONV_BUF_TYPE *tmp_conv_dst; uint8_t *tmp_obmc_bufs[2]; - FRAME_CONTEXT *backup_tile_ctx; + FRAME_CONTEXT *row_ctx; // This context will be used to update color_map_cdf pointer which would be // used during pack bitstream. For single thread and tile-multithreading case // this ponter will be same as xd->tile_ctx, but for the case of row-mt: @@ -285,9 +303,7 @@ struct macroblock { // to the accurate tile context. FRAME_CONTEXT *tile_pb_ctx; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS struct inter_modes_info *inter_modes_info; -#endif // buffer for hash value calculation of a block // used only in av1_get_block_hash_value() @@ -340,7 +356,7 @@ struct macroblock { // BWDREF_FRAME) in bidir-comp mode. int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2]; int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES]; - int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1]; + int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES]; int wedge_idx_cost[BLOCK_SIZES_ALL][16]; int interintra_cost[BLOCK_SIZE_GROUPS][2]; int wedge_interintra_cost[BLOCK_SIZES_ALL][2]; @@ -385,6 +401,11 @@ struct macroblock { // Store the fractional best motion vector during sub/Qpel-pixel motion search int_mv fractional_best_mv[3]; + // Ref frames that are selected by square partition blocks within a super- + // block, in MI resolution. They can be used to prune ref frames for + // rectangular blocks. + int picked_ref_frames_mask[32 * 32]; + // use default transform and skip transform type search for intra modes int use_default_intra_tx_type; // use default transform and skip transform type search for inter modes @@ -405,6 +426,13 @@ struct macroblock { // detection). 
For reference, 556 is the value returned for a solid // vertical black/white edge. uint16_t edge_strength; + // The strongest edge strength seen along the x/y axis. + uint16_t edge_strength_x; + uint16_t edge_strength_y; + + // [Saved stat index] + COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS]; + int comp_rd_stats_idx; }; static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) { diff --git a/libaom/av1/encoder/context_tree.h b/libaom/av1/encoder/context_tree.h index cde3f2b..205ac8a 100644 --- a/libaom/av1/encoder/context_tree.h +++ b/libaom/av1/encoder/context_tree.h @@ -23,7 +23,7 @@ struct AV1_COMP; struct AV1Common; struct ThreadData; -typedef enum { +enum { // Search all the partition types in this plane. SEARCH_FULL_PLANE = 0, // Only search none_partition coding block. @@ -32,12 +32,14 @@ typedef enum { SEARCH_SAME_PLANE = 2, // Skip search partition on this plane. Go split directly. SPLIT_PLANE = 3, -} CB_TREE_SEARCH; +} UENUM1BYTE(CB_TREE_SEARCH); // Structure to hold snapshot of coding context during the mode picking process typedef struct { MB_MODE_INFO mic; MB_MODE_INFO_EXT mbmi_ext; + int64_t dist; + int64_t rdcost; uint8_t *color_index_map[2]; uint8_t *blk_skip; @@ -56,51 +58,32 @@ typedef struct { int hybrid_pred_diff; int comp_pred_diff; int single_pred_diff; - // Skip certain ref frames during RD search of rectangular partitions. - int skip_ref_frame_mask; // TODO(jingning) Use RD_COST struct here instead. This involves a boarder // scope of refactoring. int rate; - int64_t dist; - int64_t rdcost; + int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has // been made. 
-#if CONFIG_ONE_PASS_SVM - // Features for one pass svm early term - int seg_feat; -#endif - // motion vector cache for adaptive motion search control in partition // search loop MV pred_mv[REF_FRAMES]; InterpFilter pred_interp_filter; PARTITION_TYPE partition; - - // Reference and prediction mode cache for ref/mode speedup - // TODO(zoeliu@gmail.com): The values of ref_selected and mode_selected will - // be explored for further encoder speedup, to differentiate this approach for - // setting skip_ref_frame_mask from others. For instance, it is possible that - // the underlying square block(s) share the same SIMPLE_TRANSLATION motion - // mode as well as the mode of GLOBALMV, more ref/mode combos could be - // skipped. - MV_REFERENCE_FRAME ref_selected[2]; - int mode_selected; } PICK_MODE_CONTEXT; typedef struct { + int64_t rdcost; + int64_t sub_block_rdcost[4]; int valid; int split; - int skip; - int64_t rdcost; int sub_block_split[4]; int sub_block_skip[4]; - int64_t sub_block_rdcost[4]; + int skip; } PC_TREE_STATS; typedef struct PC_TREE { - int index; PARTITION_TYPE partitioning; BLOCK_SIZE block_size; PICK_MODE_CONTEXT none; @@ -112,9 +95,11 @@ typedef struct PC_TREE { PICK_MODE_CONTEXT verticalb[3]; PICK_MODE_CONTEXT horizontal4[4]; PICK_MODE_CONTEXT vertical4[4]; - CB_TREE_SEARCH cb_search_range; struct PC_TREE *split[4]; PC_TREE_STATS pc_tree_stats; + CB_TREE_SEARCH cb_search_range; + int index; + MV mv_ref_fulls[REF_FRAMES]; } PC_TREE; void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td); diff --git a/libaom/av1/encoder/cost.h b/libaom/av1/encoder/cost.h index af5b098..be0241a 100644 --- a/libaom/av1/encoder/cost.h +++ b/libaom/av1/encoder/cost.h @@ -30,6 +30,10 @@ extern const uint16_t av1_prob_cost[128]; // Calculate the cost of a symbol with probability p15 / 2^15 static INLINE int av1_cost_symbol(aom_cdf_prob p15) { + // p15 can be out of range [1, CDF_PROB_TOP - 1]. 
Clamping it, so that the + // following cost calculation works correctly. Otherwise, if p15 = + // CDF_PROB_TOP, shift would be -1, and "p15 << shift" would be wrong. + p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1); assert(0 < p15 && p15 < CDF_PROB_TOP); const int shift = CDF_PROB_BITS - 1 - get_msb(p15); const int prob = get_prob(p15 << shift, CDF_PROB_TOP); diff --git a/libaom/av1/encoder/encode_strategy.c b/libaom/av1/encoder/encode_strategy.c new file mode 100644 index 0000000..e9d6ee7 --- /dev/null +++ b/libaom/av1/encoder/encode_strategy.c @@ -0,0 +1,1173 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom/aom_encoder.h" + +#include "aom_ports/system_state.h" + +#if CONFIG_MISMATCH_DEBUG +#include "aom_util/debug_util.h" +#endif // CONFIG_MISMATCH_DEBUG + +#include "av1/common/onyxc_int.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/pass2_strategy.h" +#include "av1/encoder/temporal_filter.h" +#include "av1/encoder/tpl_model.h" + +void av1_configure_buffer_updates(AV1_COMP *const cpi, + EncodeFrameParams *const frame_params, + const FRAME_UPDATE_TYPE type, + int force_refresh_all) { + // NOTE(weitinglin): Should we define another function to take care of + // cpi->rc.is_$Source_Type to make this function as it is in the comment? 
+ + cpi->rc.is_src_frame_alt_ref = 0; + cpi->rc.is_src_frame_internal_arf = 0; + + switch (type) { + case KF_UPDATE: + frame_params->refresh_last_frame = 1; + frame_params->refresh_golden_frame = 1; + frame_params->refresh_bwd_ref_frame = 1; + frame_params->refresh_alt2_ref_frame = 1; + frame_params->refresh_alt_ref_frame = 1; + break; + + case LF_UPDATE: + frame_params->refresh_last_frame = 1; + frame_params->refresh_golden_frame = 0; + frame_params->refresh_bwd_ref_frame = 0; + frame_params->refresh_alt2_ref_frame = 0; + frame_params->refresh_alt_ref_frame = 0; + break; + + case GF_UPDATE: + // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is + // needed. + frame_params->refresh_last_frame = 1; + frame_params->refresh_golden_frame = 1; + frame_params->refresh_bwd_ref_frame = 0; + frame_params->refresh_alt2_ref_frame = 0; + frame_params->refresh_alt_ref_frame = 0; + break; + + case OVERLAY_UPDATE: + frame_params->refresh_last_frame = 0; + frame_params->refresh_golden_frame = 1; + frame_params->refresh_bwd_ref_frame = 0; + frame_params->refresh_alt2_ref_frame = 0; + frame_params->refresh_alt_ref_frame = 0; + + cpi->rc.is_src_frame_alt_ref = 1; + break; + + case ARF_UPDATE: + frame_params->refresh_last_frame = 0; + frame_params->refresh_golden_frame = 0; + // NOTE: BWDREF does not get updated along with ALTREF_FRAME. 
+ frame_params->refresh_bwd_ref_frame = 0; + frame_params->refresh_alt2_ref_frame = 0; + frame_params->refresh_alt_ref_frame = 1; + break; + + case INTNL_OVERLAY_UPDATE: + frame_params->refresh_last_frame = 1; + frame_params->refresh_golden_frame = 0; + frame_params->refresh_bwd_ref_frame = 0; + frame_params->refresh_alt2_ref_frame = 0; + frame_params->refresh_alt_ref_frame = 0; + + cpi->rc.is_src_frame_alt_ref = 1; + cpi->rc.is_src_frame_internal_arf = 1; + break; + + case INTNL_ARF_UPDATE: + frame_params->refresh_last_frame = 0; + frame_params->refresh_golden_frame = 0; + if (cpi->oxcf.pass == 2) { + frame_params->refresh_bwd_ref_frame = 1; + frame_params->refresh_alt2_ref_frame = 0; + } else { + frame_params->refresh_bwd_ref_frame = 0; + frame_params->refresh_alt2_ref_frame = 1; + } + frame_params->refresh_alt_ref_frame = 0; + break; + + default: assert(0); break; + } + + if (cpi->ext_refresh_frame_flags_pending && + (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2)) { + frame_params->refresh_last_frame = cpi->ext_refresh_last_frame; + frame_params->refresh_golden_frame = cpi->ext_refresh_golden_frame; + frame_params->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame; + frame_params->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame; + frame_params->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame; + } + + if (force_refresh_all) { + frame_params->refresh_last_frame = 1; + frame_params->refresh_golden_frame = 1; + frame_params->refresh_bwd_ref_frame = 1; + frame_params->refresh_alt2_ref_frame = 1; + frame_params->refresh_alt_ref_frame = 1; + } +} + +static void set_additional_frame_flags(const AV1_COMMON *const cm, + unsigned int *const frame_flags) { + if (frame_is_intra_only(cm)) *frame_flags |= FRAMEFLAGS_INTRAONLY; + if (frame_is_sframe(cm)) *frame_flags |= FRAMEFLAGS_SWITCH; + if (cm->error_resilient_mode) *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT; +} + +static INLINE void update_keyframe_counters(AV1_COMP *cpi) { + if (cpi->common.show_frame) 
{ + if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref || + cpi->common.current_frame.frame_type == KEY_FRAME) { + // If this is a show_existing_frame with a source other than altref, + // or if it is not a displayed forward keyframe, the keyframe update + // counters were incremented when it was originally encoded. + cpi->rc.frames_since_key++; + cpi->rc.frames_to_key--; + } + } +} + +static INLINE int is_frame_droppable(const AV1_COMP *const cpi) { + return !(cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame || + cpi->refresh_bwd_ref_frame || cpi->refresh_golden_frame || + cpi->refresh_last_frame); +} + +static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) { + // TODO(weitinglin): Updating this counter for is_frame_droppable + // is a work-around to handle the condition when a frame is drop. + // We should fix the cpi->common.show_frame flag + // instead of checking the other condition to update the counter properly. + if (cpi->common.show_frame || is_frame_droppable(cpi)) { + // Decrement count down till next gf + if (cpi->rc.frames_till_gf_update_due > 0) + cpi->rc.frames_till_gf_update_due--; + } +} + +static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) { + // Increment the gf group index ready for the next frame. If this is + // a show_existing_frame with a source other than altref, or if it is not + // a displayed forward keyframe, the index was incremented when it was + // originally encoded. 
+ if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref || + cpi->common.current_frame.frame_type == KEY_FRAME) { + ++cpi->twopass.gf_group.index; + } +} + +static void update_rc_counts(AV1_COMP *cpi) { + update_keyframe_counters(cpi); + update_frames_till_gf_update(cpi); + if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi); +} + +static void check_show_existing_frame(AV1_COMP *const cpi, + EncodeFrameParams *const frame_params) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + AV1_COMMON *const cm = &cpi->common; + const FRAME_UPDATE_TYPE frame_update_type = + gf_group->update_type[gf_group->index]; + const int which_arf = (gf_group->arf_update_idx[gf_group->index] > 0); + + if (cm->show_existing_frame == 1) { + frame_params->show_existing_frame = 0; + } else if (cpi->is_arf_filter_off[which_arf] && + (frame_update_type == OVERLAY_UPDATE || + frame_update_type == INTNL_OVERLAY_UPDATE)) { + // Other parameters related to OVERLAY_UPDATE will be taken care of + // in av1_get_second_pass_params(cpi) + frame_params->show_existing_frame = 1; + frame_params->existing_fb_idx_to_show = + (frame_update_type == OVERLAY_UPDATE) + ? 
get_ref_frame_map_idx(cm, ALTREF_FRAME) + : get_ref_frame_map_idx(cm, BWDREF_FRAME); + } +} + +static void set_ext_overrides(AV1_COMP *const cpi, + EncodeFrameParams *const frame_params) { + // Overrides the defaults with the externally supplied values with + // av1_update_reference() and av1_update_entropy() calls + // Note: The overrides are valid only for the next frame passed + // to av1_encode_lowlevel() + + AV1_COMMON *const cm = &cpi->common; + + if (cpi->ext_use_s_frame) { + frame_params->frame_type = S_FRAME; + } + + if (cpi->ext_refresh_frame_context_pending) { + cm->refresh_frame_context = cpi->ext_refresh_frame_context; + cpi->ext_refresh_frame_context_pending = 0; + } + cm->allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs; + + frame_params->error_resilient_mode = cpi->ext_use_error_resilient; + // A keyframe is already error resilient and keyframes with + // error_resilient_mode interferes with the use of show_existing_frame + // when forward reference keyframes are enabled. 
+ frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME; + // For bitstream conformance, s-frames must be error-resilient + frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME; +} + +static int get_ref_frame_flags(const AV1_COMP *const cpi) { + const AV1_COMMON *const cm = &cpi->common; + + const RefCntBuffer *last_buf = get_ref_frame_buf(cm, LAST_FRAME); + const RefCntBuffer *last2_buf = get_ref_frame_buf(cm, LAST2_FRAME); + const RefCntBuffer *last3_buf = get_ref_frame_buf(cm, LAST3_FRAME); + const RefCntBuffer *golden_buf = get_ref_frame_buf(cm, GOLDEN_FRAME); + const RefCntBuffer *bwd_buf = get_ref_frame_buf(cm, BWDREF_FRAME); + const RefCntBuffer *alt2_buf = get_ref_frame_buf(cm, ALTREF2_FRAME); + const RefCntBuffer *alt_buf = get_ref_frame_buf(cm, ALTREF_FRAME); + + // No.1 Priority: LAST_FRAME + const int last2_is_last = (last2_buf == last_buf); + const int last3_is_last = (last3_buf == last_buf); + const int gld_is_last = (golden_buf == last_buf); + const int bwd_is_last = (bwd_buf == last_buf); + const int alt2_is_last = (alt2_buf == last_buf); + const int alt_is_last = (alt_buf == last_buf); + + // No.2 Priority: ALTREF_FRAME + const int last2_is_alt = (last2_buf == alt_buf); + const int last3_is_alt = (last3_buf == alt_buf); + const int gld_is_alt = (golden_buf == alt_buf); + const int bwd_is_alt = (bwd_buf == alt_buf); + const int alt2_is_alt = (alt2_buf == alt_buf); + + // No.3 Priority: LAST2_FRAME + const int last3_is_last2 = (last3_buf == last2_buf); + const int gld_is_last2 = (golden_buf == last2_buf); + const int bwd_is_last2 = (bwd_buf == last2_buf); + const int alt2_is_last2 = (alt2_buf == last2_buf); + + // No.4 Priority: LAST3_FRAME + const int gld_is_last3 = (golden_buf == last3_buf); + const int bwd_is_last3 = (bwd_buf == last3_buf); + const int alt2_is_last3 = (alt2_buf == last3_buf); + + // No.5 Priority: GOLDEN_FRAME + const int bwd_is_gld = (bwd_buf == golden_buf); + const int alt2_is_gld = 
(alt2_buf == golden_buf); + + // No.6 Priority: BWDREF_FRAME + const int alt2_is_bwd = (alt2_buf == bwd_buf); + + // No.7 Priority: ALTREF2_FRAME + + // cpi->ext_ref_frame_flags allows certain reference types to be disabled + // by the external interface. These are set by av1_apply_encoding_flags(). + // Start with what the external interface allows, then suppress any reference + // types which we have found to be duplicates. + + int flags = cpi->ext_ref_frame_flags; + + if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG; + + if (alt_is_last) flags &= ~AOM_ALT_FLAG; + + if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG; + + if (last3_is_last || last3_is_alt || last3_is_last2) flags &= ~AOM_LAST3_FLAG; + + if (gld_is_last || gld_is_alt || gld_is_last2 || gld_is_last3) + flags &= ~AOM_GOLD_FLAG; + + if ((bwd_is_last || bwd_is_alt || bwd_is_last2 || bwd_is_last3 || bwd_is_gld)) + flags &= ~AOM_BWD_FLAG; + + if ((alt2_is_last || alt2_is_alt || alt2_is_last2 || alt2_is_last3 || + alt2_is_gld || alt2_is_bwd)) + flags &= ~AOM_ALT2_FLAG; + + return flags; +} + +static int get_current_frame_ref_type( + const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + // We choose the reference "type" of this frame from the flags which indicate + // which reference frames will be refreshed by it. More than one of these + // flags may be set, so the order here implies an order of precedence. 
+ // This is just used to choose the primary_ref_frame (as the most recent + // reference buffer of the same reference-type as the current frame) + + const int intra_only = frame_params->frame_type == KEY_FRAME || + frame_params->frame_type == INTRA_ONLY_FRAME; + if (intra_only || frame_params->error_resilient_mode || + cpi->ext_use_primary_ref_none) + return REGULAR_FRAME; + else if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) + return INTERNAL_ARF_FRAME; + else if (frame_params->refresh_alt_ref_frame) + return ARF_FRAME; + else if (cpi->rc.is_src_frame_alt_ref) + return OVERLAY_FRAME; + else if (frame_params->refresh_golden_frame) + return GLD_FRAME; + else if (frame_params->refresh_bwd_ref_frame) + return BRF_FRAME; + else + return REGULAR_FRAME; +} + +static int choose_primary_ref_frame( + const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) { + const AV1_COMMON *const cm = &cpi->common; + + const int intra_only = frame_params->frame_type == KEY_FRAME || + frame_params->frame_type == INTRA_ONLY_FRAME; + if (intra_only || frame_params->error_resilient_mode || + cpi->ext_use_primary_ref_none) { + return PRIMARY_REF_NONE; + } + + // Find the most recent reference frame with the same reference type as the + // current frame + const FRAME_CONTEXT_INDEX current_ref_type = + get_current_frame_ref_type(cpi, frame_params); + int wanted_fb = cpi->fb_of_context_type[current_ref_type]; + + int primary_ref_frame = PRIMARY_REF_NONE; + for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) { + primary_ref_frame = ref_frame - LAST_FRAME; + } + } + return primary_ref_frame; +} + +static void update_fb_of_context_type( + const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params, + int *const fb_of_context_type) { + const AV1_COMMON *const cm = &cpi->common; + + if (frame_is_intra_only(cm) || cm->error_resilient_mode || + cpi->ext_use_primary_ref_none) { + 
for (int i = 0; i < REF_FRAMES; i++) { + fb_of_context_type[i] = -1; + } + fb_of_context_type[REGULAR_FRAME] = + cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME) + : get_ref_frame_map_idx(cm, ALTREF_FRAME); + } + + if (!encode_show_existing_frame(cm)) { + // Refresh fb_of_context_type[]: see encoder.h for explanation + if (cm->current_frame.frame_type == KEY_FRAME) { + // All ref frames are refreshed, pick one that will live long enough + fb_of_context_type[REGULAR_FRAME] = 0; + } else { + // If more than one frame is refreshed, it doesn't matter which one we + // pick so pick the first. LST sometimes doesn't refresh any: this is ok + const int current_frame_ref_type = + get_current_frame_ref_type(cpi, frame_params); + for (int i = 0; i < REF_FRAMES; i++) { + if (cm->current_frame.refresh_frame_flags & (1 << i)) { + fb_of_context_type[current_frame_ref_type] = i; + break; + } + } + } + } +} + +static int get_order_offset(const GF_GROUP *const gf_group, + const EncodeFrameParams *const frame_params) { + // shown frame by definition has order offset 0 + // show_existing_frame ignores order_offset and simply takes the order_hint + // from the reference frame being shown. 
+ if (frame_params->show_frame || frame_params->show_existing_frame) return 0; + + const int arf_offset = + AOMMIN((MAX_GF_INTERVAL - 1), gf_group->arf_src_offset[gf_group->index]); + return AOMMIN((MAX_GF_INTERVAL - 1), arf_offset); +} + +static void adjust_frame_rate(AV1_COMP *cpi, + const struct lookahead_entry *source) { + int64_t this_duration; + int step = 0; + + // Clear down mmx registers + aom_clear_system_state(); + + if (source->ts_start == cpi->first_time_stamp_ever) { + this_duration = source->ts_end - source->ts_start; + step = 1; + } else { + int64_t last_duration = + cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen; + + this_duration = source->ts_end - cpi->last_end_time_stamp_seen; + + // do a step update if the duration changes by 10% + if (last_duration) + step = (int)((this_duration - last_duration) * 10 / last_duration); + } + + if (this_duration) { + if (step) { + av1_new_framerate(cpi, 10000000.0 / this_duration); + } else { + // Average this frame's rate into the last second's average + // frame rate. If we haven't seen 1 second yet, then average + // over the whole interval seen. + const double interval = AOMMIN( + (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0); + double avg_duration = 10000000.0 / cpi->framerate; + avg_duration *= (interval - avg_duration + this_duration); + avg_duration /= interval; + + av1_new_framerate(cpi, 10000000.0 / avg_duration); + } + } + cpi->last_time_stamp_seen = source->ts_start; + cpi->last_end_time_stamp_seen = source->ts_end; +} + +// If this is an alt-ref, returns the offset of the source frame used +// as the arf midpoint. Otherwise, returns 0. 
+static int get_arf_src_index(AV1_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + int arf_src_index = 0; + if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + assert(is_altref_enabled(cpi)); + arf_src_index = gf_group->arf_src_offset[gf_group->index]; + } + } else if (rc->source_alt_ref_pending) { + arf_src_index = rc->frames_till_gf_update_due; + } + return arf_src_index; +} + +// If this is an internal alt-ref, returns the offset of the source frame used +// as the internal arf midpoint. Otherwise, returns 0. +static int get_internal_arf_src_index(AV1_COMP *cpi) { + int internal_arf_src_index = 0; + if (cpi->oxcf.pass == 2) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) { + assert(is_altref_enabled(cpi) && cpi->internal_altref_allowed); + internal_arf_src_index = gf_group->arf_src_offset[gf_group->index]; + } + } + return internal_arf_src_index; +} + +// Called if this frame is an ARF or ARF2. Also handles forward-keyframes +// For an ARF set arf2=0, for ARF2 set arf2=1 +// temporal_filtered is set to 1 if we temporally filter the ARF frame, so that +// the correct post-filter buffer can be used. 
+static struct lookahead_entry *setup_arf_or_arf2( + AV1_COMP *const cpi, const int arf_src_index, const int arf2, + int *temporal_filtered, EncodeFrameParams *const frame_params) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + assert(arf_src_index <= rc->frames_to_key); + *temporal_filtered = 0; + + struct lookahead_entry *source = + av1_lookahead_peek(cpi->lookahead, arf_src_index); + + if (source != NULL) { + cm->showable_frame = 1; + cpi->alt_ref_source = source; + + // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf + if (!arf2 && arf_src_index == rc->frames_to_key) { + // Skip temporal filtering and mark as intra_only if we have a fwd_kf + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + int which_arf = gf_group->arf_update_idx[gf_group->index]; + cpi->is_arf_filter_off[which_arf] = 1; + cpi->no_show_kf = 1; + } else { + if (oxcf->arnr_max_frames > 0) { + // Produce the filtered ARF frame. + av1_temporal_filter(cpi, arf_src_index); + aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm)); + *temporal_filtered = 1; + } + } + frame_params->show_frame = 0; + } + rc->source_alt_ref_pending = 0; + return source; +} + +// Determine whether there is a forced keyframe pending in the lookahead buffer +static int is_forced_keyframe_pending(struct lookahead_ctx *lookahead, + const int up_to_index) { + for (int i = 0; i <= up_to_index; i++) { + const struct lookahead_entry *e = av1_lookahead_peek(lookahead, i); + if (e == NULL) { + // We have reached the end of the lookahead buffer and not early-returned + // so there isn't a forced key-frame pending. + return 0; + } else if (e->flags == AOM_EFLAG_FORCE_KF) { + return 1; + } else { + continue; + } + } + return 0; // Never reached +} + +// Check if we should encode an ARF or internal ARF. 
If not, try a LAST +// Do some setup associated with the chosen source +// temporal_filtered, flush, and frame_update_type are outputs. +// Return the frame source, or NULL if we couldn't find one +struct lookahead_entry *choose_frame_source( + AV1_COMP *const cpi, int *const temporal_filtered, int *const flush, + struct lookahead_entry **last_source, FRAME_UPDATE_TYPE *frame_update_type, + EncodeFrameParams *const frame_params) { + AV1_COMMON *const cm = &cpi->common; + struct lookahead_entry *source = NULL; + *temporal_filtered = 0; + + // Should we encode an alt-ref frame. + int arf_src_index = get_arf_src_index(cpi); + if (arf_src_index && + is_forced_keyframe_pending(cpi->lookahead, arf_src_index)) { + arf_src_index = 0; + *flush = 1; + } + + if (arf_src_index) { + source = setup_arf_or_arf2(cpi, arf_src_index, 0, temporal_filtered, + frame_params); + *frame_update_type = ARF_UPDATE; + } + + // Should we encode an internal Alt-ref frame (mutually exclusive to ARF) + arf_src_index = get_internal_arf_src_index(cpi); + if (arf_src_index && + is_forced_keyframe_pending(cpi->lookahead, arf_src_index)) { + arf_src_index = 0; + *flush = 1; + } + + if (arf_src_index) { + source = setup_arf_or_arf2(cpi, arf_src_index, 1, temporal_filtered, + frame_params); + *frame_update_type = INTNL_ARF_UPDATE; + } + + if (!source) { + // Get last frame source. + if (cm->current_frame.frame_number > 0) { + *last_source = av1_lookahead_peek(cpi->lookahead, -1); + } + // Read in the source frame. + source = av1_lookahead_pop(cpi->lookahead, *flush); + if (source == NULL) return NULL; + *frame_update_type = LF_UPDATE; // Default update type + frame_params->show_frame = 1; + + // Check to see if the frame should be encoded as an arf overlay. + if (cpi->alt_ref_source == source) { + *frame_update_type = OVERLAY_UPDATE; + cpi->alt_ref_source = NULL; + } + } + return source; +} + +// Don't allow a show_existing_frame to coincide with an error resilient or +// S-Frame. 
An exception can be made in the case of a keyframe, since it does +// not depend on any previous frames. +static int allow_show_existing(const AV1_COMP *const cpi, + unsigned int frame_flags) { + if (cpi->common.current_frame.frame_number == 0) return 0; + + const struct lookahead_entry *lookahead_src = + av1_lookahead_peek(cpi->lookahead, 0); + if (lookahead_src == NULL) return 1; + + const int is_error_resilient = + cpi->oxcf.error_resilient_mode || + (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT); + const int is_s_frame = + cpi->oxcf.s_frame_mode || (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME); + const int is_key_frame = + (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY); + return !(is_error_resilient || is_s_frame) || is_key_frame; +} + +// Update frame_flags to tell the encoder's caller what sort of frame was +// encoded. +static void update_frame_flags(AV1_COMP *cpi, unsigned int *frame_flags) { + if (encode_show_existing_frame(&cpi->common)) { + *frame_flags &= ~FRAMEFLAGS_GOLDEN; + *frame_flags &= ~FRAMEFLAGS_BWDREF; + *frame_flags &= ~FRAMEFLAGS_ALTREF; + *frame_flags &= ~FRAMEFLAGS_KEY; + return; + } + + if (cpi->refresh_golden_frame == 1) { + *frame_flags |= FRAMEFLAGS_GOLDEN; + } else { + *frame_flags &= ~FRAMEFLAGS_GOLDEN; + } + + if (cpi->refresh_alt_ref_frame == 1) { + *frame_flags |= FRAMEFLAGS_ALTREF; + } else { + *frame_flags &= ~FRAMEFLAGS_ALTREF; + } + + if (cpi->refresh_bwd_ref_frame == 1) { + *frame_flags |= FRAMEFLAGS_BWDREF; + } else { + *frame_flags &= ~FRAMEFLAGS_BWDREF; + } + + if (cpi->common.current_frame.frame_type == KEY_FRAME) { + *frame_flags |= FRAMEFLAGS_KEY; + } else { + *frame_flags &= ~FRAMEFLAGS_KEY; + } +} + +#define DUMP_REF_FRAME_IMAGES 0 + +#if DUMP_REF_FRAME_IMAGES == 1 +static int dump_one_image(AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *const ref_buf, + char *file_name) { + int h; + FILE *f_ref = NULL; + + if (ref_buf == NULL) { + printf("Frame data buffer is NULL.\n"); + return AOM_CODEC_MEM_ERROR; 
+ } + + if ((f_ref = fopen(file_name, "wb")) == NULL) { + printf("Unable to open file %s to write.\n", file_name); + return AOM_CODEC_MEM_ERROR; + } + + // --- Y --- + for (h = 0; h < cm->height; ++h) { + fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref); + } + // --- U --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + // --- V --- + for (h = 0; h < (cm->height >> 1); ++h) { + fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), + f_ref); + } + + fclose(f_ref); + + return AOM_CODEC_OK; +} + +static void dump_ref_frame_images(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + MV_REFERENCE_FRAME ref_frame; + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + char file_name[256] = ""; + snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv", + cm->current_frame.frame_number, ref_frame); + dump_one_image(cm, get_ref_frame_yv12_buf(cpi, ref_frame), file_name); + } +} +#endif // DUMP_REF_FRAME_IMAGES == 1 + +// Assign new_ref in the new mapping to point at the reference buffer pointed at +// by old_ref in the old_map. The new mapping is stored in *new_map, while the +// old map comes from cm->remapped_ref_idx[]. +static void assign_new_map(AV1_COMMON *const cm, int *new_map, int new_ref, + int old_ref) { + new_map[new_ref - LAST_FRAME] = cm->remapped_ref_idx[old_ref - LAST_FRAME]; +} + +// Generate a new reference frame mapping. This function updates +// cm->remapped_ref_idx[] depending on the frame_update_type of this frame. +// This determines which references (e.g. LAST_FRAME, ALTREF_FRAME) point at the +// 8 underlying buffers and, together with get_refresh_frame_flags(), implements +// our reference frame management strategy. 
+static void update_ref_frame_map(AV1_COMP *cpi, + FRAME_UPDATE_TYPE frame_update_type) { + AV1_COMMON *const cm = &cpi->common; + + // If check_frame_refs_short_signaling() decided to set + // frame_refs_short_signaling=1 then we update remapped_ref_idx[] here. Every + // reference will still map to the same RefCntBuffer (through ref_frame_map[]) + // after this, but that does not necessarily mean that remapped_ref_idx[] is + // unchanged. + if (cm->current_frame.frame_refs_short_signaling) { + const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME); + const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME); + av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_map_idx, gld_map_idx); + } + + // For shown keyframes and S-frames all buffers are refreshed, but we don't + // change any of the mapping. + if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) || + frame_is_sframe(cm)) { + return; + } + + // Initialize the new reference map as a copy of the old one. + int new_map[REF_FRAMES]; + memcpy(new_map, cm->remapped_ref_idx, sizeof(new_map)); + + // The reference management strategy is currently as follows. See + // gop_structure.c for more details of the structure and DOI + // 10.1109/DCC.2018.00045 for a higher-level explanation + // + // * ALTREF_FRAME and GOLDEN_FRAME are kept separate from the other + // references. When we code an ALTREF it refreshes the ALTREF buffer. When + // we code an OVERLAY the old GOLDEN becomes the new ALTREF and the old + // ALTREF (possibly refreshed by the OVERLAY) becomes the new GOLDEN. + // * LAST_FRAME, LAST2_FRAME, and LAST3_FRAME work like a FIFO. When we code + // a frame which does a last-frame update we pick a buffer to refresh and + // then point the LAST_FRAME reference at it. The old LAST_FRAME becomes + // LAST2_FRAME and the old LAST2_FRAME becomes LAST3_FRAME. The old + // LAST3_FRAME is re-used somewhere else. 
+ // * BWDREF, ALTREF2, and EXTREF act like a stack structure, so we can + // "push" and "pop" internal alt-ref frames through the three references. + // * When we code a BRF or internal-ARF (they work the same in this + // structure) we push it onto the bwdref stack. Because we have a finite + // number of buffers, we actually refresh EXTREF, the bottom of the stack, + // and rotate the three references to make EXTREF the top. + // * When we code an INTNL_OVERLAY we refresh BWDREF, then pop it off of the + // bwdref stack and push it into the last-frame FIFO. The old LAST3 + // buffer gets pushed out of the last-frame FIFO and becomes the new + // EXTREF, bottom of the bwdref stack. + // * LAST_BIPRED just acts like a LAST_FRAME. The BWDREF will have an + // INTNL_OVERLAY and so can do its own ref map update. + // + // Note that this function runs *after* a frame has been coded, so it does not + // affect reference assignment of the current frame, it only affects future + // frames. This is why we refresh buffers using the old reference map before + // remapping them. + // + // show_existing_frames don't refresh any buffers or send the reference map to + // the decoder, but we can still update our reference map if we want to: the + // decoder will update its map next time we code a non-show-existing frame. + + if (frame_update_type == OVERLAY_UPDATE) { + // We want the old golden-frame to become our new ARF so swap the + // references. If cpi->preserve_arf_as_gld == 0 then we will refresh the + // old ARF before it becomes our new GF + assign_new_map(cm, new_map, ALTREF_FRAME, GOLDEN_FRAME); + assign_new_map(cm, new_map, GOLDEN_FRAME, ALTREF_FRAME); + } else if (frame_update_type == INTNL_OVERLAY_UPDATE && + encode_show_existing_frame(cm)) { + // Note that because encode_show_existing_frame(cm) we don't refresh any + // buffers. + // Pop BWDREF (shown as current frame) from the bwdref stack and make it + // the new LAST_FRAME. 
+ assign_new_map(cm, new_map, LAST_FRAME, BWDREF_FRAME); + + // Progress the last-frame FIFO and the bwdref stack + assign_new_map(cm, new_map, LAST2_FRAME, LAST_FRAME); + assign_new_map(cm, new_map, LAST3_FRAME, LAST2_FRAME); + assign_new_map(cm, new_map, BWDREF_FRAME, ALTREF2_FRAME); + assign_new_map(cm, new_map, ALTREF2_FRAME, EXTREF_FRAME); + assign_new_map(cm, new_map, EXTREF_FRAME, LAST3_FRAME); + } else if (frame_update_type == INTNL_ARF_UPDATE && + !cm->show_existing_frame) { + // We want to push the current frame onto the bwdref stack. We refresh + // EXTREF (the old bottom of the stack) and rotate the references so it + // becomes BWDREF, the top of the stack. + assign_new_map(cm, new_map, BWDREF_FRAME, EXTREF_FRAME); + assign_new_map(cm, new_map, ALTREF2_FRAME, BWDREF_FRAME); + assign_new_map(cm, new_map, EXTREF_FRAME, ALTREF2_FRAME); + } + + if ((frame_update_type == LF_UPDATE || frame_update_type == GF_UPDATE || + frame_update_type == INTNL_OVERLAY_UPDATE) && + !encode_show_existing_frame(cm) && + (!cm->show_existing_frame || frame_update_type == INTNL_OVERLAY_UPDATE)) { + // A standard last-frame: we refresh the LAST3_FRAME buffer and then push it + // into the last-frame FIFO. + assign_new_map(cm, new_map, LAST3_FRAME, LAST2_FRAME); + assign_new_map(cm, new_map, LAST2_FRAME, LAST_FRAME); + assign_new_map(cm, new_map, LAST_FRAME, LAST3_FRAME); + } + + memcpy(cm->remapped_ref_idx, new_map, sizeof(new_map)); + +#if DUMP_REF_FRAME_IMAGES == 1 + // Dump out all reference frame images. 
+ dump_ref_frame_images(cpi); +#endif // DUMP_REF_FRAME_IMAGES +} + +static int get_refresh_frame_flags(const AV1_COMP *const cpi, + const EncodeFrameParams *const frame_params, + FRAME_UPDATE_TYPE frame_update_type) { + const AV1_COMMON *const cm = &cpi->common; + + // Switch frames and shown key-frames overwrite all reference slots + if ((frame_params->frame_type == KEY_FRAME && frame_params->show_frame) || + frame_params->frame_type == S_FRAME) + return 0xFF; + + // show_existing_frames don't actually send refresh_frame_flags so set the + // flags to 0 to keep things consistent. + if (frame_params->show_existing_frame && + (!frame_params->error_resilient_mode || + frame_params->frame_type == KEY_FRAME)) { + return 0; + } + + int refresh_mask = 0; + + if (cpi->ext_refresh_frame_flags_pending) { + // Unfortunately the encoder interface reflects the old refresh_*_frame + // flags so we have to replicate the old refresh_frame_flags logic here in + // order to preserve the behaviour of the flag overrides. + refresh_mask |= cpi->ext_refresh_last_frame + << get_ref_frame_map_idx(cm, LAST3_FRAME); + refresh_mask |= cpi->ext_refresh_bwd_ref_frame + << get_ref_frame_map_idx(cm, EXTREF_FRAME); + refresh_mask |= cpi->ext_refresh_alt2_ref_frame + << get_ref_frame_map_idx(cm, ALTREF2_FRAME); + if (frame_update_type == OVERLAY_UPDATE) { + if (!cpi->preserve_arf_as_gld) { + refresh_mask |= cpi->ext_refresh_golden_frame + << get_ref_frame_map_idx(cm, ALTREF_FRAME); + } + } else { + refresh_mask |= cpi->ext_refresh_golden_frame + << get_ref_frame_map_idx(cm, GOLDEN_FRAME); + refresh_mask |= cpi->ext_refresh_alt_ref_frame + << get_ref_frame_map_idx(cm, ALTREF_FRAME); + } + return refresh_mask; + } + + // See update_ref_frame_map() for a thorough description of the reference + // buffer management strategy currently in use. This function just decides + // which buffers should be refreshed. 
+ + switch (frame_update_type) { + case KF_UPDATE: + // Note that a real shown key-frame or S-frame refreshes every buffer, + // handled in a special case above. This case is for frames which aren't + // really a shown key-frame or S-frame but want to refresh all the + // important buffers. + refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME); + refresh_mask |= 1 << get_ref_frame_map_idx(cm, EXTREF_FRAME); + refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF2_FRAME); + refresh_mask |= 1 << get_ref_frame_map_idx(cm, GOLDEN_FRAME); + refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME); + break; + case LF_UPDATE: + // Refresh LAST3, which becomes the new LAST while LAST becomes LAST2 + // and LAST2 becomes the new LAST3 (like a FIFO but circular) + refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME); + break; + case GF_UPDATE: + // In addition to refreshing the GF buffer, we refresh LAST3 and push it + // into the last-frame FIFO. + refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME); + refresh_mask |= 1 << get_ref_frame_map_idx(cm, GOLDEN_FRAME); + break; + case OVERLAY_UPDATE: + if (!cpi->preserve_arf_as_gld) { + // The result of our OVERLAY should become the GOLDEN_FRAME but we'd + // like to keep the old GOLDEN as our new ALTREF. So we refresh the + // ALTREF and swap around the ALTREF and GOLDEN references. + refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME); + } + break; + case ARF_UPDATE: + refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME); + break; + case INTNL_OVERLAY_UPDATE: + // INTNL_OVERLAY may be a show_existing_frame in which case we don't + // refresh anything and the BWDREF or ALTREF2 being shown becomes the new + // LAST_FRAME. But, if it's not a show_existing_frame, then we update as + // though it's a normal LF_UPDATE: we refresh LAST3 and + // update_ref_frame_map() makes that the new LAST_FRAME. 
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME); + break; + case INTNL_ARF_UPDATE: + if (cpi->oxcf.pass == 2) { + // Push the new ARF2 onto the bwdref stack. We refresh EXTREF which is + // at the bottom of the stack then move it to the top. + refresh_mask |= 1 << get_ref_frame_map_idx(cm, EXTREF_FRAME); + } else { + // ARF2 just gets stored in the ARF2 slot, no reference map change. + refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF2_FRAME); + } + break; + default: assert(0); break; + } + return refresh_mask; +} + +int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, + uint8_t *const dest, unsigned int *frame_flags, + int64_t *const time_stamp, int64_t *const time_end, + const aom_rational_t *const timebase, int flush) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + AV1_COMMON *const cm = &cpi->common; + + EncodeFrameInput frame_input; + EncodeFrameParams frame_params; + EncodeFrameResults frame_results; + memset(&frame_input, 0, sizeof(frame_input)); + memset(&frame_params, 0, sizeof(frame_params)); + memset(&frame_results, 0, sizeof(frame_results)); + + if (oxcf->pass == 0 || oxcf->pass == 2) { + check_show_existing_frame(cpi, &frame_params); + frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags); + } else { + frame_params.show_existing_frame = 0; + } + + int temporal_filtered = 0; + struct lookahead_entry *source = NULL; + struct lookahead_entry *last_source = NULL; + FRAME_UPDATE_TYPE frame_update_type; + if (frame_params.show_existing_frame) { + source = av1_lookahead_pop(cpi->lookahead, flush); + frame_update_type = LF_UPDATE; + } else { + source = choose_frame_source(cpi, &temporal_filtered, &flush, &last_source, + &frame_update_type, &frame_params); + } + + // In pass 2 we get the frame_update_type from gf_group + if (oxcf->pass == 2) { + frame_update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + } + + if (source == NULL) { // If no source was found, we can't encode a 
frame. + if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { + av1_end_first_pass(cpi); /* get last stats packet */ + cpi->twopass.first_pass_done = 1; + } + return -1; + } + + frame_input.source = temporal_filtered ? &cpi->alt_ref_buffer : &source->img; + frame_input.last_source = last_source != NULL ? &last_source->img : NULL; + frame_input.ts_duration = source->ts_end - source->ts_start; + + *time_stamp = source->ts_start; + *time_end = source->ts_end; + if (source->ts_start < cpi->first_time_stamp_ever) { + cpi->first_time_stamp_ever = source->ts_start; + cpi->last_end_time_stamp_seen = source->ts_start; + } + + av1_apply_encoding_flags(cpi, source->flags); + if (!frame_params.show_existing_frame) + *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; + + const int is_overlay = frame_params.show_existing_frame && + (frame_update_type == OVERLAY_UPDATE || + frame_update_type == INTNL_OVERLAY_UPDATE); + if (frame_params.show_frame || is_overlay) { + // Shown frames and arf-overlay frames need frame-rate considering + adjust_frame_rate(cpi, source); + } + + if (frame_params.show_existing_frame) { + // show_existing_frame implies this frame is shown! 
+ frame_params.show_frame = 1; + } else { + if (cpi->film_grain_table) { + cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup( + cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */, + &cm->film_grain_params); + } else { + cm->cur_frame->film_grain_params_present = + cm->seq_params.film_grain_params_present; + } + // only one operating point supported now + const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp); + if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR; + cpi->common.frame_presentation_time = (uint32_t)pts64; + } + + if (oxcf->pass == 2 && (!frame_params.show_existing_frame || is_overlay)) { + // GF_GROUP needs updating for arf overlays as well as non-show-existing + av1_get_second_pass_params(cpi, &frame_params, *frame_flags); + frame_update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + } + + if (frame_params.show_existing_frame && + frame_params.frame_type != KEY_FRAME) { + // Force show-existing frames to be INTER, except forward keyframes + frame_params.frame_type = INTER_FRAME; + } + + // TODO(david.turner@argondesign.com): Move all the encode strategy + // (largely near av1_get_compressed_data) in here + + // TODO(david.turner@argondesign.com): Change all the encode strategy to + // modify frame_params instead of cm or cpi. + + // Per-frame encode speed. In theory this can vary, but things may have been + // written assuming speed-level will not change within a sequence, so this + // parameter should be used with caution. 
+ frame_params.speed = oxcf->speed; + + if (!frame_params.show_existing_frame) { + cm->using_qmatrix = cpi->oxcf.using_qm; + cm->min_qmlevel = cpi->oxcf.qm_minlevel; + cm->max_qmlevel = cpi->oxcf.qm_maxlevel; + if (cpi->twopass.gf_group.index == 1 && cpi->oxcf.enable_tpl_model) { + av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, 0); + av1_set_frame_size(cpi, cm->width, cm->height); + av1_tpl_setup_stats(cpi, &frame_input); + } + } + + // Work out some encoding parameters specific to the pass: + if (oxcf->pass == 0) { + if (cpi->oxcf.rc_mode == AOM_CBR) { + av1_rc_get_one_pass_cbr_params(cpi, &frame_update_type, &frame_params, + *frame_flags); + } else { + av1_rc_get_one_pass_vbr_params(cpi, &frame_update_type, &frame_params, + *frame_flags); + } + } else if (oxcf->pass == 1) { + cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&cpi->oxcf); + const int kf_requested = (cm->current_frame.frame_number == 0 || + (*frame_flags & FRAMEFLAGS_KEY)); + if (kf_requested && frame_update_type != OVERLAY_UPDATE && + frame_update_type != INTNL_OVERLAY_UPDATE) { + frame_params.frame_type = KEY_FRAME; + } else { + frame_params.frame_type = INTER_FRAME; + } + } else if (oxcf->pass == 2) { +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_w(); +#endif +#if TXCOEFF_COST_TIMER + cm->txcoeff_cost_timer = 0; + cm->txcoeff_cost_count = 0; +#endif + } + + if (oxcf->pass == 0 || oxcf->pass == 2) set_ext_overrides(cpi, &frame_params); + + // Shown keyframes and S frames refresh all reference buffers + const int force_refresh_all = + ((frame_params.frame_type == KEY_FRAME && frame_params.show_frame) || + frame_params.frame_type == S_FRAME) && + !frame_params.show_existing_frame; + + av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, + force_refresh_all); + + if (oxcf->pass == 0 || oxcf->pass == 2) { + // Work out which reference frame slots may be used. 
+ frame_params.ref_frame_flags = get_ref_frame_flags(cpi); + + frame_params.primary_ref_frame = + choose_primary_ref_frame(cpi, &frame_params); + frame_params.order_offset = + get_order_offset(&cpi->twopass.gf_group, &frame_params); + + frame_params.refresh_frame_flags = + get_refresh_frame_flags(cpi, &frame_params, frame_update_type); + } + + // The way frame_params->remapped_ref_idx is setup is a placeholder. + // Currently, reference buffer assignment is done by update_ref_frame_map() + // which is called by high-level strategy AFTER encoding a frame. It modifies + // cm->remapped_ref_idx. If you want to use an alternative method to + // determine reference buffer assignment, just put your assignments into + // frame_params->remapped_ref_idx here and they will be used when encoding + // this frame. If frame_params->remapped_ref_idx is setup independently of + // cm->remapped_ref_idx then update_ref_frame_map() will have no effect. + memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx, + REF_FRAMES * sizeof(*cm->remapped_ref_idx)); + + if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + + if (oxcf->pass == 0 || oxcf->pass == 2) { + // First pass doesn't modify reference buffer assignment or produce frame + // flags + update_frame_flags(cpi, frame_flags); + update_ref_frame_map(cpi, frame_update_type); + } + + if (oxcf->pass == 2) { +#if TXCOEFF_COST_TIMER + cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer; + fprintf(stderr, + "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld " + "in us\n", + cm->txcoeff_cost_count, cm->txcoeff_cost_timer, + cm->cum_txcoeff_cost_timer); +#endif + av1_twopass_postencode_update(cpi); + } + + if (oxcf->pass == 0 || oxcf->pass == 2) { + update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type); + set_additional_frame_flags(cm, frame_flags); + update_rc_counts(cpi); + } + + // Unpack frame_results: + *size = 
frame_results.size; + + // Leave a signal for a higher level caller about if this frame is droppable + if (*size > 0) { + cpi->droppable = is_frame_droppable(cpi); + } + + return AOM_CODEC_OK; +} diff --git a/libaom/av1/encoder/encode_strategy.h b/libaom/av1/encoder/encode_strategy.h new file mode 100644 index 0000000..6830e44 --- /dev/null +++ b/libaom/av1/encoder/encode_strategy.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_ +#define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <stdint.h> + +#include "aom/aom_encoder.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/firstpass.h" + +// This function will implement high-level encode strategy, choosing frame type, +// frame placement, etc. It populates an EncodeFrameParams struct with the +// results of these decisions and then calls av1_encode() +int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size, + uint8_t *const dest, unsigned int *frame_flags, + int64_t *const time_stamp, int64_t *const time_end, + const aom_rational_t *const timebase, int flush); + +// Set individual buffer update flags based on frame reference type. +// force_refresh_all is used when we have a KEY_FRAME or S_FRAME. It forces all +// refresh_*_frame flags to be set, because we refresh all buffers in this case. 
+void av1_configure_buffer_updates(AV1_COMP *const cpi, + EncodeFrameParams *const frame_params, + const FRAME_UPDATE_TYPE type, + int force_refresh_all); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_ENCODE_STRATEGY_H_ diff --git a/libaom/av1/encoder/encodeframe.c b/libaom/av1/encoder/encodeframe.c index ebfc8c2..2952184 100644 --- a/libaom/av1/encoder/encodeframe.c +++ b/libaom/av1/encoder/encodeframe.c @@ -10,6 +10,7 @@ */ #include <limits.h> +#include <float.h> #include <math.h> #include <stdbool.h> #include <stdio.h> @@ -54,12 +55,14 @@ #include "av1/encoder/ethread.h" #include "av1/encoder/extend.h" #include "av1/encoder/ml.h" +#include "av1/encoder/partition_strategy.h" #include "av1/encoder/partition_model_weights.h" #include "av1/encoder/rd.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/reconinter_enc.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/tokenize.h" +#include "av1/encoder/var_based_part.h" static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run, @@ -74,7 +77,7 @@ static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, // purposes of activity masking. // Eventually this should be replaced by custom no-reference routines, // which will be faster. 
-static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { +const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, @@ -139,15 +142,6 @@ static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = { 128 * 16, 128 * 16 }; -#if CONFIG_FP_MB_STATS -static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES_ALL] = { - 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 1, 1, 1, 2, 2, 4 -}; -static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES_ALL] = { - 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 1, 1, 2, 1, 4, 2 -}; -#endif // CONFIG_FP_MB_STATS - unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs) { @@ -188,7 +182,8 @@ static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi, BLOCK_SIZE bs) { unsigned int sse, var; uint8_t *last_y; - const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME); + const YV12_BUFFER_CONFIG *last = + get_ref_frame_yv12_buf(&cpi->common, LAST_FRAME); assert(last != NULL); last_y = @@ -211,18 +206,6 @@ static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x, return BLOCK_8X8; } -// Lighter version of set_offsets that only sets the mode info -// pointers. 
-static void set_mode_info_offsets(const AV1_COMP *const cpi, - MACROBLOCK *const x, MACROBLOCKD *const xd, - int mi_row, int mi_col) { - const AV1_COMMON *const cm = &cpi->common; - const int idx_str = xd->mi_stride * mi_row + mi_col; - xd->mi = cm->mi_grid_visible + idx_str; - xd->mi[0] = cm->mi + idx_str; - x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); -} - static void set_offsets_without_segment_id(const AV1_COMP *const cpi, const TileInfo *const tile, MACROBLOCK *const x, int mi_row, @@ -267,25 +250,24 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi, // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs() xd->tile = *tile; + + xd->cfl.mi_row = mi_row; + xd->cfl.mi_col = mi_col; } static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, MACROBLOCK *const x, int mi_row, int mi_col, BLOCK_SIZE bsize) { const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *mbmi; - const struct segmentation *const seg = &cm->seg; set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize); + // Setup segment ID. mbmi = xd->mi[0]; - xd->cfl.mi_row = mi_row; - xd->cfl.mi_col = mi_col; - mbmi->segment_id = 0; - - // Setup segment ID. 
if (seg->enabled) { if (seg->enabled && !cpi->vaq_refresh) { const uint8_t *const map = @@ -297,15 +279,6 @@ static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile, } } -static void reset_intmv_filter_type(MB_MODE_INFO *mbmi) { - InterpFilter filters[2]; - - for (int dir = 0; dir < 2; ++dir) { - filters[dir] = av1_extract_interp_filter(mbmi->interp_filters, dir); - } - mbmi->interp_filters = av1_make_interp_filters(filters[0], filters[1]); -} - static void update_filter_type_count(uint8_t allow_update_cdf, FRAME_COUNTS *counts, const MACROBLOCKD *xd, @@ -380,8 +353,6 @@ static void update_state(const AV1_COMP *const cpi, *mi_addr = *mi; *x->mbmi_ext = ctx->mbmi_ext; - reset_intmv_filter_type(mi_addr); - memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); x->skip = ctx->skip; @@ -401,7 +372,6 @@ static void update_state(const AV1_COMP *const cpi, if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize, ctx->rate, ctx->dist, x->skip); - reset_tx_size(x, mi_addr, cm->tx_mode); } if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd)) mi_addr->uv_mode = UV_DC_PRED; @@ -512,24 +482,32 @@ static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) { cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q); } -static uint16_t edge_strength(const struct buf_2d *ref, const BLOCK_SIZE bsize, - const bool high_bd, const int bd) { +static EdgeInfo edge_info(const struct buf_2d *ref, const BLOCK_SIZE bsize, + const bool high_bd, const int bd) { const int width = block_size_wide[bsize]; const int height = block_size_high[bsize]; // Implementation requires width to be a multiple of 8. It also requires // height to be a multiple of 4, but this is always the case. 
assert(height % 4 == 0); if (width % 8 != 0) { - return 0; + EdgeInfo ei = { .magnitude = 0, .x = 0, .y = 0 }; + return ei; } return av1_edge_exists(ref->buf, ref->stride, width, height, high_bd, bd); } -static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, - MACROBLOCK *const x, int mi_row, int mi_col, - RD_STATS *rd_cost, PARTITION_TYPE partition, - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, - int64_t best_rd) { +static int use_pb_simple_motion_pred_sse(const AV1_COMP *const cpi) { + // TODO(debargha, yuec): Not in use, need to implement a speed feature + // utilizing this data point, and replace '0' by the corresponding speed + // feature flag. + return 0 && !frame_is_intra_only(&cpi->common); +} + +static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, + MACROBLOCK *const x, int mi_row, int mi_col, + RD_STATS *rd_cost, PARTITION_TYPE partition, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd, int use_nonrd_pick_mode) { AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); TileInfo *const tile_info = &tile_data->tile_info; @@ -542,6 +520,10 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode; int i, orig_rdmult; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif + if (best_rd < 0) { ctx->rdcost = INT64_MAX; ctx->skip = 0; @@ -602,21 +584,32 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, return; } - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { x->source_variance = av1_high_get_sby_perpixel_variance( cpi, &x->plane[0].src, bsize, xd->bd); } else { x->source_variance = av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); } + if (use_pb_simple_motion_pred_sse(cpi)) { + const MV ref_mv_full = { .row = 0, .col = 0 }; + unsigned int var = 0; + av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, 
ref_mv_full, 0, + &x->simple_motion_pred_sse, &var); + } + // If the threshold for disabling wedge search is zero, it means the feature // should not be used. Use a value that will always succeed in the check. if (cpi->sf.disable_wedge_search_edge_thresh == 0) { x->edge_strength = UINT16_MAX; + x->edge_strength_x = UINT16_MAX; + x->edge_strength_y = UINT16_MAX; } else { - x->edge_strength = - edge_strength(&x->plane[0].src, bsize, - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd); + EdgeInfo ei = + edge_info(&x->plane[0].src, bsize, is_cur_buf_hbd(xd), xd->bd); + x->edge_strength = ei.magnitude; + x->edge_strength_x = ei.x; + x->edge_strength_y = ei.y; } // Save rdmult before it might be changed, so it can be restored later. orig_rdmult = x->rdmult; @@ -644,22 +637,35 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB if (frame_is_intra_only(cm)) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_rd_pick_intra_mode_sb_time); +#endif av1_rd_pick_intra_mode_sb(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_rd_pick_intra_mode_sb_time); +#endif } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_rd_pick_inter_mode_sb_time); +#endif if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); -#if CONFIG_ONE_PASS_SVM - ctx->seg_feat = 1; -#endif } else { - av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, - bsize, ctx, best_rd); -#if CONFIG_ONE_PASS_SVM - ctx->seg_feat = 0; -#endif + // TODO(kyslov): do the same for pick_intra_mode and + // pick_inter_mode_sb_seg_skip + if (use_nonrd_pick_mode) { + av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, + bsize, ctx, best_rd); + } else { + 
av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, + bsize, ctx, best_rd); + } } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_rd_pick_inter_mode_sb_time); +#endif } // Examine the resulting rate and for AQ mode 2 make a segment choice. @@ -680,6 +686,10 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data, ctx->rate = rd_cost->rate; ctx->dist = rd_cost->dist; ctx->rdcost = rd_cost->rdcost; + +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif } static void update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts, @@ -1287,11 +1297,13 @@ static void update_stats(const AV1_COMMON *const cm, TileDataEnc *tile_data, assert(masked_compound_used); if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) { #if CONFIG_ENTROPY_STATS - ++counts->compound_type[bsize][mbmi->interinter_comp.type - 1]; + ++counts->compound_type[bsize][mbmi->interinter_comp.type - + COMPOUND_WEDGE]; #endif if (allow_update_cdf) { update_cdf(fc->compound_type_cdf[bsize], - mbmi->interinter_comp.type - 1, COMPOUND_TYPES - 1); + mbmi->interinter_comp.type - COMPOUND_WEDGE, + MASKED_COMPOUND_TYPES); } } } @@ -1474,10 +1486,8 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data, encode_superblock(cpi, tile_data, td, tp, dry_run, mi_row, mi_col, bsize, rate); - if (dry_run == 0) - x->cb_offset += block_size_wide[bsize] * block_size_high[bsize]; - if (!dry_run) { + x->cb_offset += block_size_wide[bsize] * block_size_high[bsize]; if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 && cpi->common.delta_q_info.delta_lf_present_flag) { const int frame_lf_count = av1_num_planes(&cpi->common) > 1 @@ -1624,25 +1634,6 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td, update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition); } -// Check to see if the given partition size is allowed for a specified number -// of mi block rows and columns remaining in 
the image. -// If not then return the largest allowed partition size -static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, - int cols_left, int *bh, int *bw) { - if (rows_left <= 0 || cols_left <= 0) { - return AOMMIN(bsize, BLOCK_8X8); - } else { - for (; bsize > 0; bsize -= 3) { - *bh = mi_size_high[bsize]; - *bw = mi_size_wide[bsize]; - if ((*bh <= rows_left) && (*bw <= cols_left)) { - break; - } - } - } - return bsize; -} - static void set_partial_sb_partition(const AV1_COMMON *const cm, MB_MODE_INFO *mi, int bh_in, int bw_in, int mi_rows_remaining, @@ -1766,8 +1757,8 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, if (partition != PARTITION_NONE && !splits_below && mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) { pc_tree->partitioning = PARTITION_NONE; - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, - PARTITION_NONE, bsize, ctx_none, INT64_MAX); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, + PARTITION_NONE, bsize, ctx_none, INT64_MAX, 0); if (none_rdc.rate < INT_MAX) { none_rdc.rate += x->partition_cost[pl][PARTITION_NONE]; @@ -1779,29 +1770,16 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, pc_tree->partitioning = partition; } } - for (int b = 0; b < 2; ++b) { - pc_tree->horizontal[b].skip_ref_frame_mask = 0; - pc_tree->vertical[b].skip_ref_frame_mask = 0; - } - for (int b = 0; b < 3; ++b) { - pc_tree->horizontala[b].skip_ref_frame_mask = 0; - pc_tree->horizontalb[b].skip_ref_frame_mask = 0; - pc_tree->verticala[b].skip_ref_frame_mask = 0; - pc_tree->verticalb[b].skip_ref_frame_mask = 0; - } - for (int b = 0; b < 4; ++b) { - pc_tree->horizontal4[b].skip_ref_frame_mask = 0; - pc_tree->vertical4[b].skip_ref_frame_mask = 0; - } + switch (partition) { case PARTITION_NONE: - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - PARTITION_NONE, bsize, ctx_none, INT64_MAX); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_NONE, 
bsize, ctx_none, INT64_MAX, 0); break; case PARTITION_HORZ: - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - PARTITION_HORZ, subsize, &pc_tree->horizontal[0], - INT64_MAX); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_HORZ, subsize, &pc_tree->horizontal[0], INT64_MAX, + 0); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + hbs < cm->mi_rows) { RD_STATS tmp_rdc; @@ -1810,9 +1788,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1); encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, NULL); - rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, - PARTITION_HORZ, subsize, &pc_tree->horizontal[1], - INT64_MAX); + pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, + PARTITION_HORZ, subsize, &pc_tree->horizontal[1], + INT64_MAX, 0); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&last_part_rdc); break; @@ -1823,9 +1801,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, } break; case PARTITION_VERT: - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - PARTITION_VERT, subsize, &pc_tree->vertical[0], - INT64_MAX); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_VERT, subsize, &pc_tree->vertical[0], INT64_MAX, + 0); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + hbs < cm->mi_cols) { RD_STATS tmp_rdc; @@ -1834,9 +1812,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1); encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col, subsize, NULL); - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, - PARTITION_VERT, subsize, - &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, 
&tmp_rdc, + PARTITION_VERT, subsize, + &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX, 0); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { av1_invalid_rd_stats(&last_part_rdc); break; @@ -1910,9 +1888,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); pc_tree->split[i]->partitioning = PARTITION_NONE; - rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, - &tmp_rdc, PARTITION_SPLIT, split_subsize, - &pc_tree->split[i]->none, INT64_MAX); + pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc, + PARTITION_SPLIT, split_subsize, &pc_tree->split[i]->none, + INT64_MAX, 0); restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { @@ -1973,67 +1951,170 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td, *dist = chosen_rdc.dist; } -/* clang-format off */ -static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = { - BLOCK_4X4, // 4x4 - BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 4x8, 8x4, 8x8 - BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 8x16, 16x8, 16x16 - BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32 - BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64 - BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 64x128, 128x64, 128x128 - BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32 - BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16 -}; +// TODO(kyslov): now this is very similar to rd_use_partition (except that +// doesn't do extra search arounf suggested partitioning) +// consider passing a flag to select non-rd path (similar to +// encode_sb_row) +static void nonrd_use_partition(AV1_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, MB_MODE_INFO **mib, + TOKENEXTRA **tp, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *rate, int64_t *dist, + int do_recon, PC_TREE *pc_tree) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + 
TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int bs = mi_size_wide[bsize]; + const int hbs = bs / 2; + int i; + const int pl = (bsize >= BLOCK_8X8) + ? partition_plane_context(xd, mi_row, mi_col, bsize) + : 0; + const PARTITION_TYPE partition = + (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize) + : PARTITION_NONE; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition); + RD_SEARCH_MACROBLOCK_CONTEXT x_ctx; + RD_STATS last_part_rdc; + PICK_MODE_CONTEXT *ctx_none = &pc_tree->none; -static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = { - BLOCK_8X8, // 4x4 - BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 4x8, 8x4, 8x8 - BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, // 8x16, 16x8, 16x16 - BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32 - BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 32x64, 64x32, 64x64 - BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 64x128, 128x64, 128x128 - BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 4x16, 16x4, 8x32 - BLOCK_32X32, BLOCK_LARGEST, BLOCK_LARGEST, // 32x8, 16x64, 64x16 -}; + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; -// Next square block size less or equal than current block size. -static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = { - BLOCK_4X4, // 4x4 - BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x8, 8x4, 8x8 - BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 8x16, 16x8, 16x16 - BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32 - BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64 - BLOCK_64X64, BLOCK_64X64, BLOCK_128X128, // 64x128, 128x64, 128x128 - BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32 - BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16 -}; -/* clang-format on */ - -// Look at all the mode_info entries for blocks that are part of this -// partition and find the min and max values for sb_type. 
-// At the moment this is designed to work on a superblock but could be -// adjusted to use a size parameter. -// -// The min and max are assumed to have been initialized prior to calling this -// function so repeat calls can accumulate a min and max of more than one -// superblock. -static void get_sb_partition_size_range(const AV1_COMMON *const cm, - MACROBLOCKD *xd, MB_MODE_INFO **mib, - BLOCK_SIZE *min_block_size, - BLOCK_SIZE *max_block_size) { - int i, j; - int index = 0; - - // Check the sb_type for each block that belongs to this region. - for (i = 0; i < cm->seq_params.mib_size; ++i) { - for (j = 0; j < cm->seq_params.mib_size; ++j) { - MB_MODE_INFO *mi = mib[index + j]; - BLOCK_SIZE sb_type = mi ? mi->sb_type : BLOCK_4X4; - *min_block_size = AOMMIN(*min_block_size, sb_type); - *max_block_size = AOMMAX(*max_block_size, sb_type); - } - index += xd->mi_stride; + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + + av1_invalid_rd_stats(&last_part_rdc); + + pc_tree->partitioning = partition; + + xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col; + xd->left_txfm_context = + xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); + save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + if (bsize == BLOCK_16X16 && cpi->vaq_refresh) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + x->mb_energy = av1_log_block_var(cpi, x, bsize); } + + switch (partition) { + case PARTITION_NONE: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_NONE, bsize, ctx_none, INT64_MAX, 1); + break; + case PARTITION_HORZ: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_HORZ, subsize, &pc_tree->horizontal[0], INT64_MAX, + 1); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_row + hbs < cm->mi_rows) { + RD_STATS tmp_rdc; + const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0]; + av1_init_rd_stats(&tmp_rdc); + update_state(cpi, tile_data, td, ctx_h, mi_row, 
mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, + mi_col, subsize, NULL); + pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc, + PARTITION_HORZ, subsize, &pc_tree->horizontal[1], + INT64_MAX, 1); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + case PARTITION_VERT: + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, + PARTITION_VERT, subsize, &pc_tree->vertical[0], INT64_MAX, + 1); + if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && + mi_col + hbs < cm->mi_cols) { + RD_STATS tmp_rdc; + const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0]; + av1_init_rd_stats(&tmp_rdc); + update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1); + encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, + mi_col, subsize, NULL); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc, + PARTITION_VERT, subsize, + &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX, 1); + if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + last_part_rdc.rdcost += tmp_rdc.rdcost; + } + break; + case PARTITION_SPLIT: + last_part_rdc.rate = 0; + last_part_rdc.dist = 0; + last_part_rdc.rdcost = 0; + for (i = 0; i < 4; i++) { + int x_idx = (i & 1) * hbs; + int y_idx = (i >> 1) * hbs; + int jj = i >> 1, ii = i & 0x01; + RD_STATS tmp_rdc; + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) + continue; + + av1_init_rd_stats(&tmp_rdc); + nonrd_use_partition( + cpi, td, tile_data, mib + jj * hbs * cm->mi_stride + ii * hbs, tp, + mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate, + &tmp_rdc.dist, i != 3, pc_tree->split[i]); + if 
(tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { + av1_invalid_rd_stats(&last_part_rdc); + break; + } + last_part_rdc.rate += tmp_rdc.rate; + last_part_rdc.dist += tmp_rdc.dist; + } + break; + case PARTITION_VERT_A: + case PARTITION_VERT_B: + case PARTITION_HORZ_A: + case PARTITION_HORZ_B: + case PARTITION_HORZ_4: + case PARTITION_VERT_4: + assert(0 && "Cannot handle extended partition types"); + default: assert(0); break; + } + + if (last_part_rdc.rate < INT_MAX) { + last_part_rdc.rate += x->partition_cost[pl][partition]; + last_part_rdc.rdcost = + RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist); + } + + restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); + + // We must have chosen a partitioning and encoding or we'll fail later on. + // No other opportunities for success. + if (bsize == cm->seq_params.sb_size) + assert(last_part_rdc.rate < INT_MAX && last_part_rdc.dist < INT64_MAX); + + if (do_recon) { + if (bsize == cm->seq_params.sb_size) { + // NOTE: To get estimate for rate due to the tokens, use: + // int rate_coeffs = 0; + // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS, + // bsize, pc_tree, &rate_coeffs); + x->cb_offset = 0; + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize, + pc_tree, NULL); + } else { + encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize, + pc_tree, NULL); + } + } + + *rate = last_part_rdc.rate; + *dist = last_part_rdc.dist; } // Checks to see if a super block is on a horizontal image edge. @@ -2090,234 +2171,6 @@ static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) { return is_active_v_edge; } -// Checks to see if a super block is at the edge of the active image. -// In most cases this is the "real" edge unless there are formatting -// bars embedded in the stream. 
-static int active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) { - return active_h_edge(cpi, mi_row, cpi->common.seq_params.mib_size) || - active_v_edge(cpi, mi_col, cpi->common.seq_params.mib_size); -} - -// Performs a motion search in SIMPLE_TRANSLATION mode using -// reference frame ref. Returns the sad of the result -static void simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, - int mi_col, BLOCK_SIZE bsize, int ref, - int num_planes, int use_subpixel) { - AV1_COMMON *const cm = &cpi->common; - MACROBLOCKD *xd = &x->e_mbd; - MB_MODE_INFO *mbmi = xd->mi[0]; - - mbmi->ref_frame[0] = ref; - mbmi->ref_frame[1] = NONE_FRAME; - mbmi->sb_type = bsize; - mbmi->motion_mode = SIMPLE_TRANSLATION; - - YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref); - const YV12_BUFFER_CONFIG *scaled_ref_frame = - av1_get_scaled_ref_frame(cpi, ref); - struct buf_2d backup_yv12; - // ref_mv is in units of 1/8-pel whereas ref_mv_full is in units of pel - MV ref_mv = { 0, 0 }; - MV ref_mv_full = { 0, 0 }; - const int step_param = cpi->mv_step_param; - const MvLimits tmp_mv_limits = x->mv_limits; - const SEARCH_METHODS search_methods = NSTEP; - const int do_mesh_search = 0; - const int sadpb = x->sadperbit16; - int cost_list[5]; - const int ref_idx = 0; - int var; - - av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); - - if (scaled_ref_frame) { - backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx]; - av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, - num_planes); - } else { - av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col, - &cm->current_frame.frame_refs[ref - LAST_FRAME].sf, - num_planes); - } - - // This overwrites the mv_limits so we will need to restore it later. 
- av1_set_mv_search_range(&x->mv_limits, &ref_mv); - var = av1_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, - search_methods, do_mesh_search, sadpb, - cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, - 1, mi_col * MI_SIZE, mi_row * MI_SIZE, 0); - // Restore - x->mv_limits = tmp_mv_limits; - - const int use_subpel_search = - var < INT_MAX && !cpi->common.cur_frame_force_integer_mv && use_subpixel; - if (use_subpel_search) { - int not_used = 0; - if (cpi->sf.use_accurate_subpel_search) { - const int pw = block_size_wide[bsize]; - const int ph = block_size_high[bsize]; - cpi->find_fractional_mv_step( - x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmv_vec_cost, x->mv_cost_stack, ¬_used, &x->pred_sse[ref], NULL, - NULL, 0, 0, pw, ph, cpi->sf.use_accurate_subpel_search, 1); - } else { - cpi->find_fractional_mv_step( - x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmv_vec_cost, x->mv_cost_stack, ¬_used, &x->pred_sse[ref], NULL, - NULL, 0, 0, 0, 0, 0, 1); - } - } else { - // Manually convert from units of pixel to 1/8-pixels if we are not doing - // subpel search - x->best_mv.as_mv.row *= 8; - x->best_mv.as_mv.col *= 8; - } - - mbmi->mv[0].as_mv = x->best_mv.as_mv; - - // Get a copy of the prediction output - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); - - aom_clear_system_state(); - - if (scaled_ref_frame) { - xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12; - } -} - -// Look at neighboring blocks and set a min and max partition size based on -// what they chose. 
-static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile, - MACROBLOCKD *const xd, int mi_row, - int mi_col, BLOCK_SIZE *min_block_size, - BLOCK_SIZE *max_block_size) { - AV1_COMMON *const cm = &cpi->common; - MB_MODE_INFO **mi = xd->mi; - const int left_in_image = xd->left_available && mi[-1]; - const int above_in_image = xd->up_available && mi[-xd->mi_stride]; - const int mi_rows_remaining = tile->mi_row_end - mi_row; - const int mi_cols_remaining = tile->mi_col_end - mi_col; - int bh, bw; - BLOCK_SIZE min_size = BLOCK_4X4; - BLOCK_SIZE max_size = BLOCK_LARGEST; - - // Trap case where we do not have a prediction. - if (left_in_image || above_in_image || - cm->current_frame.frame_type != KEY_FRAME) { - // Default "min to max" and "max to min" - min_size = BLOCK_LARGEST; - max_size = BLOCK_4X4; - - // NOTE: each call to get_sb_partition_size_range() uses the previous - // passed in values for min and max as a starting point. - // Find the min and max partition used in previous frame at this location - if (cm->current_frame.frame_type != KEY_FRAME) { - MB_MODE_INFO **prev_mi = - &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col]; - get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size); - } - // Find the min and max partition sizes used in the left superblock - if (left_in_image) { - MB_MODE_INFO **left_sb_mi = &mi[-cm->seq_params.mib_size]; - get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size); - } - // Find the min and max partition sizes used in the above suprblock. - if (above_in_image) { - MB_MODE_INFO **above_sb_mi = - &mi[-xd->mi_stride * cm->seq_params.mib_size]; - get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size); - } - - // Adjust observed min and max for "relaxed" auto partition case. 
- if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) { - min_size = min_partition_size[min_size]; - max_size = max_partition_size[max_size]; - } - } - - // Check border cases where max and min from neighbors may not be legal. - max_size = find_partition_size(max_size, mi_rows_remaining, mi_cols_remaining, - &bh, &bw); - min_size = AOMMIN(min_size, max_size); - - // Test for blocks at the edge of the active image. - // This may be the actual edge of the image or where there are formatting - // bars. - if (active_edge_sb(cpi, mi_row, mi_col)) { - min_size = BLOCK_4X4; - } else { - min_size = AOMMIN(cpi->sf.rd_auto_partition_min_limit, min_size); - } - - // When use_square_partition_only is true, make sure at least one square - // partition is allowed by selecting the next smaller square size as - // *min_block_size. - if (min_size >= cpi->sf.use_square_partition_only_threshold) { - min_size = AOMMIN(min_size, next_square_size[max_size]); - } - - *min_block_size = AOMMIN(min_size, cm->seq_params.sb_size); - *max_block_size = AOMMIN(max_size, cm->seq_params.sb_size); -} - -// TODO(jingning) refactor functions setting partition search range -static void set_partition_range(const AV1_COMMON *const cm, - const MACROBLOCKD *const xd, int mi_row, - int mi_col, BLOCK_SIZE bsize, - BLOCK_SIZE *const min_bs, - BLOCK_SIZE *const max_bs) { - const int mi_width = mi_size_wide[bsize]; - const int mi_height = mi_size_high[bsize]; - int idx, idy; - - const int idx_str = cm->mi_stride * mi_row + mi_col; - MB_MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str]; - BLOCK_SIZE min_size = cm->seq_params.sb_size; // default values - BLOCK_SIZE max_size = BLOCK_4X4; - - if (prev_mi) { - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - const MB_MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx]; - const BLOCK_SIZE bs = mi ? 
mi->sb_type : bsize; - min_size = AOMMIN(min_size, bs); - max_size = AOMMAX(max_size, bs); - } - } - } - - if (xd->left_available) { - for (idy = 0; idy < mi_height; ++idy) { - const MB_MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1]; - const BLOCK_SIZE bs = mi ? mi->sb_type : bsize; - min_size = AOMMIN(min_size, bs); - max_size = AOMMAX(max_size, bs); - } - } - - if (xd->up_available) { - for (idx = 0; idx < mi_width; ++idx) { - const MB_MODE_INFO *const mi = xd->mi[idx - cm->mi_stride]; - const BLOCK_SIZE bs = mi ? mi->sb_type : bsize; - min_size = AOMMIN(min_size, bs); - max_size = AOMMAX(max_size, bs); - } - } - - if (min_size == max_size) { - min_size = min_partition_size[min_size]; - max_size = max_partition_size[max_size]; - } - - *min_bs = AOMMIN(min_size, cm->seq_params.sb_size); - *max_bs = AOMMIN(max_size, cm->seq_params.sb_size); -} - static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); } @@ -2327,56 +2180,6 @@ static INLINE void load_pred_mv(MACROBLOCK *x, memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); } -#if CONFIG_FP_MB_STATS -const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { - 0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120, - // TODO(debargha): What are the correct numbers here? - 130, 130, 150 -}; -const int qindex_split_threshold_lookup[BLOCK_SIZES] = { - 0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120, - // TODO(debargha): What are the correct numbers here? - 160, 160, 240 -}; -const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6, - // TODO(debargha): What are the correct numbers here? 
- 8, 8, 10 -}; - -typedef enum { - MV_ZERO = 0, - MV_LEFT = 1, - MV_UP = 2, - MV_RIGHT = 3, - MV_DOWN = 4, - MV_INVALID -} MOTION_DIRECTION; - -static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) { - if (fp_byte & FPMB_MOTION_ZERO_MASK) { - return MV_ZERO; - } else if (fp_byte & FPMB_MOTION_LEFT_MASK) { - return MV_LEFT; - } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) { - return MV_RIGHT; - } else if (fp_byte & FPMB_MOTION_UP_MASK) { - return MV_UP; - } else { - return MV_DOWN; - } -} - -static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, - MOTION_DIRECTION that_mv) { - if (this_mv == that_mv) { - return 0; - } else { - return abs(this_mv - that_mv) == 2 ? 2 : 1; - } -} -#endif - // Try searching for an encoding for the given subblock. Returns zero if the // rdcost is already too high (to tell the caller not to bother searching for // encodings of further subblocks) @@ -2398,9 +2201,9 @@ static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td, ? INT64_MAX : (best_rdc->rdcost - sum_rdc->rdcost); - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, - RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx, - rdcost_remaining); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc, + RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx, + rdcost_remaining, 0); if (this_rdc->rate == INT_MAX) { sum_rdc->rdcost = INT64_MAX; @@ -2616,8 +2419,8 @@ static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td, const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX ? 
INT64_MAX : (best_rdc.rdcost - partition_rd_cost); - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, - PARTITION_NONE, bsize, ctx_none, best_remain_rdcost); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE, + bsize, ctx_none, best_remain_rdcost, 0); pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost; pc_tree->pc_tree_stats.skip = ctx_none->skip; @@ -2669,6 +2472,17 @@ static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td, do_square_split = 0; } } + + if (cpi->sf.firstpass_simple_motion_search_early_term && + cm->show_frame && bsize <= BLOCK_32X32 && bsize >= BLOCK_8X8 && + !frame_is_intra_only(cm) && mi_row + mi_step < cm->mi_rows && + mi_col + mi_step < cm->mi_cols && this_rdc.rdcost < INT64_MAX && + this_rdc.rdcost >= 0 && this_rdc.rate < INT_MAX && + this_rdc.rate >= 0 && do_square_split) { + av1_firstpass_simple_motion_search_early_term( + cpi, x, pc_tree, mi_row, mi_col, bsize, &this_rdc, + &do_square_split); + } } } @@ -2788,79 +2602,9 @@ static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td, } } -#define FEATURE_SIZE 19 -static const float two_pass_split_partition_weights_128[FEATURE_SIZE + 1] = { - 2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f, - 0.125296f, -1.134961f, 0.862757f, -0.418799f, -0.637666f, - 0.016232f, 0.345013f, 0.018823f, -0.393394f, -1.130700f, - 0.695357f, 0.112569f, -0.341975f, -0.513882f, 5.7488966f, -}; - -static const float two_pass_split_partition_weights_64[FEATURE_SIZE + 1] = { - 2.990993f, 0.423273f, -0.926544f, 0.454646f, -0.292698f, - -1.311632f, -0.284432f, 0.717141f, -0.419257f, -0.574760f, - -0.674444f, 0.669047f, -0.374255f, 0.380624f, -0.804036f, - 0.264021f, 0.004163f, 1.896802f, 0.924287f, 0.13490619f, -}; - -static const float two_pass_split_partition_weights_32[FEATURE_SIZE + 1] = { - 2.795181f, -0.136943f, -0.924842f, 0.405330f, -0.463505f, - -0.584076f, -0.831472f, 0.382985f, -0.597544f, -0.138915f, - -1.354350f, 0.466035f, -0.553961f, 
0.213202f, -1.166429f, - 0.010776f, -0.096236f, 2.335084f, 1.699857f, -0.58178353f, -}; - -static const float two_pass_split_partition_weights_16[FEATURE_SIZE + 1] = { - 1.987888f, -0.431100f, -1.687703f, 0.262602f, -0.425298f, - -0.463870f, -1.493457f, 0.470917f, -0.528457f, -0.087700f, - -1.815092f, 0.152883f, -0.337908f, 0.093679f, -1.548267f, - -0.042387f, -0.000861f, 2.556746f, 1.619192f, 0.03643292f, -}; - -static const float two_pass_split_partition_weights_8[FEATURE_SIZE + 1] = { - 2.188344f, -0.817528f, -2.119219f, 0.000000f, -0.348167f, - -0.658074f, -1.960362f, 0.000000f, -0.403080f, 0.282699f, - -2.061088f, 0.000000f, -0.431919f, -0.127960f, -1.099550f, - 0.000000f, 0.121622f, 2.017455f, 2.058228f, -0.15475988f, -}; - -static const float two_pass_none_partition_weights_128[FEATURE_SIZE + 1] = { - -1.006689f, 0.777908f, 4.461072f, -0.395782f, -0.014610f, - -0.853863f, 0.729997f, -0.420477f, 0.282429f, -1.194595f, - 3.181220f, -0.511416f, 0.117084f, -1.149348f, 1.507990f, - -0.477212f, 0.202963f, -1.469581f, 0.624461f, -0.89081228f, -}; - -static const float two_pass_none_partition_weights_64[FEATURE_SIZE + 1] = { - -1.241117f, 0.844878f, 5.638803f, -0.489780f, -0.108796f, - -4.576821f, 1.540624f, -0.477519f, 0.227791f, -1.443968f, - 1.586911f, -0.505125f, 0.140764f, -0.464194f, 1.466658f, - -0.641166f, 0.195412f, 1.427905f, 2.080007f, -1.98272777f, -}; - -static const float two_pass_none_partition_weights_32[FEATURE_SIZE + 1] = { - -2.130825f, 0.476023f, 5.907343f, -0.516002f, -0.097471f, - -2.662754f, 0.614858f, -0.576728f, 0.085261f, -0.031901f, - 0.727842f, -0.600034f, 0.079326f, 0.324328f, 0.504502f, - -0.547105f, -0.037670f, 0.304995f, 0.369018f, -2.66299987f, -}; - -static const float two_pass_none_partition_weights_16[FEATURE_SIZE + 1] = { - -1.626410f, 0.872047f, 5.414965f, -0.554781f, -0.084514f, - -3.020550f, 0.467632f, -0.382280f, 0.199568f, 0.426220f, - 0.829426f, -0.467100f, 0.153098f, 0.662994f, 0.327545f, - -0.560106f, -0.141610f, 
0.403372f, 0.523991f, -3.02891231f, -}; - -static const float two_pass_none_partition_weights_8[FEATURE_SIZE + 1] = { - -1.463349f, 0.375376f, 4.751430f, 0.000000f, -0.184451f, - -1.655447f, 0.443214f, 0.000000f, 0.127961f, 0.152435f, - 0.083288f, 0.000000f, 0.143105f, 0.438012f, 0.073238f, - 0.000000f, -0.278137f, 0.186134f, 0.073737f, -1.6494962f, -}; - // split_score indicates confidence of picking split partition; // none_score indicates confidence of picking none partition; +#define FEATURE_SIZE 19 static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats, BLOCK_SIZE bsize, int *split_score, int *none_score) { @@ -2870,24 +2614,24 @@ static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats, switch (bsize) { case BLOCK_4X4: break; case BLOCK_8X8: - split_weights = two_pass_split_partition_weights_8; - none_weights = two_pass_none_partition_weights_8; + split_weights = av1_2pass_split_partition_weights_8; + none_weights = av1_2pass_none_partition_weights_8; break; case BLOCK_16X16: - split_weights = two_pass_split_partition_weights_16; - none_weights = two_pass_none_partition_weights_16; + split_weights = av1_2pass_split_partition_weights_16; + none_weights = av1_2pass_none_partition_weights_16; break; case BLOCK_32X32: - split_weights = two_pass_split_partition_weights_32; - none_weights = two_pass_none_partition_weights_32; + split_weights = av1_2pass_split_partition_weights_32; + none_weights = av1_2pass_none_partition_weights_32; break; case BLOCK_64X64: - split_weights = two_pass_split_partition_weights_64; - none_weights = two_pass_none_partition_weights_64; + split_weights = av1_2pass_split_partition_weights_64; + none_weights = av1_2pass_none_partition_weights_64; break; case BLOCK_128X128: - split_weights = two_pass_split_partition_weights_128; - none_weights = two_pass_none_partition_weights_128; + split_weights = av1_2pass_split_partition_weights_128; + none_weights = av1_2pass_none_partition_weights_128; break; 
default: assert(0 && "Unexpected bsize."); } @@ -2981,7 +2725,7 @@ static void ml_prune_rect_partition(const AV1_COMP *const cpi, // Variance ratios const MACROBLOCKD *const xd = &x->e_mbd; int whole_block_variance; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { whole_block_variance = av1_high_get_sby_perpixel_variance( cpi, &x->plane[0].src, bsize, xd->bd); } else { @@ -2999,7 +2743,7 @@ static void ml_prune_rect_partition(const AV1_COMP *const cpi, const int x_idx = (i & 1) * bw / 2; const int y_idx = (i >> 1) * bw / 2; buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { split_variance[i] = av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd); } else { @@ -3181,7 +2925,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x, src + i * block_size_high[horz_4_bs] * src_stride; const uint8_t *vert_src = src + i * block_size_wide[vert_4_bs]; unsigned int horz_var, vert_var, sse; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { switch (xd->bd) { case 10: horz_var = cpi->fn_ptr[horz_4_bs].vf( @@ -3340,204 +3084,32 @@ static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize, } #undef FEATURES -#if CONFIG_ONE_PASS_SVM -#define FEATURES 24 -static void ml_op_svm_early_term(const AV1_COMP *const cpi, - const MACROBLOCK *const x, - const MACROBLOCKD *const xd, - const PICK_MODE_CONTEXT *ctx_none, - const RD_STATS *none_rdc, int pb_source_var, - BLOCK_SIZE bsize, float *const score) { - const float *ml_weights = NULL, *ml_mean = NULL, *ml_std = NULL; - if (bsize == BLOCK_128X128) { - ml_weights = av1_op_svm_early_term_weights_128; - ml_mean = av1_op_svm_early_term_mean_128; - ml_std = av1_op_svm_early_term_std_128; - } else if (bsize == BLOCK_64X64) { - ml_weights = av1_op_svm_early_term_weights_64; - ml_mean = av1_op_svm_early_term_mean_64; - ml_std = 
av1_op_svm_early_term_std_64; - } else if (bsize == BLOCK_32X32) { - ml_weights = av1_op_svm_early_term_weights_32; - ml_mean = av1_op_svm_early_term_mean_32; - ml_std = av1_op_svm_early_term_std_32; - } else if (bsize == BLOCK_16X16) { - ml_weights = av1_op_svm_early_term_weights_16; - ml_mean = av1_op_svm_early_term_mean_16; - ml_std = av1_op_svm_early_term_std_16; - } else { - assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 || - bsize == BLOCK_32X32 || bsize == BLOCK_8X8); - } - if (ml_weights != NULL) { - // Compute some features - - float features[FEATURES] = { 0 }; - int f_idx = 0; - int r_idx = 0; - - // None features - // Get none stats - features[f_idx++] = none_rdc->rate; - features[f_idx++] = none_rdc->dist; - features[f_idx++] = none_rdc->rdcost; - features[f_idx++] = ctx_none->skip; - - // EOBS - features[f_idx++] = none_rdc->eob; - int scaled_eob = none_rdc->eob * 32 * 32; - features[f_idx++] = (1.0f + none_rdc->eob_0) / (4.0f + scaled_eob); - features[f_idx++] = (1.0f + none_rdc->eob_1) / (4.0f + scaled_eob); - features[f_idx++] = (1.0f + none_rdc->eob_2) / (4.0f + scaled_eob); - features[f_idx++] = (1.0f + none_rdc->eob_3) / (4.0f + scaled_eob); - - // Y_RD - features[f_idx++] = none_rdc->rd; - int64_t scaled_rd = none_rdc->rd * 32 * 32; - features[f_idx++] = (1.0f + none_rdc->rd_0) / (4.0f + scaled_rd); - features[f_idx++] = (1.0f + none_rdc->rd_1) / (4.0f + scaled_rd); - features[f_idx++] = (1.0f + none_rdc->rd_2) / (4.0f + scaled_rd); - features[f_idx++] = (1.0f + none_rdc->rd_3) / (4.0f + scaled_rd); - - // Q_SQUARED - features[f_idx++] = - (x->plane[0].dequant_QTX[0]) * (x->plane[0].dequant_QTX[0]); - - // SIZE - // Get size of surrounding blocks - int above_size = 18, left_size = 18; - const MB_MODE_INFO *above_block = xd->above_mbmi; - const MB_MODE_INFO *left_block = xd->left_mbmi; - - if (above_block) { - above_size = above_block->sb_type; - } - if (left_block) { - left_size = left_block->sb_type; - } - - features[f_idx++] = 
left_size; - features[f_idx++] = left_size != 18; - - features[f_idx++] = above_size; - features[f_idx++] = above_size != 18; - - // Variance - // Get variance - int var = pb_source_var, var_reg[4] = { 0 }; - const int bw = block_size_wide[bsize]; - const int bh = block_size_high[bsize]; - const BLOCK_SIZE split_size = get_partition_subsize(bsize, PARTITION_SPLIT); - struct buf_2d buf; - buf.stride = x->plane[0].src.stride; - for (int i = 0; i < 4; ++i) { - const int x_idx = (i & 1) * bw / 2; - const int y_idx = (i >> 1) * bh / 2; - buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - var_reg[i] = - av1_high_get_sby_perpixel_variance(cpi, &buf, split_size, xd->bd); - } else { - var_reg[i] = av1_get_sby_perpixel_variance(cpi, &buf, split_size); - } - } - - features[f_idx++] = var; - for (r_idx = 0; r_idx < 4; r_idx++) { - features[f_idx] = (var_reg[r_idx] + 1.0f) / (var + 4.0f); - f_idx++; - } - - assert(f_idx == FEATURES); - - // Calculate the score - *score = 0.0f; - for (f_idx = 0; f_idx < FEATURES; f_idx++) { - *score += ml_weights[f_idx] * (features[f_idx] - ml_mean[f_idx]) / - ml_std[f_idx]; - } - // Dont forget the bias - *score += ml_weights[FEATURES]; - } -} -#undef FEATURES -#endif - -// Performs a full_pixel_motion_search with a single reference frame and extract -// the variance of residues. Here features is assumed to be a length 6 array. -// After this function is called, we will store the following in to features: -// features[0] = log(1 + dc_q**2/256) -// features[1] = log(1 + variance_of_residue) -// for i in [2, 3, 4, 5]: -// features[i] = log(1 + variance_of_residue_in_block[i]/variance_of_residue) -static void get_res_var_features(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, - int mi_col, BLOCK_SIZE bsize, - float *features) { - // TODO(chiyotsai@google.com): The data this model trained on did not also use - // SIMPLE_TRANSLATION to build the inter_predictor. 
Retraining and tuning the - // model with the correct data should give better performance. +// Record the ref frames that have been selected by square partition blocks. +static void update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type, + BLOCK_SIZE bsize, int mib_size, + int mi_row, int mi_col) { assert(mi_size_wide[bsize] == mi_size_high[bsize]); - - MACROBLOCKD *xd = &x->e_mbd; - DECLARE_ALIGNED(16, uint16_t, pred_buffer[MAX_SB_SQUARE]); - int pred_stride = 128; - - // Perform a single motion search in Y_PLANE to make a prediction - const MV_REFERENCE_FRAME ref = - cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME; - const int use_subpixel = 0; - const int num_planes = 1; - - uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - ? CONVERT_TO_BYTEPTR(pred_buffer) - : (uint8_t *)pred_buffer; - xd->plane[0].dst.buf = pred_buf; - xd->plane[0].dst.stride = pred_stride; - - simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, num_planes, - use_subpixel); - - // Start getting the features - int f_idx = 0; - - // Q_INDEX - const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); - features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f); - - // VARIANCE - const uint8_t *src = x->plane[0].src.buf; - const int src_stride = x->plane[0].src.stride; - unsigned int sse = 0; - - // Whole block - const unsigned int var = - cpi->fn_ptr[bsize].vf(src, src_stride, pred_buf, pred_stride, &sse); - features[f_idx++] = logf(1.0f + (float)var); - - // Regional - const int bw = block_size_wide[bsize]; - const int bh = block_size_high[bsize]; - const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); - int r_idx = 0; - for (r_idx = 0; r_idx < 4; r_idx++) { - const int x_idx = (r_idx & 1) * bw / 2; - const int y_idx = (r_idx >> 1) * bh / 2; - const int src_offset = y_idx * src_stride + x_idx; - const int pred_offset = y_idx * pred_stride + x_idx; - const unsigned int sub_var = - cpi->fn_ptr[subsize].vf(src + 
src_offset, src_stride, - pred_buf + pred_offset, pred_stride, &sse); - const float var_ratio = (1.0f + (float)sub_var) / (4.0f + (float)var); - features[f_idx++] = var_ratio; + const int sb_size_mask = mib_size - 1; + const int mi_row_in_sb = mi_row & sb_size_mask; + const int mi_col_in_sb = mi_col & sb_size_mask; + const int mi_size = mi_size_wide[bsize]; + for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) { + for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) { + x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type; + } } } -// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are +// TODO(jinging,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. +// TODO(chiyotsai@google.com): Move these ml related varables to a seprate file +// to separate low level ml logic from partition logic +#define NUM_SIMPLE_MOTION_FEATURES 28 static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, + BLOCK_SIZE max_sq_part, BLOCK_SIZE min_sq_part, RD_STATS *rd_cost, int64_t best_rd, PC_TREE *pc_tree, int64_t *none_rd) { const AV1_COMMON *const cm = &cpi->common; @@ -3560,11 +3132,14 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, const int *partition_cost = pl >= 0 ? 
x->partition_cost[pl] : x->partition_cost[0]; - int do_rectangular_split = 1; + int do_rectangular_split = cpi->oxcf.enable_rect_partitions; int64_t cur_none_rd = 0; int64_t split_rd[4] = { 0, 0, 0, 0 }; int64_t horz_rd[2] = { 0, 0 }; int64_t vert_rd[2] = { 0, 0 }; + int prune_horz = 0; + int prune_vert = 0; + int terminate_partition_search = 0; int split_ctx_is_ready[2] = { 0, 0 }; int horz_ctx_is_ready = 0; @@ -3585,22 +3160,26 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, const int xss = x->e_mbd.plane[1].subsampling_x; const int yss = x->e_mbd.plane[1].subsampling_y; - BLOCK_SIZE min_size = x->min_partition_size; - BLOCK_SIZE max_size = x->max_partition_size; - if (none_rd) *none_rd = 0; - -#if CONFIG_FP_MB_STATS - unsigned int src_diff_var = UINT_MAX; - int none_complexity = 0; -#endif - int partition_none_allowed = has_rows && has_cols; - int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8; - int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8; + int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 && + cpi->oxcf.enable_rect_partitions; + int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 && + cpi->oxcf.enable_rect_partitions; (void)*tp_orig; +#if CONFIG_COLLECT_PARTITION_STATS + int partition_decisions[EXT_PARTITION_TYPES] = { 0 }; + int partition_attempts[EXT_PARTITION_TYPES] = { 0 }; + int64_t partition_times[EXT_PARTITION_TYPES] = { 0 }; + struct aom_usec_timer partition_timer = { 0 }; + int partition_timer_on = 0; +#if CONFIG_COLLECT_PARTITION_STATS == 2 + PartitionStats *part_stats = &cpi->partition_stats; +#endif +#endif + // Override partition costs at the edges of the frame in the same // way as in read_partition (see decodeframe.c) if (!(has_rows && has_cols)) { @@ -3625,6 +3204,7 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, } partition_cost = tmp_partition_cost; + do_square_split &= partition_cost[PARTITION_SPLIT] 
!= INT_MAX; } #ifndef NDEBUG @@ -3647,35 +3227,12 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, if (bsize == BLOCK_16X16 && cpi->vaq_refresh) x->mb_energy = av1_log_block_var(cpi, x, bsize); - if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { - const int cb_partition_search_ctrl = - ((pc_tree->index == 0 || pc_tree->index == 3) + - get_chessboard_index(cm->current_frame.frame_number)) & - 0x1; - - if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size) - set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); - } - - // Determine partition types in search according to the speed features. - // The threshold set here has to be of square block size. - if (cpi->sf.auto_min_max_partition_size) { - const int no_partition_allowed = (bsize <= max_size && bsize >= min_size); - // Note: Further partitioning is NOT allowed when bsize == min_size already. - const int partition_allowed = (bsize <= max_size && bsize > min_size); - partition_none_allowed &= no_partition_allowed; - partition_horz_allowed &= partition_allowed || !has_rows; - partition_vert_allowed &= partition_allowed || !has_cols; - do_square_split &= bsize > min_size; - } - if (bsize > cpi->sf.use_square_partition_only_threshold) { partition_horz_allowed &= !has_rows; partition_vert_allowed &= !has_cols; } - if (bsize > BLOCK_4X4 && x->use_cb_search_range && - cpi->sf.auto_min_max_partition_size == 0) { + if (bsize > BLOCK_4X4 && x->use_cb_search_range) { int split_score = 0; int none_score = 0; const int score_valid = ml_prune_2pass_split_partition( @@ -3720,8 +3277,10 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, partition_horz_allowed == 0 && partition_vert_allowed == 0) { do_square_split = bsize_at_least_8x8; partition_none_allowed = has_rows && has_cols; - partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8; - partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8; + 
partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 && + cpi->oxcf.enable_rect_partitions; + partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 && + cpi->oxcf.enable_rect_partitions; } } @@ -3730,127 +3289,91 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td, xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK); save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row, - mi_col, bsize); - } - - // Decide whether we shall split directly and skip searching NONE by using - // the first pass block statistics - if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_square_split && - partition_none_allowed && src_diff_var > 4 && - cm->base_qindex < qindex_split_threshold_lookup[bsize]) { - int mb_row = mi_row >> 1; - int mb_col = mi_col >> 1; - int mb_row_end = - AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); - int mb_col_end = - AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); - int r, c; - - // compute a complexity measure, basically measure inconsistency of motion - // vectors obtained from the first pass in the current block - for (r = mb_row; r < mb_row_end; r++) { - for (c = mb_col; c < mb_col_end; c++) { - const int mb_index = r * cm->mb_cols + c; - - MOTION_DIRECTION this_mv; - MOTION_DIRECTION right_mv; - MOTION_DIRECTION bottom_mv; - - this_mv = - get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]); - - // to its right - if (c != mb_col_end - 1) { - right_mv = get_motion_direction_fp( - cpi->twopass.this_frame_mb_stats[mb_index + 1]); - none_complexity += get_motion_inconsistency(this_mv, right_mv); - } - - // to its bottom - if (r != mb_row_end - 1) { - bottom_mv = get_motion_direction_fp( - cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]); - 
none_complexity += get_motion_inconsistency(this_mv, bottom_mv); - } - - // do not count its left and top neighbors to avoid double counting - } - } - - if (none_complexity > complexity_16x16_blocks_threshold[bsize]) { - partition_none_allowed = 0; - } - } -#endif - - // Ref frames picked in the [i_th] quarter subblock during square partition - // RD search. It may be used to prune ref frame selection of rect partitions. - int ref_frames_used[4] = { - 0, - }; - - MB_MODE_INFO *split_mbmi[4] = { 0 }; - - // Perform a full_pixel_search and use the residue to estimate whether we - // should split directly. - // TODO(chiyotsai@google.com): Try the algorithm on hbd and speed 0. - // Also try pruning PARTITION_SPLIT - if (cpi->sf.full_pixel_motion_search_based_split && bsize >= BLOCK_8X8 && + // Use simple_motion_search to prune partitions. This must be done prior to + // PARTITION_SPLIT to propagate the initial mvs to a smaller blocksize. + const int try_split_only = + cpi->sf.simple_motion_search_split_only && bsize >= BLOCK_8X8 && do_square_split && mi_row + mi_size_high[bsize] <= cm->mi_rows && mi_col + mi_size_wide[bsize] <= cm->mi_cols && !frame_is_intra_only(cm) && - !cm->seq_params.enable_superres) { - const NN_CONFIG *nn_config = NULL; - float split_only_thresh = 0.0f; - if (bsize == BLOCK_128X128) { - nn_config = &full_pixel_motion_search_based_split_nn_config_128; - split_only_thresh = full_pixel_motion_search_based_split_thresh_128; - } else if (bsize == BLOCK_64X64) { - nn_config = &full_pixel_motion_search_based_split_nn_config_64; - split_only_thresh = full_pixel_motion_search_based_split_thresh_64; - } else if (bsize == BLOCK_32X32) { - nn_config = &full_pixel_motion_search_based_split_nn_config_32; - split_only_thresh = full_pixel_motion_search_based_split_thresh_32; - } else if (bsize == BLOCK_16X16) { - nn_config = &full_pixel_motion_search_based_split_nn_config_16; - split_only_thresh = full_pixel_motion_search_based_split_thresh_16; - } else if 
(bsize == BLOCK_8X8) { -#if !CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 - // Disable BLOCK_8X8 for now - nn_config = &full_pixel_motion_search_based_split_nn_config_8; - split_only_thresh = full_pixel_motion_search_based_split_thresh_8; -#endif - } else { - assert(0 && "Unexpected block size in full_pixel_motion_based_split"); - } - if (nn_config) { - float features[6] = { 0 }; - float score = 0; - get_res_var_features(cpi, x, mi_row, mi_col, bsize, features); - av1_nn_predict(features, nn_config, &score); - - if (score > split_only_thresh) { - partition_none_allowed = 0; - partition_horz_allowed = 0; - partition_vert_allowed = 0; - do_rectangular_split = 0; - } - } - } + !av1_superres_scaled(cm); + + if (try_split_only) { + av1_simple_motion_search_based_split( + cpi, x, mi_row, mi_col, bsize, &partition_none_allowed, + &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split, + &do_square_split); + } + + const int try_prune_rect = + cpi->sf.simple_motion_search_prune_rect && !frame_is_intra_only(cm) && + do_rectangular_split && + (do_square_split || partition_none_allowed || + (prune_horz && prune_vert)) && + (partition_horz_allowed || partition_vert_allowed) && bsize >= BLOCK_8X8; + + float simple_motion_features[NUM_SIMPLE_MOTION_FEATURES] = { 0.0f }; + int simple_motion_features_are_valid = 0; + + if (try_prune_rect) { + av1_simple_motion_search_prune_part( + cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_none_allowed, + &partition_horz_allowed, &partition_vert_allowed, &do_square_split, + &do_rectangular_split, &prune_horz, &prune_vert, simple_motion_features, + &simple_motion_features_are_valid); + } + + // Max and min square partition levels are defined as the partition nodes that + // the recursive function rd_pick_partition() can reach. To implement this: + // only PARTITION_NONE is allowed if the current node equals min_sq_part, + // only PARTITION_SPLIT is allowed if the current node exceeds max_sq_part. 
+ assert(block_size_wide[min_sq_part] == block_size_high[min_sq_part]); + assert(block_size_wide[max_sq_part] == block_size_high[max_sq_part]); + assert(min_sq_part <= max_sq_part); + assert(block_size_wide[bsize] == block_size_high[bsize]); + const int max_partition_size = block_size_wide[max_sq_part]; + const int min_partition_size = block_size_wide[min_sq_part]; + const int blksize = block_size_wide[bsize]; + assert(min_partition_size <= max_partition_size); + const int is_le_min_sq_part = blksize <= min_partition_size; + const int is_gt_max_sq_part = blksize > max_partition_size; + if (is_gt_max_sq_part) { + // If current block size is larger than max, only allow split. + partition_none_allowed = 0; + partition_horz_allowed = 0; + partition_vert_allowed = 0; + do_square_split = 1; + } else if (is_le_min_sq_part) { + // If current block size is less or equal to min, only allow none if valid + // block large enough; only allow split otherwise. + partition_horz_allowed = 0; + partition_vert_allowed = 0; + // only disable square split when current block is not at the picture + // boundary. otherwise, inherit the square split flag from previous logic + if (has_rows && has_cols) do_square_split = 0; + partition_none_allowed = !do_square_split; + } + do_square_split &= partition_cost[PARTITION_SPLIT] != INT_MAX; BEGIN_PARTITION_SEARCH: if (x->must_find_valid_partition) { + do_square_split = + bsize_at_least_8x8 && partition_cost[PARTITION_SPLIT] != INT_MAX; partition_none_allowed = has_rows && has_cols; - partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8; - partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8; + partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 && + cpi->oxcf.enable_rect_partitions; + partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 && + cpi->oxcf.enable_rect_partitions; + terminate_partition_search = 0; } // Partition block source pixel variance. 
unsigned int pb_source_variance = UINT_MAX; + // Partition block sse after simple motion compensation, not in use now, + // but will be used for upcoming speed features + unsigned int pb_simple_motion_pred_sse = UINT_MAX; + (void)pb_simple_motion_pred_sse; + #if CONFIG_DIST_8X8 if (x->using_dist_8x8) { if (block_size_high[bsize] <= 8) partition_horz_allowed = 0; @@ -3861,7 +3384,9 @@ BEGIN_PARTITION_SEARCH: #endif // PARTITION_NONE - if (partition_none_allowed) { + if (is_le_min_sq_part && has_rows && has_cols) partition_none_allowed = 1; + if (!terminate_partition_search && partition_none_allowed && + !is_gt_max_sq_part) { int pt_cost = 0; if (bsize_at_least_8x8) { pt_cost = partition_cost[PARTITION_NONE] < INT_MAX @@ -3872,17 +3397,32 @@ BEGIN_PARTITION_SEARCH: const int64_t best_remain_rdcost = (best_rdc.rdcost == INT64_MAX) ? INT64_MAX : (best_rdc.rdcost - partition_rd_cost); - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, - PARTITION_NONE, bsize, ctx_none, best_remain_rdcost); +#if CONFIG_COLLECT_PARTITION_STATS + if (best_remain_rdcost >= 0) { + partition_attempts[PARTITION_NONE] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } +#endif + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE, + bsize, ctx_none, best_remain_rdcost, 0); +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_NONE] += time; + partition_timer_on = 0; + } +#endif pb_source_variance = x->source_variance; + pb_simple_motion_pred_sse = x->simple_motion_pred_sse; if (none_rd) *none_rd = this_rdc.rdcost; cur_none_rd = this_rdc.rdcost; if (this_rdc.rate != INT_MAX) { if (cpi->sf.prune_ref_frame_for_rect_partitions) { const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame); - for (int i = 0; i < 4; ++i) { - ref_frames_used[i] |= (1 << ref_type); - } + 
update_picked_ref_frames_mask(x, ref_type, bsize, + cm->seq_params.mib_size, mi_row, mi_col); } if (bsize_at_least_8x8) { this_rdc.rate += pt_cost; @@ -3902,25 +3442,6 @@ BEGIN_PARTITION_SEARCH: best_rdc = this_rdc; if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE; -#if CONFIG_ONE_PASS_SVM - // Use ML if the block size is square and >= 16X16 - if (bsize >= BLOCK_16X16 && !frame_is_intra_only(cm) && - this_rdc.rate < INT_MAX && this_rdc.rate >= 0 && - !ctx_none->seg_feat) { - // Model Prediction - float score = 0.0f; - ml_op_svm_early_term(cpi, x, xd, ctx_none, &this_rdc, - pb_source_variance, bsize, &score); - - // Decide if we want to terminate early - if (score >= 0) { - do_square_split = 0; - do_rectangular_split = 0; - partition_horz_allowed = 0; - partition_vert_allowed = 0; - } - } -#endif if ((do_square_split || do_rectangular_split) && !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) { const int use_ml_based_breakout = @@ -3946,51 +3467,17 @@ BEGIN_PARTITION_SEARCH: } } -#if CONFIG_FP_MB_STATS - // Check if every 16x16 first pass block statistics has zero - // motion and the corresponding first pass residue is small enough. - // If that is the case, check the difference variance between the - // current frame and the last frame. 
If the variance is small enough, - // stop further splitting in RD optimization - if (cpi->use_fp_mb_stats && do_square_split && - cm->base_qindex > qindex_skip_threshold_lookup[bsize]) { - int mb_row = mi_row >> 1; - int mb_col = mi_col >> 1; - int mb_row_end = - AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); - int mb_col_end = - AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); - int r, c; - - int skip = 1; - for (r = mb_row; r < mb_row_end; r++) { - for (c = mb_col; c < mb_col_end; c++) { - const int mb_index = r * cm->mb_cols + c; - if (!(cpi->twopass.this_frame_mb_stats[mb_index] & - FPMB_MOTION_ZERO_MASK) || - !(cpi->twopass.this_frame_mb_stats[mb_index] & - FPMB_ERROR_SMALL_MASK)) { - skip = 0; - break; - } - } - if (skip == 0) { - break; - } - } - if (skip) { - if (src_diff_var == UINT_MAX) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); - src_diff_var = get_sby_perpixel_diff_variance( - cpi, &x->plane[0].src, mi_row, mi_col, bsize); - } - if (src_diff_var < 8) { - do_square_split = 0; - do_rectangular_split = 0; - } - } + if (cpi->sf.simple_motion_search_early_term_none && cm->show_frame && + !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 && + mi_row + mi_step < cm->mi_rows && mi_col + mi_step < cm->mi_cols && + this_rdc.rdcost < INT64_MAX && this_rdc.rdcost >= 0 && + this_rdc.rate < INT_MAX && this_rdc.rate >= 0 && + (do_square_split || do_rectangular_split)) { + av1_simple_motion_search_early_term_none( + cpi, x, pc_tree, mi_row, mi_col, bsize, &this_rdc, + &terminate_partition_search, simple_motion_features, + &simple_motion_features_are_valid); } -#endif } } @@ -4001,13 +3488,20 @@ BEGIN_PARTITION_SEARCH: if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none); // PARTITION_SPLIT - if (do_square_split) { + if ((!terminate_partition_search && do_square_split) || is_gt_max_sq_part) { av1_init_rd_stats(&sum_rdc); subsize = get_partition_subsize(bsize, PARTITION_SPLIT); sum_rdc.rate = 
partition_cost[PARTITION_SPLIT]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); int idx; +#if CONFIG_COLLECT_PARTITION_STATS + if (best_rdc.rdcost - sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_SPLIT] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } +#endif for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) { const int x_idx = (idx & 1) * mi_step; const int y_idx = (idx >> 1) * mi_step; @@ -4022,11 +3516,9 @@ BEGIN_PARTITION_SEARCH: const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX ? INT64_MAX : (best_rdc.rdcost - sum_rdc.rdcost); - if (cpi->sf.prune_ref_frame_for_rect_partitions) - pc_tree->split[idx]->none.rate = INT_MAX; rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, - subsize, &this_rdc, best_remain_rdcost, - pc_tree->split[idx], p_split_rd); + subsize, max_sq_part, min_sq_part, &this_rdc, + best_remain_rdcost, pc_tree->split[idx], p_split_rd); if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; @@ -4035,16 +3527,6 @@ BEGIN_PARTITION_SEARCH: sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; - if (cpi->sf.prune_ref_frame_for_rect_partitions && - pc_tree->split[idx]->none.rate != INT_MAX) { - const int ref_type = - av1_ref_frame_type(pc_tree->split[idx]->none.mic.ref_frame); - ref_frames_used[idx] |= (1 << ref_type); - - if (cpi->sf.prune_ref_mode_for_partitions) { - split_mbmi[idx] = &pc_tree->split[idx]->none.mic; - } - } if (idx <= 1 && (bsize <= BLOCK_8X8 || pc_tree->split[idx]->partitioning == PARTITION_NONE)) { const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic; @@ -4056,6 +3538,14 @@ BEGIN_PARTITION_SEARCH: } } } +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_SPLIT] += time; + partition_timer_on = 0; + } +#endif const int reached_last_index = (idx 
== 4); if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) { @@ -4075,108 +3565,19 @@ BEGIN_PARTITION_SEARCH: restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // if (do_split) - pc_tree->horizontal[0].skip_ref_frame_mask = 0; - pc_tree->horizontal[1].skip_ref_frame_mask = 0; - pc_tree->vertical[0].skip_ref_frame_mask = 0; - pc_tree->vertical[1].skip_ref_frame_mask = 0; - if (cpi->sf.prune_ref_frame_for_rect_partitions) { - int used_frames; - used_frames = ref_frames_used[0] | ref_frames_used[1]; - if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames; - used_frames = ref_frames_used[2] | ref_frames_used[3]; - if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames; - used_frames = ref_frames_used[0] | ref_frames_used[2]; - if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames; - used_frames = ref_frames_used[1] | ref_frames_used[3]; - if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames; - } - - for (int i = 0; i < 2; ++i) { - pc_tree->horizontal[i].ref_selected[0] = - pc_tree->horizontal[i].ref_selected[1] = NONE_FRAME; - pc_tree->horizontal[i].mode_selected = -1; - pc_tree->vertical[i].ref_selected[0] = - pc_tree->vertical[i].ref_selected[1] = NONE_FRAME; - pc_tree->vertical[i].mode_selected = -1; - } - - if (cpi->sf.prune_ref_mode_for_partitions) { - // horizontal partition - for (int idx = 0; idx < 4; idx += 2) { - const int horz_idx = idx / 2; - if (split_mbmi[idx] && split_mbmi[idx + 1] && - split_mbmi[idx]->ref_frame[0] > INTRA_FRAME) { - if (!has_second_ref(split_mbmi[idx])) { - // Single ref - if (split_mbmi[idx]->ref_frame[0] == - split_mbmi[idx + 1]->ref_frame[0] && - !has_second_ref(split_mbmi[idx + 1])) { - const int ref_type = av1_ref_frame_type(split_mbmi[idx]->ref_frame); - // Overwrite skip_ref_frame_mask for the current block - const int used_frames = (1 << ref_type); - pc_tree->horizontal[horz_idx].skip_ref_frame_mask = ~used_frames; - 
pc_tree->horizontal[horz_idx].ref_selected[0] = - split_mbmi[idx]->ref_frame[0]; -#if 0 - // TODO(zoeliu@gmail.com): To consider the scenario of obmc - if (split_mbmi[idx]->motion_mode == - split_mbmi[idx + 1]->motion_mode && - split_mbmi[idx]->motion_mode == SIMPLE_TRANSLATION && - split_mbmi[idx]->use_wedge_interintra == 0) { - pc_tree->horizontal[horz_idx].mode_selected = SIMPLE_TRANSLATION; - } -#endif // 0 - } - } else { - // TODO(zoeliu@gmail.com): To handle comp ref - } - } - } - // vertical partition - for (int idx = 0; idx < 2; ++idx) { - const int vert_idx = idx; - if (split_mbmi[idx] && split_mbmi[idx + 2] && - split_mbmi[idx]->ref_frame[0] > INTRA_FRAME) { - if (!has_second_ref(split_mbmi[idx])) { - // Single ref - if (split_mbmi[idx]->ref_frame[0] == - split_mbmi[idx + 2]->ref_frame[0] && - !has_second_ref(split_mbmi[idx + 2])) { - const int ref_type = av1_ref_frame_type(split_mbmi[idx]->ref_frame); - // Overwrite skip_ref_frame_mask for the current block - const int used_frames = (1 << ref_type); - pc_tree->vertical[vert_idx].skip_ref_frame_mask = ~used_frames; - pc_tree->vertical[vert_idx].ref_selected[0] = - split_mbmi[idx]->ref_frame[0]; -#if 0 - // TODO(zoeliu@gmail.com): To consider the scenario of obmc - if (split_mbmi[idx]->motion_mode == - split_mbmi[idx + 2]->motion_mode && - split_mbmi[idx]->motion_mode == SIMPLE_TRANSLATION && - split_mbmi[idx]->use_wedge_interintra == 0) { - pc_tree->vertical[vert_idx].mode_selected = SIMPLE_TRANSLATION; - } -#endif // 0 - } - } else { - // TODO(zoeliu@gmail.com): To handle comp ref - } - } - } - } - - int prune_horz = 0; - int prune_vert = 0; if (cpi->sf.ml_prune_rect_partition && !frame_is_intra_only(cm) && - (partition_horz_allowed || partition_vert_allowed)) { + (partition_horz_allowed || partition_vert_allowed) && + !(prune_horz || prune_vert)) { av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd, split_rd, 
&prune_horz, &prune_vert); } // PARTITION_HORZ - if (partition_horz_allowed && !prune_horz && - (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) { + assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz_allowed)); + if (!terminate_partition_search && partition_horz_allowed && !prune_horz && + (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) && + !is_gt_max_sq_part) { av1_init_rd_stats(&sum_rdc); subsize = get_partition_subsize(bsize, PARTITION_HORZ); if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none); @@ -4185,14 +3586,20 @@ BEGIN_PARTITION_SEARCH: pc_tree->horizontal[0].pred_interp_filter = av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); } + sum_rdc.rate = partition_cost[PARTITION_HORZ]; + sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX ? INT64_MAX : (best_rdc.rdcost - sum_rdc.rdcost); - sum_rdc.rate = partition_cost[PARTITION_HORZ]; - sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, - PARTITION_HORZ, subsize, &pc_tree->horizontal[0], - best_remain_rdcost); +#if CONFIG_COLLECT_PARTITION_STATS + if (best_remain_rdcost >= 0) { + partition_attempts[PARTITION_HORZ] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } +#endif + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_HORZ, + subsize, &pc_tree->horizontal[0], best_remain_rdcost, 0); if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; @@ -4222,9 +3629,9 @@ BEGIN_PARTITION_SEARCH: pc_tree->horizontal[1].pred_interp_filter = av1_extract_interp_filter(ctx_h->mic.interp_filters, 0); } - rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, - PARTITION_HORZ, subsize, &pc_tree->horizontal[1], - best_rdc.rdcost - sum_rdc.rdcost); + pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, + PARTITION_HORZ, subsize, &pc_tree->horizontal[1], 
+ best_rdc.rdcost - sum_rdc.rdcost, 0); horz_rd[1] = this_rdc.rdcost; if (this_rdc.rate == INT_MAX) { @@ -4235,6 +3642,14 @@ BEGIN_PARTITION_SEARCH: sum_rdc.rdcost += this_rdc.rdcost; } } +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_HORZ] += time; + partition_timer_on = 0; + } +#endif if (sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); @@ -4248,8 +3663,10 @@ BEGIN_PARTITION_SEARCH: } // PARTITION_VERT - if (partition_vert_allowed && !prune_vert && - (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) { + assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert_allowed)); + if (!terminate_partition_search && partition_vert_allowed && !prune_vert && + (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step)) && + !is_gt_max_sq_part) { av1_init_rd_stats(&sum_rdc); subsize = get_partition_subsize(bsize, PARTITION_VERT); @@ -4265,9 +3682,15 @@ BEGIN_PARTITION_SEARCH: const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX ? 
INT64_MAX : (best_rdc.rdcost - sum_rdc.rdcost); - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, - PARTITION_VERT, subsize, &pc_tree->vertical[0], - best_remain_rdcost); +#if CONFIG_COLLECT_PARTITION_STATS + if (best_remain_rdcost >= 0) { + partition_attempts[PARTITION_VERT] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } +#endif + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_VERT, + subsize, &pc_tree->vertical[0], best_remain_rdcost, 0); if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; @@ -4296,9 +3719,9 @@ BEGIN_PARTITION_SEARCH: pc_tree->vertical[1].pred_interp_filter = av1_extract_interp_filter(ctx_none->mic.interp_filters, 0); } - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, - PARTITION_VERT, subsize, &pc_tree->vertical[1], - best_rdc.rdcost - sum_rdc.rdcost); + pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, + PARTITION_VERT, subsize, &pc_tree->vertical[1], + best_rdc.rdcost - sum_rdc.rdcost, 0); vert_rd[1] = this_rdc.rdcost; if (this_rdc.rate == INT_MAX) { @@ -4309,6 +3732,14 @@ BEGIN_PARTITION_SEARCH: sum_rdc.rdcost += this_rdc.rdcost; } } +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_VERT] += time; + partition_timer_on = 0; + } +#endif if (sum_rdc.rdcost < best_rdc.rdcost) { sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist); @@ -4323,7 +3754,7 @@ BEGIN_PARTITION_SEARCH: if (pb_source_variance == UINT_MAX) { av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { pb_source_variance = av1_high_get_sby_perpixel_variance( cpi, &x->plane[0].src, bsize, xd->bd); } else { @@ -4332,13 +3763,26 @@ BEGIN_PARTITION_SEARCH: } } + if (use_pb_simple_motion_pred_sse(cpi) && + 
pb_simple_motion_pred_sse == UINT_MAX) { + const MV ref_mv_full = { .row = 0, .col = 0 }; + unsigned int var = 0; + + av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full, 0, + &pb_simple_motion_pred_sse, &var); + } + + assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !do_rectangular_split)); + const int ext_partition_allowed = do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed; // The standard AB partitions are allowed whenever ext-partition-types are // allowed - int horzab_partition_allowed = ext_partition_allowed; - int vertab_partition_allowed = ext_partition_allowed; + int horzab_partition_allowed = + ext_partition_allowed & cpi->oxcf.enable_ab_partitions; + int vertab_partition_allowed = + ext_partition_allowed & cpi->oxcf.enable_ab_partitions; #if CONFIG_DIST_8X8 if (x->using_dist_8x8) { @@ -4414,9 +3858,9 @@ BEGIN_PARTITION_SEARCH: if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed && partition_horz_allowed && partition_vert_allowed) { - // TODO(huisu@google.com): x->source_variance may not be the current block's - // variance. The correct one to use is pb_source_variance. - // Need to re-train the model to fix it. + // TODO(huisu@google.com): x->source_variance may not be the current + // block's variance. The correct one to use is pb_source_variance. Need to + // re-train the model to fix it. 
ml_prune_ab_partition(bsize, pc_tree->partitioning, get_unsigned_bits(x->source_variance), best_rdc.rdcost, horz_rd, vert_rd, split_rd, @@ -4424,8 +3868,14 @@ BEGIN_PARTITION_SEARCH: &verta_partition_allowed, &vertb_partition_allowed); } + horza_partition_allowed &= cpi->oxcf.enable_ab_partitions; + horzb_partition_allowed &= cpi->oxcf.enable_ab_partitions; + verta_partition_allowed &= cpi->oxcf.enable_ab_partitions; + vertb_partition_allowed &= cpi->oxcf.enable_ab_partitions; + // PARTITION_HORZ_A - if (partition_horz_allowed && horza_partition_allowed) { + if (!terminate_partition_search && partition_horz_allowed && + horza_partition_allowed && !is_gt_max_sq_part) { subsize = get_partition_subsize(bsize, PARTITION_HORZ_A); pc_tree->horizontala[0].rd_mode_is_ready = 0; pc_tree->horizontala[1].rd_mode_is_ready = 0; @@ -4441,56 +3891,37 @@ BEGIN_PARTITION_SEARCH: pc_tree->horizontala[1].rd_mode_is_ready = 1; } } - for (int i = 0; i < 3; ++i) { - pc_tree->horizontala[i].skip_ref_frame_mask = 0; - pc_tree->horizontala[i].ref_selected[0] = - pc_tree->horizontala[i].ref_selected[1] = NONE_FRAME; - } - if (cpi->sf.prune_ref_frame_for_rect_partitions) { - int used_frames; - used_frames = ref_frames_used[0]; - if (used_frames) - pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames; - used_frames = ref_frames_used[1]; - if (used_frames) - pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames; - used_frames = ref_frames_used[2] | ref_frames_used[3]; - if (used_frames) - pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames; - } - if (cpi->sf.prune_ref_mode_for_partitions) { - // Overwrite skip_ref_frame_mask for the current block - if (split_mbmi[0] && split_mbmi[0]->ref_frame[0] > INTRA_FRAME && - !has_second_ref(split_mbmi[0])) { // single ref - const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0]; - pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames; - pc_tree->horizontala[0].ref_selected[0] = split_mbmi[0]->ref_frame[0]; - } - if 
(split_mbmi[1] && split_mbmi[1]->ref_frame[0] > INTRA_FRAME && - !has_second_ref(split_mbmi[1])) { // single ref - const int used_frames = 1 << (int)split_mbmi[1]->ref_frame[0]; - pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames; - pc_tree->horizontala[1].ref_selected[0] = split_mbmi[1]->ref_frame[0]; - } - if (split_mbmi[2] && split_mbmi[3] && - split_mbmi[2]->ref_frame[0] > INTRA_FRAME && - split_mbmi[2]->ref_frame[0] == split_mbmi[3]->ref_frame[0] && - !has_second_ref(split_mbmi[2]) && - !has_second_ref(split_mbmi[3])) { // single ref - const int used_frames = 1 << (int)split_mbmi[2]->ref_frame[0]; - pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames; - pc_tree->horizontala[2].ref_selected[0] = split_mbmi[2]->ref_frame[0]; +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_A]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_HORZ_A] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; } } +#endif rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step, mi_col, subsize); +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_HORZ_A] += time; + partition_timer_on = 0; + } +#endif restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // PARTITION_HORZ_B - if (partition_horz_allowed && horzb_partition_allowed) { + if (!terminate_partition_search && partition_horz_allowed && + horzb_partition_allowed && !is_gt_max_sq_part) { subsize = get_partition_subsize(bsize, PARTITION_HORZ_B); pc_tree->horizontalb[0].rd_mode_is_ready = 0; 
pc_tree->horizontalb[1].rd_mode_is_ready = 0; @@ -4500,57 +3931,39 @@ BEGIN_PARTITION_SEARCH: pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B; pc_tree->horizontalb[0].rd_mode_is_ready = 1; } - for (int i = 0; i < 3; ++i) { - pc_tree->horizontalb[i].skip_ref_frame_mask = 0; - pc_tree->horizontalb[i].ref_selected[0] = - pc_tree->horizontalb[i].ref_selected[1] = NONE_FRAME; - } - if (cpi->sf.prune_ref_frame_for_rect_partitions) { - int used_frames; - used_frames = ref_frames_used[0] | ref_frames_used[1]; - if (used_frames) - pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames; - used_frames = ref_frames_used[2]; - if (used_frames) - pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames; - used_frames = ref_frames_used[3]; - if (used_frames) - pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames; - } - if (cpi->sf.prune_ref_mode_for_partitions) { - // Overwrite skip_ref_frame_mask for the current block - if (split_mbmi[0] && split_mbmi[1] && - split_mbmi[0]->ref_frame[0] > INTRA_FRAME && - split_mbmi[0]->ref_frame[0] == split_mbmi[1]->ref_frame[0] && - !has_second_ref(split_mbmi[0]) && - !has_second_ref(split_mbmi[1])) { // single ref - const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0]; - pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames; - pc_tree->horizontalb[0].ref_selected[0] = split_mbmi[0]->ref_frame[0]; - } - if (split_mbmi[2] && split_mbmi[2]->ref_frame[0] > INTRA_FRAME && - !has_second_ref(split_mbmi[2])) { // single ref - const int used_frames = 1 << (int)split_mbmi[2]->ref_frame[0]; - pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames; - pc_tree->horizontalb[1].ref_selected[0] = split_mbmi[2]->ref_frame[0]; - } - if (split_mbmi[3] && split_mbmi[3]->ref_frame[0] > INTRA_FRAME && - !has_second_ref(split_mbmi[3])) { // single ref - const int used_frames = 1 << (int)split_mbmi[3]->ref_frame[0]; - pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames; - pc_tree->horizontalb[2].ref_selected[0] = 
split_mbmi[3]->ref_frame[0]; +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_B]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_HORZ_B] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; } } +#endif rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize, PARTITION_HORZ_B, mi_row, mi_col, subsize, mi_row + mi_step, mi_col, bsize2, mi_row + mi_step, mi_col + mi_step, bsize2); + +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_HORZ_B] += time; + partition_timer_on = 0; + } +#endif restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // PARTITION_VERT_A - if (partition_vert_allowed && verta_partition_allowed) { + if (!terminate_partition_search && partition_vert_allowed && + verta_partition_allowed && !is_gt_max_sq_part) { subsize = get_partition_subsize(bsize, PARTITION_VERT_A); pc_tree->verticala[0].rd_mode_is_ready = 0; pc_tree->verticala[1].rd_mode_is_ready = 0; @@ -4560,53 +3973,37 @@ BEGIN_PARTITION_SEARCH: pc_tree->verticala[0].mic.partition = PARTITION_VERT_A; pc_tree->verticala[0].rd_mode_is_ready = 1; } - for (int i = 0; i < 3; ++i) { - pc_tree->verticala[i].skip_ref_frame_mask = 0; - pc_tree->verticala[i].ref_selected[0] = - pc_tree->verticala[i].ref_selected[1] = NONE_FRAME; - } - if (cpi->sf.prune_ref_frame_for_rect_partitions) { - int used_frames; - used_frames = ref_frames_used[0]; - if (used_frames) pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames; - used_frames = ref_frames_used[2]; - if (used_frames) pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames; - used_frames = ref_frames_used[1] | 
ref_frames_used[3]; - if (used_frames) pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames; - } - if (cpi->sf.prune_ref_mode_for_partitions) { - // Overwrite skip_ref_frame_mask for the current block - if (split_mbmi[0] && split_mbmi[0]->ref_frame[0] > INTRA_FRAME && - !has_second_ref(split_mbmi[0])) { // single ref - const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0]; - pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames; - pc_tree->verticala[0].ref_selected[0] = split_mbmi[0]->ref_frame[0]; - } - if (split_mbmi[2] && split_mbmi[2]->ref_frame[0] > INTRA_FRAME && - !has_second_ref(split_mbmi[2])) { // single ref - const int used_frames = 1 << (int)split_mbmi[2]->ref_frame[0]; - pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames; - pc_tree->verticala[1].ref_selected[0] = split_mbmi[2]->ref_frame[0]; - } - if (split_mbmi[1] && split_mbmi[3] && - split_mbmi[1]->ref_frame[0] > INTRA_FRAME && - split_mbmi[1]->ref_frame[0] == split_mbmi[3]->ref_frame[0] && - !has_second_ref(split_mbmi[1]) && - !has_second_ref(split_mbmi[3])) { // single ref - const int used_frames = 1 << (int)split_mbmi[1]->ref_frame[0]; - pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames; - pc_tree->verticala[2].ref_selected[0] = split_mbmi[1]->ref_frame[0]; +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_A]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_VERT_A] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; } } +#endif rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticala, ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_A, mi_row, mi_col, bsize2, mi_row + mi_step, mi_col, bsize2, mi_row, mi_col + mi_step, subsize); +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + 
aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_VERT_A] += time; + partition_timer_on = 0; + } +#endif restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // PARTITION_VERT_B - if (partition_vert_allowed && vertb_partition_allowed) { + if (!terminate_partition_search && partition_vert_allowed && + vertb_partition_allowed && !is_gt_max_sq_part) { subsize = get_partition_subsize(bsize, PARTITION_VERT_B); pc_tree->verticalb[0].rd_mode_is_ready = 0; pc_tree->verticalb[1].rd_mode_is_ready = 0; @@ -4616,58 +4013,44 @@ BEGIN_PARTITION_SEARCH: pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B; pc_tree->verticalb[0].rd_mode_is_ready = 1; } - for (int i = 0; i < 3; ++i) { - pc_tree->verticalb[i].skip_ref_frame_mask = 0; - pc_tree->verticalb[i].ref_selected[0] = - pc_tree->verticalb[i].ref_selected[1] = NONE_FRAME; - } - if (cpi->sf.prune_ref_frame_for_rect_partitions) { - int used_frames; - used_frames = ref_frames_used[0] | ref_frames_used[2]; - if (used_frames) pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames; - used_frames = ref_frames_used[1]; - if (used_frames) pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames; - used_frames = ref_frames_used[3]; - if (used_frames) pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames; - } - if (cpi->sf.prune_ref_mode_for_partitions) { - // Overwrite skip_ref_frame_mask for the current block - if (split_mbmi[0] && split_mbmi[2] && - split_mbmi[0]->ref_frame[0] > INTRA_FRAME && - split_mbmi[0]->ref_frame[0] == split_mbmi[2]->ref_frame[0] && - !has_second_ref(split_mbmi[0]) && - !has_second_ref(split_mbmi[2])) { // single ref - const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0]; - pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames; - pc_tree->verticalb[0].ref_selected[0] = split_mbmi[0]->ref_frame[0]; - } - if (split_mbmi[1] && split_mbmi[1]->ref_frame[0] > INTRA_FRAME && - !has_second_ref(split_mbmi[1])) { // 
single ref - const int used_frames = 1 << (int)split_mbmi[1]->ref_frame[0]; - pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames; - pc_tree->verticalb[1].ref_selected[0] = split_mbmi[1]->ref_frame[0]; - } - if (split_mbmi[3] && split_mbmi[3]->ref_frame[0] > INTRA_FRAME && - !has_second_ref(split_mbmi[3])) { // single ref - const int used_frames = 1 << (int)split_mbmi[3]->ref_frame[0]; - pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames; - pc_tree->verticalb[2].ref_selected[0] = split_mbmi[3]->ref_frame[0]; +#if CONFIG_COLLECT_PARTITION_STATS + { + RD_STATS tmp_sum_rdc; + av1_init_rd_stats(&tmp_sum_rdc); + tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_B]; + tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0); + if (!frame_is_intra_only(cm) && + best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_VERT_B] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; } } +#endif rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc, pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize, PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row, mi_col + mi_step, bsize2, mi_row + mi_step, mi_col + mi_step, bsize2); +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_VERT_B] += time; + partition_timer_on = 0; + } +#endif restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or // PARTITION_VERT_4 for this block. This is almost the same as - // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks, - // so we require that bsize is not BLOCK_128X128. - const int partition4_allowed = - ext_partition_allowed && bsize != BLOCK_128X128; + // ext_partition_allowed, except that we don't allow 128x32 or 32x128 + // blocks, so we require that bsize is not BLOCK_128X128. 
+ const int partition4_allowed = cpi->oxcf.enable_1to4_partitions && + ext_partition_allowed && + bsize != BLOCK_128X128; + int partition_horz4_allowed = partition4_allowed && partition_horz_allowed; int partition_vert4_allowed = partition4_allowed && partition_vert_allowed; if (cpi->sf.prune_ext_partition_types_search_level == 2) { @@ -4699,9 +4082,16 @@ BEGIN_PARTITION_SEARCH: } #endif + if (blksize < (min_partition_size << 2)) { + partition_horz4_allowed = 0; + partition_vert4_allowed = 0; + } + // PARTITION_HORZ_4 - if (partition_horz4_allowed && has_rows && - (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) { + assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz4_allowed)); + if (!terminate_partition_search && partition_horz4_allowed && has_rows && + (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) && + !is_gt_max_sq_part) { av1_init_rd_stats(&sum_rdc); const int quarter_step = mi_size_high[bsize] / 4; PICK_MODE_CONTEXT *ctx_prev = ctx_none; @@ -4710,6 +4100,13 @@ BEGIN_PARTITION_SEARCH: sum_rdc.rate = partition_cost[PARTITION_HORZ_4]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); +#if CONFIG_COLLECT_PARTITION_STATS + if (best_rdc.rdcost - sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_HORZ_4] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } +#endif for (int i = 0; i < 4; ++i) { const int this_mi_row = mi_row + i * quarter_step; @@ -4718,13 +4115,6 @@ BEGIN_PARTITION_SEARCH: PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i]; ctx_this->rd_mode_is_ready = 0; - ctx_this->skip_ref_frame_mask = 0; - if (cpi->sf.prune_ref_frame_for_rect_partitions) { - const int used_frames = i <= 1 - ? 
(ref_frames_used[0] | ref_frames_used[1]) - : (ref_frames_used[2] | ref_frames_used[3]); - if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames; - } if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row, mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc, PARTITION_HORZ_4, ctx_prev, ctx_this)) @@ -4740,12 +4130,23 @@ BEGIN_PARTITION_SEARCH: pc_tree->partitioning = PARTITION_HORZ_4; } } + +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_HORZ_4] += time; + partition_timer_on = 0; + } +#endif restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } // PARTITION_VERT_4 - if (partition_vert4_allowed && has_cols && - (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step))) { + assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert4_allowed)); + if (!terminate_partition_search && partition_vert4_allowed && has_cols && + (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step)) && + !is_gt_max_sq_part) { av1_init_rd_stats(&sum_rdc); const int quarter_step = mi_size_wide[bsize] / 4; PICK_MODE_CONTEXT *ctx_prev = ctx_none; @@ -4754,6 +4155,13 @@ BEGIN_PARTITION_SEARCH: sum_rdc.rate = partition_cost[PARTITION_VERT_4]; sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0); +#if CONFIG_COLLECT_PARTITION_STATS + if (best_rdc.rdcost - sum_rdc.rdcost >= 0) { + partition_attempts[PARTITION_VERT_4] += 1; + aom_usec_timer_start(&partition_timer); + partition_timer_on = 1; + } +#endif for (int i = 0; i < 4; ++i) { const int this_mi_col = mi_col + i * quarter_step; @@ -4762,13 +4170,6 @@ BEGIN_PARTITION_SEARCH: PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i]; ctx_this->rd_mode_is_ready = 0; - ctx_this->skip_ref_frame_mask = 0; - if (cpi->sf.prune_ref_frame_for_rect_partitions) { - const int used_frames = i <= 1 - ? 
(ref_frames_used[0] | ref_frames_used[2]) - : (ref_frames_used[1] | ref_frames_used[3]); - if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames; - } if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row, this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc, PARTITION_VERT_4, ctx_prev, ctx_this)) @@ -4784,6 +4185,14 @@ BEGIN_PARTITION_SEARCH: pc_tree->partitioning = PARTITION_VERT_4; } } +#if CONFIG_COLLECT_PARTITION_STATS + if (partition_timer_on) { + aom_usec_timer_mark(&partition_timer); + int64_t time = aom_usec_timer_elapsed(&partition_timer); + partition_times[PARTITION_VERT_4] += time; + partition_timer_on = 0; + } +#endif restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes); } @@ -4791,6 +4200,9 @@ BEGIN_PARTITION_SEARCH: // Did not find a valid partition, go back and search again, with less // constraint on which partition types to search. x->must_find_valid_partition = 1; +#if CONFIG_COLLECT_PARTITION_STATS == 2 + part_stats->partition_redo += 1; +#endif goto BEGIN_PARTITION_SEARCH; } @@ -4801,6 +4213,44 @@ BEGIN_PARTITION_SEARCH: (void)best_rd; *rd_cost = best_rdc; +#if CONFIG_COLLECT_PARTITION_STATS + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) { + partition_decisions[pc_tree->partitioning] += 1; + } +#endif + +#if CONFIG_COLLECT_PARTITION_STATS == 1 + // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each + // prediction block + FILE *f = fopen("data.csv", "a"); + fprintf(f, "%d,%d,%d,", bsize, cm->show_frame, frame_is_intra_only(cm)); + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%d,", partition_decisions[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%d,", partition_attempts[idx]); + } + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + fprintf(f, "%ld,", partition_times[idx]); + } + fprintf(f, "\n"); + fclose(f); +#endif + +#if CONFIG_COLLECT_PARTITION_STATS == 2 + // If CONFIG_COLLECTION_PARTITION_STATS is 2, then we print 
out the stats for + // the whole clip. So we need to pass the information upstream to the encoder + const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize); + int *agg_attempts = part_stats->partition_attempts[bsize_idx]; + int *agg_decisions = part_stats->partition_decisions[bsize_idx]; + int64_t *agg_times = part_stats->partition_times[bsize_idx]; + for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) { + agg_attempts[idx] += partition_attempts[idx]; + agg_decisions[idx] += partition_decisions[idx]; + agg_times[idx] += partition_times[idx]; + } +#endif + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && pc_tree->index != 3) { if (bsize == cm->seq_params.sb_size) { @@ -4820,19 +4270,23 @@ BEGIN_PARTITION_SEARCH: assert(tp_orig == *tp); } } +#undef NUM_SIMPLE_MOTION_FEATURES // Set all the counters as max. static void init_first_partition_pass_stats_tables( - FIRST_PARTITION_PASS_STATS *stats) { + AV1_COMP *cpi, FIRST_PARTITION_PASS_STATS *stats) { for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) { memset(stats[i].ref0_counts, 0xff, sizeof(stats[i].ref0_counts)); memset(stats[i].ref1_counts, 0xff, sizeof(stats[i].ref1_counts)); stats[i].sample_counts = INT_MAX; + if (cpi->sf.use_first_partition_pass_interintra_stats) + memset(stats[i].interintra_motion_mode_count, 0xff, + sizeof(stats[i].interintra_motion_mode_count)); } } -// Minimum number of samples to trigger the -// mode_pruning_based_on_two_pass_partition_search feature. +// Minimum number of samples to trigger the mode pruning in +// two_pass_partition_search feature. 
#define FIRST_PARTITION_PASS_MIN_SAMPLES 16 static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, @@ -4847,7 +4301,6 @@ static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int row, col; int dr = 0; - int count = 0; double r0, rk, beta; if (tpl_frame->is_valid == 0) return orig_rdmult; @@ -4864,8 +4317,6 @@ static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row, intra_cost += this_stats->intra_cost; mc_dep_cost += this_stats->mc_dep_cost; - - ++count; } } @@ -4955,8 +4406,7 @@ static void first_partition_search_pass(AV1_COMP *cpi, ThreadData *td, const SPEED_FEATURES *const sf = &cpi->sf; // Reset the stats tables. - if (sf->mode_pruning_based_on_two_pass_partition_search) - av1_zero(x->first_partition_pass_stats); + av1_zero(x->first_partition_pass_stats); AV1_COMMON *const cm = &cpi->common; const BLOCK_SIZE sb_size = cm->seq_params.sb_size; @@ -4968,6 +4418,7 @@ static void first_partition_search_pass(AV1_COMP *cpi, ThreadData *td, x->cb_partition_scan = 0; x->source_variance = UINT_MAX; + x->simple_motion_pred_sse = UINT_MAX; if (sf->adaptive_pred_interp_filter) { const int leaf_nodes = 256; for (int i = 0; i < leaf_nodes; ++i) { @@ -4996,29 +4447,208 @@ static void first_partition_search_pass(AV1_COMP *cpi, ThreadData *td, x->use_cb_search_range = 1; - if (sf->mode_pruning_based_on_two_pass_partition_search) { - for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) { - FIRST_PARTITION_PASS_STATS *const stat = - &x->first_partition_pass_stats[i]; - if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) { - // If there are not enough samples collected, make all available. - memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts)); - memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts)); - } else if (sf->selective_ref_frame < 3) { - // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the - // initial partition scan, so we don't eliminate them. 
- stat->ref0_counts[ALTREF2_FRAME] = 0xff; - stat->ref1_counts[ALTREF2_FRAME] = 0xff; - stat->ref0_counts[BWDREF_FRAME] = 0xff; - stat->ref1_counts[BWDREF_FRAME] = 0xff; + for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) { + FIRST_PARTITION_PASS_STATS *const stat = &x->first_partition_pass_stats[i]; + if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) { + // If there are not enough samples collected, make all available. + memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts)); + memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts)); + if (cpi->sf.use_first_partition_pass_interintra_stats) + memset(stat->interintra_motion_mode_count, 0xff, + sizeof(stat->interintra_motion_mode_count)); + } else if (sf->selective_ref_frame < 3) { + // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the + // initial partition scan, so we don't eliminate them. + stat->ref0_counts[ALTREF2_FRAME] = 0xff; + stat->ref1_counts[ALTREF2_FRAME] = 0xff; + stat->ref0_counts[BWDREF_FRAME] = 0xff; + stat->ref1_counts[BWDREF_FRAME] = 0xff; + if (cpi->sf.use_first_partition_pass_interintra_stats) { + stat->interintra_motion_mode_count[ALTREF2_FRAME] = 0xff; + stat->interintra_motion_mode_count[BWDREF_FRAME] = 0xff; } } } } -static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, - TileDataEnc *tile_data, int mi_row, - TOKENEXTRA **tp) { +#define AVG_CDF_WEIGHT_LEFT 3 +#define AVG_CDF_WEIGHT_TOP_RIGHT 1 + +static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr, + int num_cdfs, int cdf_stride, int nsymbs, + int wt_left, int wt_tr) { + for (int i = 0; i < num_cdfs; i++) { + for (int j = 0; j <= nsymbs; j++) { + cdf_ptr_left[i * cdf_stride + j] = + (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left + + (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr + + ((wt_left + wt_tr) / 2)) / + (wt_left + wt_tr)); + assert(cdf_ptr_left[i * cdf_stride + j] >= 0 && + cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP); + } + } +} + +#define 
AVERAGE_CDF(cname_left, cname_tr, nsymbs) \ + AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs)) + +#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride) \ + do { \ + aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left; \ + aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr; \ + int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob); \ + int num_cdfs = array_size / cdf_stride; \ + avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \ + wt_left, wt_tr); \ + } while (0) + +static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left, + int wt_tr) { + AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4); + for (int i = 0; i < 2; i++) { + AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf, + MV_CLASSES); + AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf, + nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE); + AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE); + AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf, + nmv_tr->comps[i].class0_hp_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2); + AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf, + CLASS0_SIZE); + AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2); + } +} + +// In case of row-based multi-threading of encoder, since we always +// keep a top - right sync, we can average the top - right SB's CDFs and +// the left SB's CDFs and use the same for current SB's encoding to +// improve the performance. This function facilitates the averaging +// of CDF and used only when row-mt is enabled in encoder. 
+static void avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr, + int wt_left, int wt_tr) { + AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2); + AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2); + AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2); + AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5); + AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6); + AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7); + AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8); + AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9); + AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10); + AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11); + AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3); + AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4); + AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE); + AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2); + AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2); + AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2); + AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2); + AVERAGE_CDF(ctx_left->inter_compound_mode_cdf, + ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES); + AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf, + MASKED_COMPOUND_TYPES); + AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16); + AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2); + AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2); + AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf, + INTERINTRA_MODES); + AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES); + AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2); + AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf, + PALETTE_SIZES); + 
AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf, + PALETTE_SIZES); + for (int j = 0; j < PALETTE_SIZES; j++) { + int nsymbs = j + PALETTE_MIN_SIZE; + AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j], + ctx_tr->palette_y_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j], + ctx_tr->palette_uv_color_index_cdf[j], nsymbs, + CDF_SIZE(PALETTE_COLORS)); + } + AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2); + AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2); + AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2); + AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2); + AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2); + AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2); + AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2); + AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2); + AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2); + AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2); + AVERAGE_CDF(ctx_left->skip_cdfs, ctx_tr->skip_cdfs, 2); + AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2); + avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr); + avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr); + AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2); + AVERAGE_CDF(ctx_left->seg.tree_cdf, ctx_tr->seg.tree_cdf, MAX_SEGMENTS); + AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2); + AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf, + ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS); + AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2); + AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, 
ctx_tr->filter_intra_mode_cdf, + FILTER_INTRA_MODES); + AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf, + RESTORE_SWITCHABLE_TYPES); + AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2); + AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2); + AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES); + AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0], + UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES)); + AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES); + for (int i = 0; i < PARTITION_CONTEXTS; i++) { + if (i < 4) { + AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4, + CDF_SIZE(10)); + } else if (i < 16) { + AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10); + } else { + AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8, + CDF_SIZE(10)); + } + } + AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf, + SWITCHABLE_FILTERS); + AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES); + AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf, + 2 * MAX_ANGLE_DELTA + 1); + AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH, + CDF_SIZE(MAX_TX_DEPTH + 1)); + AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3], + MAX_TX_DEPTH + 1); + AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1); + AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1); + for (int i = 0; i < FRAME_LF_COUNT; i++) { + AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i], + DELTA_LF_PROBS + 1); + } + AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7, + CDF_SIZE(TX_TYPES)); + 
AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12, + CDF_SIZE(TX_TYPES)); + AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2, + CDF_SIZE(TX_TYPES)); + AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS); + AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf, + CFL_ALPHABET_SIZE); +} + +static void encode_sb_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, + int mi_row, TOKENEXTRA **tp, int use_nonrd_mode) { AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); const TileInfo *const tile_info = &tile_data->tile_info; @@ -5032,6 +4662,10 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, const int mib_size_log2 = cm->seq_params.mib_size_log2; const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif + // Initialize the left context for the new SB row av1_zero_left_context(xd); @@ -5049,13 +4683,48 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) { (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile); - if ((cpi->row_mt == 1) && (tile_info->mi_col_start == mi_col) && + if (tile_data->allow_update_cdf && (cpi->row_mt == 1) && (tile_info->mi_row_start != mi_row)) { - // restore frame context of 1st column sb - memcpy(xd->tile_ctx, x->backup_tile_ctx, sizeof(*xd->tile_ctx)); + if ((tile_info->mi_col_start == mi_col)) { + // restore frame context of 1st column sb + memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx)); + } else { + int wt_left = AVG_CDF_WEIGHT_LEFT; + int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT; + if (tile_info->mi_col_end 
> (mi_col + mib_size)) + avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile, wt_left, + wt_tr); + else + avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1, + wt_left, wt_tr); + } + } + + switch (cpi->oxcf.coeff_cost_upd_freq) { + case COST_UPD_TILE: // Tile level + if (mi_row != tile_info->mi_row_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SBROW: // SB row level in tile + if (mi_col != tile_info->mi_col_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SB: // SB level + av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes); + break; + default: assert(0); + } + + switch (cpi->oxcf.mode_cost_upd_freq) { + case COST_UPD_TILE: // Tile level + if (mi_row != tile_info->mi_row_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SBROW: // SB row level in tile + if (mi_col != tile_info->mi_col_start) break; + AOM_FALLTHROUGH_INTENDED; + case COST_UPD_SB: // SB level + av1_fill_mode_rates(cm, x, xd->tile_ctx); + break; + default: assert(0); } - av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes); - av1_fill_mode_rates(cm, x, xd->tile_ctx); if (sf->adaptive_pred_interp_filter) { for (int i = 0; i < leaf_nodes; ++i) { @@ -5068,16 +4737,27 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, x->mb_rd_record.num = x->mb_rd_record.index_start = 0; - av1_zero(x->txb_rd_record_8X8); - av1_zero(x->txb_rd_record_16X16); - av1_zero(x->txb_rd_record_32X32); - av1_zero(x->txb_rd_record_64X64); - av1_zero(x->txb_rd_record_intra); + if (!use_nonrd_mode) { + av1_zero(x->txb_rd_record_8X8); + av1_zero(x->txb_rd_record_16X16); + av1_zero(x->txb_rd_record_32X32); + av1_zero(x->txb_rd_record_64X64); + av1_zero(x->txb_rd_record_intra); + } + + av1_zero(x->picked_ref_frames_mask); av1_zero(x->pred_mv); PC_TREE *const pc_root = td->pc_root[mib_size_log2 - MIN_MIB_SIZE_LOG2]; pc_root->index = 0; + if ((sf->simple_motion_search_prune_rect || + sf->simple_motion_search_early_term_none || + sf->firstpass_simple_motion_search_early_term) 
&& + !frame_is_intra_only(cm)) { + init_simple_motion_search_mvs(pc_root); + } + const struct segmentation *const seg = &cm->seg; int seg_skip = 0; if (seg->enabled) { @@ -5099,6 +4779,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, const int idx_str = cm->mi_stride * mi_row + mi_col; MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str; x->source_variance = UINT_MAX; + x->simple_motion_pred_sse = UINT_MAX; if (sf->partition_search_type == FIXED_PARTITION || seg_skip) { set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); const BLOCK_SIZE bsize = seg_skip ? sb_size : sf->always_this_block_size; @@ -5112,6 +4793,13 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, &dummy_rate, &dummy_dist, 1, pc_root); + } else if (sf->partition_search_type == VAR_BASED_PARTITION && + use_nonrd_mode) { + set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); + av1_choose_var_based_partitioning(cpi, tile_info, x, mi_row, mi_col); + nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size, + &dummy_rate, &dummy_dist, 1, pc_root); + } else { const int orig_rdmult = cpi->rd.RDMULT; x->cb_rdmult = orig_rdmult; @@ -5124,58 +4812,87 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td, x->rdmult = x->cb_rdmult; } - // If required set upper and lower partition size limits - if (sf->auto_min_max_partition_size) { - set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); - rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col, - &x->min_partition_size, &x->max_partition_size); - } - reset_partition(pc_root, sb_size); x->use_cb_search_range = 0; - init_first_partition_pass_stats_tables(x->first_partition_pass_stats); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, first_partition_search_pass_time); +#endif + init_first_partition_pass_stats_tables(cpi, + x->first_partition_pass_stats); // 
Do the first pass if we need two pass partition search - if (cpi->sf.two_pass_partition_search && + if (cpi->two_pass_partition_search && cpi->sf.use_square_partition_only_threshold > BLOCK_4X4 && - mi_row + mi_size_high[sb_size] < cm->mi_rows && - mi_col + mi_size_wide[sb_size] < cm->mi_cols && + mi_row + mi_size_high[sb_size] <= cm->mi_rows && + mi_col + mi_size_wide[sb_size] <= cm->mi_cols && cm->current_frame.frame_type != KEY_FRAME) { first_partition_search_pass(cpi, td, tile_data, mi_row, mi_col, tp); } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, first_partition_search_pass_time); +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif + BLOCK_SIZE max_sq_size = BLOCK_128X128; + switch (cpi->oxcf.max_partition_size) { + case 4: max_sq_size = BLOCK_4X4; break; + case 8: max_sq_size = BLOCK_8X8; break; + case 16: max_sq_size = BLOCK_16X16; break; + case 32: max_sq_size = BLOCK_32X32; break; + case 64: max_sq_size = BLOCK_64X64; break; + case 128: max_sq_size = BLOCK_128X128; break; + default: assert(0); break; + } + max_sq_size = AOMMIN(max_sq_size, sb_size); + + BLOCK_SIZE min_sq_size = BLOCK_4X4; + switch (cpi->oxcf.min_partition_size) { + case 4: min_sq_size = BLOCK_4X4; break; + case 8: min_sq_size = BLOCK_8X8; break; + case 16: min_sq_size = BLOCK_16X16; break; + case 32: min_sq_size = BLOCK_32X32; break; + case 64: min_sq_size = BLOCK_64X64; break; + case 128: min_sq_size = BLOCK_128X128; break; + default: assert(0); break; + } + + if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) { + float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f }; + + av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features); + max_sq_size = + AOMMIN(av1_predict_max_partition(cpi, x, features), max_sq_size); + } + + min_sq_size = AOMMIN(min_sq_size, max_sq_size); rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size, - &dummy_rdc, INT64_MAX, pc_root, NULL); + max_sq_size, min_sq_size, 
&dummy_rdc, INT64_MAX, + pc_root, NULL); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_partition_time); +#endif } -#if CONFIG_COLLECT_INTER_MODE_RD_STATS // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile. - if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && + if (cpi->sf.inter_mode_rd_model_estimation == 1 && cm->tile_cols == 1 && cm->tile_rows == 1) { av1_inter_mode_data_fit(tile_data, x->rdmult); } -#endif - // Context update for row based multi-threading of encoder is done based on - // the following conditions: - // 1. If mib_size_log2==5, context of top-right superblock is used - // for context modelling. If top-right is not available (in case of tile - // with width == mib_size_log2==5), top superblock's context is used. - // 2. If mib_size_log2==4, context of next superblock to top-right - // superblock is used. Using context of top-right superblock in this case - // gives high BD Rate drop for smaller resolutions. - if (cpi->row_mt == 1) { - int update_context = 0; - if (mib_size_log2 == 5) { - update_context = sb_cols_in_tile == 1 || sb_col_in_tile == 1; - } else if (mib_size_log2 == 4) { - update_context = sb_cols_in_tile == 1 || - (sb_cols_in_tile == 2 && sb_col_in_tile == 1) || - sb_col_in_tile == 2; - } - if (update_context) - memcpy(x->backup_tile_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx)); + if (tile_data->allow_update_cdf && (cpi->row_mt == 1) && + (tile_info->mi_row_end > (mi_row + mib_size))) { + if (sb_cols_in_tile == 1) + memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx)); + else if (sb_col_in_tile >= 1) + memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx, + sizeof(*xd->tile_ctx)); } (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile, sb_cols_in_tile); } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif } static void init_encode_frame_mb_context(AV1_COMP *cpi) { @@ -5193,18 +4910,18 @@ static void 
init_encode_frame_mb_context(AV1_COMP *cpi) { } static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) { - if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME; - // We will not update the golden frame with an internal overlay frame - else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) || - cpi->rc.is_src_frame_ext_arf) + if (frame_is_intra_only(&cpi->common)) { + return INTRA_FRAME; + } else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) || + cpi->rc.is_src_frame_internal_arf) { + // We will not update the golden frame with an internal overlay frame return ALTREF_FRAME; - else if (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || - cpi->refresh_alt_ref_frame) + } else if (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame || + cpi->refresh_alt_ref_frame) { return GOLDEN_FRAME; - else - // TODO(zoeliu): To investigate whether a frame_type other than - // INTRA/ALTREF/GOLDEN/LAST needs to be specified seperately. + } else { return LAST_FRAME; + } } static TX_MODE select_tx_mode(const AV1_COMP *cpi) { @@ -5238,7 +4955,6 @@ void av1_alloc_tile_data(AV1_COMP *cpi) { for (i = 0; i < BLOCK_SIZES_ALL; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = 32; - tile_data->mode_map[i][j] = j; } } } @@ -5296,7 +5012,7 @@ void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes); cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok; - encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); + encode_sb_row(cpi, td, this_tile, mi_row, &tok, cpi->sf.use_nonrd_pick_mode); cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok; cpi->tplist[tile_row][tile_col][sb_row_in_tile].count = @@ -5321,9 +5037,7 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row, const TileInfo *const tile_info = &this_tile->tile_info; int mi_row; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS av1_inter_mode_data_init(this_tile); -#endif 
av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start, tile_info->mi_col_end, tile_row); @@ -5363,28 +5077,12 @@ static void encode_tiles(AV1_COMP *cpi) { cpi->td.intrabc_used = 0; cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx; cpi->td.mb.tile_pb_ctx = &this_tile->tctx; - cpi->td.mb.backup_tile_ctx = &this_tile->backup_tctx; av1_encode_tile(cpi, &cpi->td, tile_row, tile_col); cpi->intrabc_used |= cpi->td.intrabc_used; } } } -#if CONFIG_FP_MB_STATS -static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats, - AV1_COMMON *cm, uint8_t **this_frame_mb_stats) { - uint8_t *mb_stats_in = - firstpass_mb_stats->mb_stats_start + - cm->current_frame.frame_number * cm->MBs * sizeof(uint8_t); - - if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF; - - *this_frame_mb_stats = mb_stats_in; - - return 1; -} -#endif - #define GLOBAL_TRANS_TYPES_ENC 3 // highest motion model to search static int gm_get_params_cost(const WarpedMotionParams *gm, const WarpedMotionParams *ref_gm, int allow_hp) { @@ -5441,123 +5139,73 @@ static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm, (void)frame; switch (sf->gm_search_type) { case GM_FULL_SEARCH: return 1; - case GM_REDUCED_REF_SEARCH: + case GM_REDUCED_REF_SEARCH_SKIP_L2_L3: return !(frame == LAST2_FRAME || frame == LAST3_FRAME); + case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2: + return !(frame == LAST2_FRAME || frame == LAST3_FRAME || + (frame == ALTREF2_FRAME)); case GM_DISABLE_SEARCH: return 0; default: assert(0); } return 1; } -static const uint8_t ref_frame_flag_list[REF_FRAMES] = { 0, - AOM_LAST_FLAG, - AOM_LAST2_FLAG, - AOM_LAST3_FLAG, - AOM_GOLD_FLAG, - AOM_BWD_FLAG, - AOM_ALT2_FLAG, - AOM_ALT_FLAG }; - -// Enforce the number of references for each arbitrary frame limited to -// (INTER_REFS_PER_FRAME - 1) +static int get_max_allowed_ref_frames(const AV1_COMP *cpi) { + const unsigned int max_allowed_refs_for_given_speed = + (cpi->sf.selective_ref_frame >= 3) ? 
INTER_REFS_PER_FRAME - 1 + : INTER_REFS_PER_FRAME; + return AOMMIN(max_allowed_refs_for_given_speed, + cpi->oxcf.max_reference_frames); +} + +// Enforce the number of references for each arbitrary frame based on user +// options and speed. static void enforce_max_ref_frames(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; MV_REFERENCE_FRAME ref_frame; int total_valid_refs = 0; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { total_valid_refs++; + } } - // NOTE(zoeliu): When all the possible reference frames are availble, we - // reduce the number of reference frames by 1, following the rules of: - // (1) Retain GOLDEN_FARME/ALTEF_FRAME; - // (2) Check the earliest 2 remaining reference frames, and remove the one - // with the lower quality factor, otherwise if both have been coded at - // the same quality level, remove the earliest reference frame. - - if (total_valid_refs == INTER_REFS_PER_FRAME) { - unsigned int min_ref_order_hint = UINT_MAX; - unsigned int second_min_ref_order_hint = UINT_MAX; - MV_REFERENCE_FRAME earliest_ref_frames[2] = { LAST3_FRAME, LAST2_FRAME }; - const RefCntBuffer *earliest_bufs[2] = { NULL }; - - // Locate the earliest two reference frames except GOLDEN/ALTREF. 
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - // Retain GOLDEN/ALTERF - if (ref_frame == GOLDEN_FRAME || ref_frame == ALTREF_FRAME) continue; - - const RefCntBuffer *const buf = - cm->current_frame.frame_refs[ref_frame - LAST_FRAME].buf; - if (buf != NULL) { - const unsigned int ref_order_hint = buf->order_hint; - - if (min_ref_order_hint == UINT_MAX) { - min_ref_order_hint = ref_order_hint; - earliest_ref_frames[0] = ref_frame; - earliest_bufs[0] = buf; - } else { - if (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint, - min_ref_order_hint) < 0) { - second_min_ref_order_hint = min_ref_order_hint; - earliest_ref_frames[1] = earliest_ref_frames[0]; - earliest_bufs[1] = earliest_bufs[0]; - - min_ref_order_hint = ref_order_hint; - earliest_ref_frames[0] = ref_frame; - earliest_bufs[0] = buf; - } else if (second_min_ref_order_hint == UINT_MAX || - get_relative_dist(&cm->seq_params.order_hint_info, - ref_order_hint, - second_min_ref_order_hint) < 0) { - second_min_ref_order_hint = ref_order_hint; - earliest_ref_frames[1] = ref_frame; - earliest_bufs[1] = buf; - } - } - } + const int max_allowed_refs = get_max_allowed_ref_frames(cpi); + + // When more than 'max_allowed_refs' are available, we reduce the number of + // reference frames one at a time based on this order. + const MV_REFERENCE_FRAME disable_order[] = { + LAST3_FRAME, + LAST2_FRAME, + ALTREF2_FRAME, + GOLDEN_FRAME, + }; + + for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) { + const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i]; + + if (!(cpi->ref_frame_flags & + av1_ref_frame_flag_list[ref_frame_to_disable])) { + continue; } - // Check the coding quality factors of the two earliest reference frames. 
- RATE_FACTOR_LEVEL ref_rf_level[2]; - double ref_rf_deltas[2]; - for (int i = 0; i < 2; ++i) { - ref_rf_level[i] = earliest_bufs[i]->frame_rf_level; - ref_rf_deltas[i] = rate_factor_deltas[ref_rf_level[i]]; - } - (void)ref_rf_level; - (void)ref_rf_deltas; - -#define USE_RF_LEVEL_TO_ENFORCE 1 -#if USE_RF_LEVEL_TO_ENFORCE - // If both earliest two reference frames are coded using the same rate- - // factor, disable the earliest reference frame; Otherwise disable the - // reference frame that uses a lower rate-factor delta. - const MV_REFERENCE_FRAME ref_frame_to_disable = - (ref_rf_deltas[0] <= ref_rf_deltas[1]) ? earliest_ref_frames[0] - : earliest_ref_frames[1]; -#else - // Always disable the earliest reference frame - const MV_REFERENCE_FRAME ref_frame_to_disable = earliest_ref_frames[0]; -#endif // USE_RF_LEVEL_TO_ENFORCE -#undef USE_RF_LEVEL_TO_ENFORCE switch (ref_frame_to_disable) { - case LAST_FRAME: cpi->ref_frame_flags &= ~AOM_LAST_FLAG; break; - case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break; case LAST3_FRAME: cpi->ref_frame_flags &= ~AOM_LAST3_FLAG; break; - case BWDREF_FRAME: cpi->ref_frame_flags &= ~AOM_BWD_FLAG; break; + case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break; case ALTREF2_FRAME: cpi->ref_frame_flags &= ~AOM_ALT2_FLAG; break; - default: break; + case GOLDEN_FRAME: cpi->ref_frame_flags &= ~AOM_GOLD_FLAG; break; + default: assert(0); } + --total_valid_refs; } + assert(total_valid_refs <= max_allowed_refs); } static INLINE int av1_refs_are_one_sided(const AV1_COMMON *cm) { assert(!frame_is_intra_only(cm)); int one_sided_refs = 1; - for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) { - const RefCntBuffer *const buf = cm->current_frame.frame_refs[ref].buf; + for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); if (buf == NULL) continue; const int ref_order_hint = buf->order_hint; @@ -5577,9 +5225,9 @@ static INLINE void 
get_skip_mode_ref_offsets(const AV1_COMMON *cm, if (!skip_mode_info->skip_mode_allowed) return; const RefCntBuffer *const buf_0 = - cm->current_frame.frame_refs[skip_mode_info->ref_frame_idx_0].buf; + get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_0); const RefCntBuffer *const buf_1 = - cm->current_frame.frame_refs[skip_mode_info->ref_frame_idx_1].buf; + get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_1); assert(buf_0 != NULL && buf_1 != NULL); ref_order_hint[0] = buf_0->order_hint; @@ -5666,9 +5314,10 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_zero(*td->counts); av1_zero(rdc->comp_pred_diff); + // Two pass partition search can be enabled/disabled for different frames. + // Reset this data at frame level to avoid any incorrect usage. + init_first_partition_pass_stats_tables(cpi, x->first_partition_pass_stats); - // Allow intrabc when screen content tools are enabled. - cm->allow_intrabc = cm->allow_screen_content_tools; // Reset the flag. 
cpi->intrabc_used = 0; // Need to disable intrabc when superres is selected @@ -5676,6 +5325,8 @@ static void encode_frame_internal(AV1_COMP *cpi) { cm->allow_intrabc = 0; } + cm->allow_intrabc &= (cpi->oxcf.enable_intrabc); + if (cpi->oxcf.pass != 1 && av1_use_hash_me(cm)) { // add to hash table const int pic_width = cpi->source->y_crop_width; @@ -5760,7 +5411,7 @@ static void encode_frame_internal(AV1_COMP *cpi) { if (xd->lossless[i]) { cpi->optimize_seg_arr[i] = 0; } else { - cpi->optimize_seg_arr[i] = cpi->optimize_speed_feature; + cpi->optimize_seg_arr[i] = cpi->sf.optimize_coefficients; } } cm->coded_lossless = is_coded_lossless(cm, xd); @@ -5775,7 +5426,8 @@ static void encode_frame_internal(AV1_COMP *cpi) { cm->delta_q_info.delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q; cm->delta_q_info.delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF; cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI; - // update delta_q_present_flag and delta_lf_present_flag based on base_qindex + // update delta_q_present_flag and delta_lf_present_flag based on + // base_qindex cm->delta_q_info.delta_q_present_flag &= cm->base_qindex > 0; cm->delta_q_info.delta_lf_present_flag &= cm->base_qindex > 0; @@ -5801,8 +5453,7 @@ static void encode_frame_internal(AV1_COMP *cpi) { aom_clear_system_state(); if (tpl_frame->is_valid) - cpi->rd.r0 = - (double)intra_cost_base / (intra_cost_base + mc_dep_cost_base); + cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; } av1_frame_init_quantizer(cpi); @@ -5815,7 +5466,6 @@ static void encode_frame_internal(AV1_COMP *cpi) { cm->last_frame_seg_map = cm->prev_frame->seg_map; else cm->last_frame_seg_map = NULL; - cm->current_frame_seg_map = cm->cur_frame->seg_map; if (cm->allow_intrabc || cm->coded_lossless) { av1_set_default_ref_deltas(cm->lf.ref_deltas); av1_set_default_mode_deltas(cm->lf.mode_deltas); @@ -5831,14 +5481,17 @@ static void encode_frame_internal(AV1_COMP *cpi) { cm->prev_mi = cm->allow_ref_frame_mvs ? 
cm->prev_mip : NULL; x->txb_split_count = 0; +#if CONFIG_SPEED_STATS + x->tx_search_count = 0; +#endif // CONFIG_SPEED_STATS +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_compute_global_motion_time); +#endif av1_zero(rdc->global_motion_used); av1_zero(cpi->gmparams_cost); -#if !CONFIG_GLOBAL_MOTION_SEARCH - cpi->global_motion_search_done = 1; -#endif // !CONFIG_GLOBAL_MOTION_SEARCH if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source && - !cpi->global_motion_search_done) { + cpi->oxcf.enable_global_motion && !cpi->global_motion_search_done) { YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES]; int frame; double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)]; @@ -5853,7 +5506,9 @@ static void encode_frame_internal(AV1_COMP *cpi) { int num_refs_using_gm = 0; for (frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) { - ref_buf[frame] = get_ref_frame_buffer(cpi, frame); + ref_buf[frame] = NULL; + RefCntBuffer *buf = get_ref_frame_buf(cm, frame); + if (buf != NULL) ref_buf[frame] = &buf->buf; int pframe; cm->global_motion[frame] = default_warp_params; const WarpedMotionParams *ref_params = @@ -5872,15 +5527,26 @@ static void encode_frame_internal(AV1_COMP *cpi) { do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame) && !(cpi->sf.selective_ref_gm && skip_gm_frame(cm, frame))) { TransformationType model; - const int64_t ref_frame_error = - av1_frame_error(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, - ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride, - cpi->source->y_buffer, cpi->source->y_width, - cpi->source->y_height, cpi->source->y_stride); + const int64_t ref_frame_error = av1_frame_error( + is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer, + ref_buf[frame]->y_stride, cpi->source->y_buffer, + cpi->source->y_width, cpi->source->y_height, cpi->source->y_stride); if (ref_frame_error == 0) continue; aom_clear_system_state(); + + // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1 + const int 
do_adaptive_gm_estimation = 0; + + const int ref_frame_dist = get_relative_dist( + &cm->seq_params.order_hint_info, cm->current_frame.order_hint, + cm->cur_frame->ref_order_hints[frame - LAST_FRAME]); + const GlobalMotionEstimationType gm_estimation_type = + cm->seq_params.order_hint_info.enable_order_hint && + abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation + ? GLOBAL_MOTION_DISFLOW_BASED + : GLOBAL_MOTION_FEATURE_BASED; for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) { int64_t best_warp_error = INT64_MAX; // Initially set all params to identity. @@ -5891,8 +5557,8 @@ static void encode_frame_internal(AV1_COMP *cpi) { av1_compute_global_motion(model, cpi->source, ref_buf[frame], cpi->common.seq_params.bit_depth, - inliers_by_motion, params_by_motion, - RANSAC_NUM_MOTIONS); + gm_estimation_type, inliers_by_motion, + params_by_motion, RANSAC_NUM_MOTIONS); for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) { if (inliers_by_motion[i] == 0) continue; @@ -5902,17 +5568,17 @@ static void encode_frame_internal(AV1_COMP *cpi) { if (tmp_wm_params.wmtype != IDENTITY) { const int64_t warp_error = av1_refine_integerized_param( - &tmp_wm_params, tmp_wm_params.wmtype, - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd, - ref_buf[frame]->y_buffer, ref_buf[frame]->y_width, + &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd), + xd->bd, ref_buf[frame]->y_buffer, ref_buf[frame]->y_width, ref_buf[frame]->y_height, ref_buf[frame]->y_stride, cpi->source->y_buffer, cpi->source->y_width, cpi->source->y_height, cpi->source->y_stride, 5, best_warp_error); if (warp_error < best_warp_error) { best_warp_error = warp_error; - // Save the wm_params modified by av1_refine_integerized_param() - // rather than motion index to avoid rerunning refine() below. + // Save the wm_params modified by + // av1_refine_integerized_param() rather than motion index to + // avoid rerunning refine() below. 
memcpy(&(cm->global_motion[frame]), &tmp_wm_params, sizeof(WarpedMotionParams)); } @@ -5956,7 +5622,7 @@ static void encode_frame_internal(AV1_COMP *cpi) { // clear disabled ref_frames for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) { const int ref_disabled = - !(cpi->ref_frame_flags & ref_frame_flag_list[frame]); + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]); if (ref_disabled && cpi->sf.recode_loop != DISALLOW_RECODE) { cpi->gmparams_cost[frame] = 0; cm->global_motion[frame] = default_warp_params; @@ -5966,8 +5632,17 @@ static void encode_frame_internal(AV1_COMP *cpi) { } memcpy(cm->cur_frame->global_motion, cm->global_motion, REF_FRAMES * sizeof(WarpedMotionParams)); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_compute_global_motion_time); +#endif +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_setup_motion_field_time); +#endif av1_setup_motion_field(cm); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_setup_motion_field_time); +#endif cpi->all_one_sided_refs = frame_is_intra_only(cm) ? 0 : av1_refs_are_one_sided(cm); @@ -5976,16 +5651,6 @@ static void encode_frame_internal(AV1_COMP *cpi) { check_skip_mode_enabled(cpi); { - struct aom_usec_timer emr_timer; - aom_usec_timer_start(&emr_timer); - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm, - &cpi->twopass.this_frame_mb_stats); - } -#endif - cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read_dummy; cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write_dummy; cpi->row_mt = 0; @@ -6000,9 +5665,6 @@ static void encode_frame_internal(AV1_COMP *cpi) { else encode_tiles(cpi); } - - aom_usec_timer_mark(&emr_timer); - cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer); } // If intrabc is allowed but never selected, reset the allow_intrabc flag. 
@@ -6016,21 +5678,7 @@ void av1_encode_frame(AV1_COMP *cpi) { const int num_planes = av1_num_planes(cm); // Indicates whether or not to use a default reduced set for ext-tx // rather than the potential full set of 16 transforms - cm->reduced_tx_set_used = 0; - - if (cm->show_frame == 0) { - int arf_offset = AOMMIN( - (MAX_GF_INTERVAL - 1), - cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]); - int brf_offset = - cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index]; - arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset); - current_frame->order_hint = current_frame->frame_number + arf_offset; - } else { - current_frame->order_hint = current_frame->frame_number; - } - current_frame->order_hint %= - (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1)); + cm->reduced_tx_set_used = cpi->oxcf.reduced_tx_type_set; // Make sure segment_id is no larger than last_active_segid. if (cm->seg.enabled && cm->seg.update_map) { @@ -6047,7 +5695,7 @@ void av1_encode_frame(AV1_COMP *cpi) { } av1_setup_frame_buf_refs(cm); - if (cpi->sf.selective_ref_frame >= 3) enforce_max_ref_frames(cpi); + enforce_max_ref_frames(cpi); av1_setup_frame_sign_bias(cm); #if CONFIG_MISMATCH_DEBUG @@ -6056,8 +5704,6 @@ void av1_encode_frame(AV1_COMP *cpi) { (void)num_planes; #endif - cpi->allow_comp_inter_inter = !frame_is_intra_only(cm); - if (cpi->sf.frame_parameter_update) { int i; RD_OPT *const rd_opt = &cpi->rd; @@ -6079,7 +5725,7 @@ void av1_encode_frame(AV1_COMP *cpi) { /* prediction (compound, single or hybrid) mode selection */ // NOTE: "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames - if (is_alt_ref || !cpi->allow_comp_inter_inter) + if (is_alt_ref || frame_is_intra_only(cm)) current_frame->reference_mode = SINGLE_REFERENCE; else current_frame->reference_mode = REFERENCE_MODE_SELECT; @@ -6106,7 +5752,8 @@ void av1_encode_frame(AV1_COMP *cpi) { #endif // CONFIG_ENTROPY_STATS } } - // Re-check on the skip mode status as reference 
mode may have been changed. + // Re-check on the skip mode status as reference mode may have been + // changed. SkipModeInfo *const skip_mode_info = ¤t_frame->skip_mode_info; if (frame_is_intra_only(cm) || current_frame->reference_mode == SINGLE_REFERENCE) { @@ -6287,8 +5934,7 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, const int mi_height = mi_size_high[bsize]; const int is_inter = is_inter_block(mbmi); - if (cpi->sf.mode_pruning_based_on_two_pass_partition_search && - x->cb_partition_scan) { + if (cpi->two_pass_partition_search && x->cb_partition_scan) { for (int row = mi_row; row < mi_row + mi_width; row += FIRST_PARTITION_PASS_SAMPLE_REGION) { for (int col = mi_col; col < mi_col + mi_height; @@ -6302,8 +5948,15 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, if (stats->ref0_counts[mbmi->ref_frame[0]] < 255) ++stats->ref0_counts[mbmi->ref_frame[0]]; if (mbmi->ref_frame[1] >= 0 && - stats->ref1_counts[mbmi->ref_frame[0]] < 255) + stats->ref1_counts[mbmi->ref_frame[1]] < 255) ++stats->ref1_counts[mbmi->ref_frame[1]]; + if (cpi->sf.use_first_partition_pass_interintra_stats) { + // Increase the counter for interintra_motion_mode_count + if (mbmi->motion_mode == 0 && mbmi->ref_frame[1] == INTRA_FRAME && + stats->interintra_motion_mode_count[mbmi->ref_frame[0]] < 255) { + ++stats->interintra_motion_mode_count[mbmi->ref_frame[0]]; + } + } } } } @@ -6351,15 +6004,19 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data, set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); for (ref = 0; ref < 1 + is_compound; ++ref) { - YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]); + const YV12_BUFFER_CONFIG *cfg = + get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]); assert(IMPLIES(!is_intrabc_block(mbmi), cfg)); av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col, - &xd->block_refs[ref]->sf, num_planes); + xd->block_ref_scale_factors[ref], 
num_planes); } - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); - if (mbmi->motion_mode == OBMC_CAUSAL) + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + if (mbmi->motion_mode == OBMC_CAUSAL) { + assert(cpi->oxcf.enable_obmc == 1); av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); + } #if CONFIG_MISMATCH_DEBUG if (dry_run == OUTPUT_ENABLED) { diff --git a/libaom/av1/encoder/encodemb.c b/libaom/av1/encoder/encodemb.c index e0c0370..8e9da61 100644 --- a/libaom/av1/encoder/encodemb.c +++ b/libaom/av1/encoder/encodemb.c @@ -43,7 +43,7 @@ static void subtract_block(const MACROBLOCKD *xd, int rows, int cols, const uint8_t *src8, ptrdiff_t src_stride, const uint8_t *pred8, ptrdiff_t pred_stride) { if (check_subtract_block_size(rows, cols)) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8, src_stride, pred8, pred_stride, xd->bd); return; @@ -54,7 +54,7 @@ static void subtract_block(const MACROBLOCKD *xd, int rows, int cols, return; } - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8, pred_stride, xd->bd); return; @@ -111,16 +111,15 @@ int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane, return eob; } - (void)fast_mode; return av1_optimize_txb_new(cpi, mb, plane, block, tx_size, tx_type, txb_ctx, - rate_cost, cpi->oxcf.sharpness); + rate_cost, cpi->oxcf.sharpness, fast_mode); } -typedef enum QUANT_FUNC { +enum { QUANT_FUNC_LOWBD = 0, QUANT_FUNC_HIGHBD = 1, QUANT_FUNC_TYPES = 2 -} QUANT_FUNC; +} UENUM1BYTE(QUANT_FUNC); static AV1_QUANT_FACADE quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = { @@ -163,6 +162,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, qparam.tx_size = tx_size; qparam.qmatrix = qmatrix; qparam.iqmatrix 
= iqmatrix; + qparam.use_quant_b_adapt = cm->use_quant_b_adapt; TxfmParam txfm_param; txfm_param.tx_type = tx_type; txfm_param.tx_size = tx_size; @@ -171,7 +171,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, txfm_param.tx_size, is_inter_block(mbmi), cm->reduced_tx_set_used); txfm_param.bd = xd->bd; - txfm_param.is_hbd = get_bitdepth_data_path_index(xd); + txfm_param.is_hbd = is_cur_buf_hbd(xd); av1_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param); @@ -184,7 +184,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block, av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob); } } - // NOTE: optimize_b_following is ture means av1_optimze_b will be called + // NOTE: optimize_b_following is true means av1_optimze_b will be called // When the condition of doing optimize_b is changed, // this flag need update simultaneously const int optimize_b_following = @@ -226,13 +226,17 @@ static void encode_block(int plane, int block, int blk_row, int blk_col, if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) { TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col, tx_size, cm->reduced_tx_set_used); - if (args->enable_optimize_b) { - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, - tx_size, tx_type, AV1_XFORM_QUANT_FP); + if (args->enable_optimize_b != NO_TRELLIS_OPT) { + av1_xform_quant( + cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, + USE_B_QUANT_NO_TRELLIS && + (args->enable_optimize_b == FINAL_PASS_TRELLIS_OPT) + ? 
AV1_XFORM_QUANT_B + : AV1_XFORM_QUANT_FP); TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); - av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1, - &dummy_rate_cost); + av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, + args->cpi->sf.trellis_eob_fast, &dummy_rate_cost); } else { av1_xform_quant( cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, @@ -255,12 +259,12 @@ static void encode_block(int plane, int block, int blk_row, int blk_col, cm->reduced_tx_set_used); } + // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0 + // case. It is possible that certain collision in hash index would cause + // the assertion failure. To further optimize the rate-distortion + // performance, we need to re-visit this part and enable this assert + // again. if (p->eobs[block] == 0 && plane == 0) { - // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0 - // case. It is possible that certain collision in hash index would cause - // the assertion failure. To further optimize the rate-distortion - // performance, we need to re-visit this part and enable this assert - // again. 
#if 0 if (args->cpi->oxcf.aq_mode == NO_AQ && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) { @@ -431,7 +435,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col, if (p->eobs[block] > 0) { txfm_param.bd = xd->bd; - txfm_param.is_hbd = get_bitdepth_data_path_index(xd); + txfm_param.is_hbd = is_cur_buf_hbd(xd); txfm_param.tx_type = DCT_DCT; txfm_param.tx_size = tx_size; txfm_param.eob = p->eobs[block]; @@ -578,13 +582,17 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, const ENTROPY_CONTEXT *a = &args->ta[blk_col]; const ENTROPY_CONTEXT *l = &args->tl[blk_row]; - if (args->enable_optimize_b) { - av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, - tx_size, tx_type, AV1_XFORM_QUANT_FP); + if (args->enable_optimize_b != NO_TRELLIS_OPT) { + av1_xform_quant( + cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, + USE_B_QUANT_NO_TRELLIS && + (args->enable_optimize_b == FINAL_PASS_TRELLIS_OPT) + ? AV1_XFORM_QUANT_B + : AV1_XFORM_QUANT_FP); TXB_CTX txb_ctx; get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); - av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1, - &dummy_rate_cost); + av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, + args->cpi->sf.trellis_eob_fast, &dummy_rate_cost); } else { av1_xform_quant( cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, @@ -597,12 +605,12 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col, dst_stride, *eob, cm->reduced_tx_set_used); } + // TODO(jingning): Temporarily disable txk_type check for eob=0 case. + // It is possible that certain collision in hash index would cause + // the assertion failure. To further optimize the rate-distortion + // performance, we need to re-visit this part and enable this assert + // again. if (*eob == 0 && plane == 0) { - // TODO(jingning): Temporarily disable txk_type check for eob=0 case. 
- // It is possible that certain collision in hash index would cause - // the assertion failure. To further optimize the rate-distortion - // performance, we need to re-visit this part and enable this assert - // again. #if 0 if (args->cpi->oxcf.aq_mode == NO_AQ && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) { diff --git a/libaom/av1/encoder/encodemb.h b/libaom/av1/encoder/encodemb.h index 39080de..d4394cf 100644 --- a/libaom/av1/encoder/encodemb.h +++ b/libaom/av1/encoder/encodemb.h @@ -37,13 +37,13 @@ struct encode_b_args { int8_t enable_optimize_b; }; -typedef enum AV1_XFORM_QUANT { +enum { AV1_XFORM_QUANT_FP = 0, AV1_XFORM_QUANT_B = 1, AV1_XFORM_QUANT_DC = 2, AV1_XFORM_QUANT_SKIP_QUANT, AV1_XFORM_QUANT_TYPES, -} AV1_XFORM_QUANT; +} UENUM1BYTE(AV1_XFORM_QUANT); void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, RUN_TYPE dry_run); diff --git a/libaom/av1/encoder/encoder.c b/libaom/av1/encoder/encoder.c index 7652029..818e43c 100644 --- a/libaom/av1/encoder/encoder.c +++ b/libaom/av1/encoder/encoder.c @@ -33,9 +33,9 @@ #include "aom_ports/mem.h" #include "aom_ports/system_state.h" #include "aom_scale/aom_scale.h" -#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#if CONFIG_BITSTREAM_DEBUG #include "aom_util/debug_util.h" -#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#endif // CONFIG_BITSTREAM_DEBUG #include "av1/common/alloccommon.h" #include "av1/common/cdef.h" @@ -54,6 +54,7 @@ #include "av1/encoder/context_tree.h" #include "av1/encoder/encodeframe.h" #include "av1/encoder/encodemv.h" +#include "av1/encoder/encode_strategy.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encodetxb.h" #include "av1/encoder/ethread.h" @@ -61,6 +62,7 @@ #include "av1/encoder/grain_test_vectors.h" #include "av1/encoder/hash_motion.h" #include "av1/encoder/mbgraph.h" +#include "av1/encoder/pass2_strategy.h" #include "av1/encoder/picklpf.h" #include "av1/encoder/pickrst.h" #include "av1/encoder/random.h" @@ 
-69,14 +71,11 @@ #include "av1/encoder/rdopt.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/speed_features.h" -#include "av1/encoder/temporal_filter.h" #include "av1/encoder/reconinter_enc.h" +#include "av1/encoder/var_based_part.h" #define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7 -// av1 uses 10,000,000 ticks/second as time stamp -#define TICKS_PER_SEC 10000000LL - #if CONFIG_ENTROPY_STATS FRAME_COUNTS aggregate_fc; #endif // CONFIG_ENTROPY_STATS @@ -100,30 +99,6 @@ FILE *yuv_rec_file; #define FILE_NAME_LEN 100 #endif -// Estimate if the source frame is screen content, based on the portion of -// blocks that have no more than 4 (experimentally selected) luma colors. -static int is_screen_content(const uint8_t *src, int use_hbd, int bd, - int stride, int width, int height) { - assert(src != NULL); - int counts = 0; - const int blk_w = 16; - const int blk_h = 16; - const int limit = 4; - for (int r = 0; r + blk_h <= height; r += blk_h) { - for (int c = 0; c + blk_w <= width; c += blk_w) { - int count_buf[1 << 12]; // Maximum (1 << 12) color levels. - const int n_colors = - use_hbd ? av1_count_colors_highbd(src + r * stride + c, stride, blk_w, - blk_h, bd, count_buf) - : av1_count_colors(src + r * stride + c, stride, blk_w, blk_h, - count_buf); - if (n_colors > 1 && n_colors <= limit) counts++; - } - } - // The threshold is 10%. - return counts * blk_h * blk_w * 10 > width * height; -} - static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) { switch (mode) { case NORMAL: @@ -269,7 +244,7 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows, // by calculuating the 16x4 Horizontal DCT. This is to be used to // decide the superresolution parameters. 
void analyze_hor_freq(const AV1_COMP *cpi, double *energy) { - uint64_t freq_energy[8] = { 0 }; + uint64_t freq_energy[16] = { 0 }; const YV12_BUFFER_CONFIG *buf = cpi->source; const int bd = cpi->td.mb.e_mbd.bd; const int width = buf->y_crop_width; @@ -283,14 +258,13 @@ void analyze_hor_freq(const AV1_COMP *cpi, double *energy) { for (int j = 0; j < width - 16; j += 16) { av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride, H_DCT, bd); - for (int k = 8; k < 16; ++k) { + for (int k = 1; k < 16; ++k) { const uint64_t this_energy = ((int64_t)coeff[k] * coeff[k]) + ((int64_t)coeff[k + 16] * coeff[k + 16]) + ((int64_t)coeff[k + 32] * coeff[k + 32]) + ((int64_t)coeff[k + 48] * coeff[k + 48]); - freq_energy[k - 8] += - ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8)); + freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8)); } n++; } @@ -305,24 +279,24 @@ void analyze_hor_freq(const AV1_COMP *cpi, double *energy) { src16[ii * 16 + jj] = buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)]; av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd); - for (int k = 8; k < 16; ++k) { + for (int k = 1; k < 16; ++k) { const uint64_t this_energy = ((int64_t)coeff[k] * coeff[k]) + ((int64_t)coeff[k + 16] * coeff[k + 16]) + ((int64_t)coeff[k + 32] * coeff[k + 32]) + ((int64_t)coeff[k + 48] * coeff[k + 48]); - freq_energy[k - 8] += ROUND_POWER_OF_TWO(this_energy, 2); + freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2); } n++; } } } if (n) { - for (int k = 0; k < 8; ++k) energy[k] = (double)freq_energy[k] / n; + for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n; // Convert to cumulative energy - for (int k = 6; k >= 0; --k) energy[k] += energy[k + 1]; + for (int k = 14; k > 0; --k) energy[k] += energy[k + 1]; } else { - for (int k = 0; k < 8; ++k) energy[k] = 1e+20; + for (int k = 1; k < 16; ++k) energy[k] = 1e+20; } } @@ -358,6 +332,9 @@ static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) { // When superres / resize is on, 
'cm->width / height' can change between // calls, so we don't apply this heuristic there. Also, this heuristic gives // compression gain for speed >= 2 only. + // Things break if superblock size changes per-frame which is why this + // heuristic is set based on configured speed rather than actual + // speed-features (which may change per-frame in future) if (cpi->oxcf.superres_mode == SUPERRES_NONE && cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 2) { return (cm->width >= 480 && cm->height >= 360) ? BLOCK_128X128 @@ -375,64 +352,28 @@ static void setup_frame(AV1_COMP *cpi) { // other inter-frames the encoder currently uses only two contexts; // context 1 for ALTREF frames and context 0 for the others. - cm->primary_ref_frame = PRIMARY_REF_NONE; if (frame_is_intra_only(cm) || cm->error_resilient_mode || - cm->force_primary_ref_none) { + cpi->ext_use_primary_ref_none) { av1_setup_past_independence(cm); - for (int i = 0; i < REF_FRAMES; i++) { - cm->fb_of_context_type[i] = -1; - } - cm->fb_of_context_type[REGULAR_FRAME] = - cm->show_frame ? 
get_ref_frame_map_idx(cpi, GOLDEN_FRAME) - : get_ref_frame_map_idx(cpi, ALTREF_FRAME); - cm->frame_context_idx = REGULAR_FRAME; - } else { - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) - cm->frame_context_idx = EXT_ARF_FRAME; - else if (cpi->refresh_alt_ref_frame) - cm->frame_context_idx = ARF_FRAME; - else if (cpi->rc.is_src_frame_alt_ref) - cm->frame_context_idx = OVERLAY_FRAME; - else if (cpi->refresh_golden_frame) - cm->frame_context_idx = GLD_FRAME; - else if (cpi->refresh_bwd_ref_frame) - cm->frame_context_idx = BRF_FRAME; - else - cm->frame_context_idx = REGULAR_FRAME; - int wanted_fb = cm->fb_of_context_type[cm->frame_context_idx]; - for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { - int fb = get_ref_frame_map_idx(cpi, ref_frame); - if (fb == wanted_fb) { - cm->primary_ref_frame = ref_frame - LAST_FRAME; - } - } } if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) { - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 1; - av1_zero(cpi->interp_filter_selected); set_sb_size(&cm->seq_params, select_sb_size(cpi)); } else if (frame_is_sframe(cm)) { - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 1; - av1_zero(cpi->interp_filter_selected); set_sb_size(&cm->seq_params, select_sb_size(cpi)); } else { - if (cm->primary_ref_frame == PRIMARY_REF_NONE || - cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) { + const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm); + if (primary_ref_buf == NULL) { av1_setup_past_independence(cm); cm->seg.update_map = 1; cm->seg.update_data = 1; } else { - *cm->fc = cm->current_frame.frame_refs[cm->primary_ref_frame] - .buf->frame_context; + *cm->fc = primary_ref_buf->frame_context; } - av1_zero(cpi->interp_filter_selected[0]); } - cm->prev_frame = get_prev_frame(cm); + av1_zero(cm->cur_frame->interp_filter_selected); + cm->prev_frame = 
get_primary_ref_frame_buf(cm); cpi->vaq_refresh = 0; } @@ -526,6 +467,20 @@ static void alloc_context_buffers_ext(AV1_COMP *cpi) { aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); } +static void reset_film_grain_chroma_params(aom_film_grain_t *pars) { + pars->num_cr_points = 0; + pars->cr_mult = 0; + pars->cr_luma_mult = 0; + memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr)); + memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr)); + pars->num_cb_points = 0; + pars->cb_mult = 0; + pars->cb_luma_mult = 0; + pars->chroma_scaling_from_luma = 0; + memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb)); + memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb)); +} + static void update_film_grain_parameters(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { AV1_COMMON *const cm = &cpi->common; @@ -543,20 +498,27 @@ static void update_film_grain_parameters(struct AV1_COMP *cpi, memcpy(&cm->film_grain_params, film_grain_test_vectors + oxcf->film_grain_test_vector - 1, sizeof(cm->film_grain_params)); - + if (oxcf->monochrome) + reset_film_grain_chroma_params(&cm->film_grain_params); cm->film_grain_params.bit_depth = cm->seq_params.bit_depth; if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) { cm->film_grain_params.clip_to_restricted_range = 0; } } } else if (oxcf->film_grain_table_filename) { + cm->seq_params.film_grain_params_present = 1; + cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t)); aom_film_grain_table_read(cpi->film_grain_table, oxcf->film_grain_table_filename, &cm->error); } else { +#if CONFIG_DENOISE + cm->seq_params.film_grain_params_present = (cpi->oxcf.noise_level > 0); +#else cm->seq_params.film_grain_params_present = 0; +#endif memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params)); } } @@ -589,10 +551,8 @@ static void dealloc_compressor_data(AV1_COMP *cpi) { aom_free(cpi->td.mb.wsrc_buf); cpi->td.mb.wsrc_buf = NULL; 
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS aom_free(cpi->td.mb.inter_modes_info); cpi->td.mb.inter_modes_info = NULL; -#endif for (int i = 0; i < 2; i++) for (int j = 0; j < 2; j++) { @@ -809,7 +769,7 @@ static void configure_static_seg_features(AV1_COMP *cpi) { static void update_reference_segmentation_map(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; MB_MODE_INFO **mi_4x4_ptr = cm->mi_grid_visible; - uint8_t *cache_ptr = cm->current_frame_seg_map; + uint8_t *cache_ptr = cm->cur_frame->seg_map; int row, col; for (row = 0; row < cm->mi_rows; row++) { @@ -827,11 +787,13 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) { const SequenceHeader *const seq_params = &cm->seq_params; const AV1EncoderConfig *oxcf = &cpi->oxcf; - if (!cpi->lookahead) - cpi->lookahead = - av1_lookahead_init(oxcf->width, oxcf->height, seq_params->subsampling_x, - seq_params->subsampling_y, - seq_params->use_highbitdepth, oxcf->lag_in_frames); + if (!cpi->lookahead) { + int is_scale = (oxcf->resize_mode || oxcf->superres_mode); + cpi->lookahead = av1_lookahead_init( + oxcf->width, oxcf->height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + oxcf->lag_in_frames, oxcf->border_in_pixels, is_scale); + } if (!cpi->lookahead) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate lag buffers"); @@ -840,7 +802,7 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) { if (aom_realloc_frame_buffer( &cpi->alt_ref_buffer, oxcf->width, oxcf->height, seq_params->subsampling_x, seq_params->subsampling_y, - seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, + seq_params->use_highbitdepth, oxcf->border_in_pixels, cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); @@ -852,7 +814,7 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) { if (aom_realloc_frame_buffer( &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, 
seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) + cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); @@ -860,21 +822,21 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) { &cpi->trial_frame_rst, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) + AOM_RESTORATION_FRAME_BORDER, cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate trial restored frame buffer"); if (aom_realloc_frame_buffer( &cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) + cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); if (aom_realloc_frame_buffer( &cpi->scaled_last_source, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, - seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, + seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled last source buffer"); @@ -978,10 +940,9 @@ static void update_frame_size(AV1_COMP *cpi) { static void init_buffer_indices(AV1_COMP *cpi) { int fb_idx; for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx) - cpi->remapped_ref_idx[fb_idx] = fb_idx; + cpi->common.remapped_ref_idx[fb_idx] = fb_idx; cpi->rate_index = 0; cpi->rate_size = 0; - cpi->cur_poc = -1; } static INLINE int does_level_match(int width, int height, double fps, @@ -1003,77 +964,58 @@ static void set_bitstream_level_tier(SequenceHeader *seq, 
AV1_COMMON *cm, // and max display sample rates. // Need to add checks for max bit rate, max decoded luma sample rate, header // rate, etc. that are not covered by this function. - (void)oxcf; - BitstreamLevel bl = { 9, 3 }; + AV1_LEVEL level = SEQ_LEVEL_MAX; if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 512, 288, 30.0, 4)) { - bl.major = 2; - bl.minor = 0; + level = SEQ_LEVEL_2_0; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 704, 396, 30.0, 4)) { - bl.major = 2; - bl.minor = 1; + level = SEQ_LEVEL_2_1; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 1088, 612, 30.0, 4)) { - bl.major = 3; - bl.minor = 0; + level = SEQ_LEVEL_3_0; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 1376, 774, 30.0, 4)) { - bl.major = 3; - bl.minor = 1; + level = SEQ_LEVEL_3_1; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 2048, 1152, 30.0, 3)) { - bl.major = 4; - bl.minor = 0; + level = SEQ_LEVEL_4_0; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 2048, 1152, 60.0, 3)) { - bl.major = 4; - bl.minor = 1; + level = SEQ_LEVEL_4_1; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 4096, 2176, 30.0, 2)) { - bl.major = 5; - bl.minor = 0; + level = SEQ_LEVEL_5_0; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 4096, 2176, 60.0, 2)) { - bl.major = 5; - bl.minor = 1; + level = SEQ_LEVEL_5_1; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 4096, 2176, 120.0, 2)) { - bl.major = 5; - bl.minor = 2; + level = SEQ_LEVEL_5_2; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 8192, 4352, 30.0, 2)) { - bl.major = 6; - bl.minor = 0; + level = SEQ_LEVEL_6_0; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 8192, 4352, 60.0, 2)) { - bl.major = 6; - bl.minor = 1; } else if 
(does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 8192, 4352, 120.0, 2)) { - bl.major = 6; - bl.minor = 2; + level = SEQ_LEVEL_6_2; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 16384, 8704, 30.0, 2)) { - bl.major = 7; - bl.minor = 0; + level = SEQ_LEVEL_7_0; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 16384, 8704, 60.0, 2)) { - bl.major = 7; - bl.minor = 1; + level = SEQ_LEVEL_7_1; } else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 16384, 8704, 120.0, 2)) { - bl.major = 7; - bl.minor = 2; + level = SEQ_LEVEL_7_2; } for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { - seq->level[i] = bl; - seq->tier[i] = 0; // setting main tier by default + seq->seq_level_idx[i] = level; // Set the maximum parameters for bitrate and buffer size for this profile, // level, and tier cm->op_params[i].bitrate = max_level_bitrate( - cm->seq_params.profile, major_minor_to_seq_level_idx(seq->level[i]), - seq->tier[i]); + cm->seq_params.profile, seq->seq_level_idx[i], seq->tier[i]); // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the // check if (cm->op_params[i].bitrate == 0) @@ -1106,9 +1048,24 @@ static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, ? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1 : -1; + seq->max_frame_width = + oxcf->forced_max_frame_width ? oxcf->forced_max_frame_width : oxcf->width; + seq->max_frame_height = oxcf->forced_max_frame_height + ? oxcf->forced_max_frame_height + : oxcf->height; + seq->num_bits_width = + (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1; + seq->num_bits_height = + (seq->max_frame_height > 1) ? 
get_msb(seq->max_frame_height - 1) + 1 : 1; + assert(seq->num_bits_width <= 16); + assert(seq->num_bits_height <= 16); + + seq->frame_id_length = FRAME_ID_LENGTH; + seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH; + seq->enable_dual_filter = oxcf->enable_dual_filter; - seq->order_hint_info.enable_jnt_comp = oxcf->enable_jnt_comp; - seq->order_hint_info.enable_jnt_comp &= + seq->order_hint_info.enable_dist_wtd_comp = oxcf->enable_dist_wtd_comp; + seq->order_hint_info.enable_dist_wtd_comp &= seq->order_hint_info.enable_order_hint; seq->order_hint_info.enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs; seq->order_hint_info.enable_ref_frame_mvs &= @@ -1117,10 +1074,10 @@ static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm, seq->enable_cdef = oxcf->enable_cdef; seq->enable_restoration = oxcf->enable_restoration; seq->enable_warped_motion = oxcf->enable_warped_motion; - seq->enable_interintra_compound = 1; - seq->enable_masked_compound = 1; - seq->enable_intra_edge_filter = 1; - seq->enable_filter_intra = 1; + seq->enable_interintra_compound = oxcf->enable_interintra_comp; + seq->enable_masked_compound = oxcf->enable_masked_comp; + seq->enable_intra_edge_filter = oxcf->enable_intra_edge_filter; + seq->enable_filter_intra = oxcf->enable_filter_intra; set_bitstream_level_tier(seq, cm, oxcf); @@ -1317,14 +1274,14 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, static unsigned int fnname##_bits8( \ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred, \ - const JNT_COMP_PARAMS *jcp_param) { \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ jcp_param); \ } \ static unsigned int fnname##_bits10( \ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred, \ - const JNT_COMP_PARAMS *jcp_param) { \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ return fnname(src_ptr, 
source_stride, ref_ptr, ref_stride, second_pred, \ jcp_param) >> \ 2; \ @@ -1332,7 +1289,7 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, static unsigned int fnname##_bits12( \ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred, \ - const JNT_COMP_PARAMS *jcp_param) { \ + const DIST_WTD_COMP_PARAMS *jcp_param) { \ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \ jcp_param) >> \ 4; \ @@ -1406,28 +1363,28 @@ MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16) MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg) MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x128_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x64_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x128_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x16_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x32_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x32_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x64_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x32_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x64_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x16_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x8_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x16_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x8_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x4_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x8_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x4_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x16_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x4_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x32_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x8_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x64_avg) -MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg) 
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg) +MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg) #define HIGHBD_MBFP(BT, MCSDF, MCSVF) \ cpi->fn_ptr[BT].msdf = MCSDF; \ @@ -1536,166 +1493,167 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_8_sub_pixel_variance64x16, aom_highbd_8_sub_pixel_avg_variance64x16, aom_highbd_sad64x16x4d_bits8, - aom_highbd_jnt_sad64x16_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance64x16) + aom_highbd_dist_wtd_sad64x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16) HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8, aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64, aom_highbd_8_sub_pixel_variance16x64, aom_highbd_8_sub_pixel_avg_variance16x64, aom_highbd_sad16x64x4d_bits8, - aom_highbd_jnt_sad16x64_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance16x64) + aom_highbd_dist_wtd_sad16x64_avg_bits8, + 
aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64) HIGHBD_BFP( BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8, aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8, aom_highbd_8_sub_pixel_avg_variance32x8, - aom_highbd_sad32x8x4d_bits8, aom_highbd_jnt_sad32x8_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance32x8) + aom_highbd_sad32x8x4d_bits8, aom_highbd_dist_wtd_sad32x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8) HIGHBD_BFP( BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8, aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32, aom_highbd_8_sub_pixel_avg_variance8x32, - aom_highbd_sad8x32x4d_bits8, aom_highbd_jnt_sad8x32_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance8x32) + aom_highbd_sad8x32x4d_bits8, aom_highbd_dist_wtd_sad8x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32) HIGHBD_BFP( BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8, aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4, aom_highbd_8_sub_pixel_avg_variance16x4, - aom_highbd_sad16x4x4d_bits8, aom_highbd_jnt_sad16x4_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance16x4) + aom_highbd_sad16x4x4d_bits8, aom_highbd_dist_wtd_sad16x4_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4) HIGHBD_BFP( BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8, aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16, aom_highbd_8_sub_pixel_avg_variance4x16, - aom_highbd_sad4x16x4d_bits8, aom_highbd_jnt_sad4x16_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance4x16) + aom_highbd_sad4x16x4d_bits8, aom_highbd_dist_wtd_sad4x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16) HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8, aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16, aom_highbd_8_sub_pixel_variance32x16, aom_highbd_8_sub_pixel_avg_variance32x16, aom_highbd_sad32x16x4d_bits8, - aom_highbd_jnt_sad32x16_avg_bits8, - 
aom_highbd_8_jnt_sub_pixel_avg_variance32x16) + aom_highbd_dist_wtd_sad32x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16) HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8, aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32, aom_highbd_8_sub_pixel_variance16x32, aom_highbd_8_sub_pixel_avg_variance16x32, aom_highbd_sad16x32x4d_bits8, - aom_highbd_jnt_sad16x32_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance16x32) + aom_highbd_dist_wtd_sad16x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32) HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8, aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32, aom_highbd_8_sub_pixel_variance64x32, aom_highbd_8_sub_pixel_avg_variance64x32, aom_highbd_sad64x32x4d_bits8, - aom_highbd_jnt_sad64x32_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance64x32) + aom_highbd_dist_wtd_sad64x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32) HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8, aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64, aom_highbd_8_sub_pixel_variance32x64, aom_highbd_8_sub_pixel_avg_variance32x64, aom_highbd_sad32x64x4d_bits8, - aom_highbd_jnt_sad32x64_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance32x64) + aom_highbd_dist_wtd_sad32x64_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64) HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8, aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32, aom_highbd_8_sub_pixel_variance32x32, aom_highbd_8_sub_pixel_avg_variance32x32, aom_highbd_sad32x32x4d_bits8, - aom_highbd_jnt_sad32x32_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance32x32) + aom_highbd_dist_wtd_sad32x32_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32) HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8, aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64, aom_highbd_8_sub_pixel_variance64x64, aom_highbd_8_sub_pixel_avg_variance64x64, aom_highbd_sad64x64x4d_bits8, - aom_highbd_jnt_sad64x64_avg_bits8, - 
aom_highbd_8_jnt_sub_pixel_avg_variance64x64) + aom_highbd_dist_wtd_sad64x64_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64) HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8, aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16, aom_highbd_8_sub_pixel_variance16x16, aom_highbd_8_sub_pixel_avg_variance16x16, aom_highbd_sad16x16x4d_bits8, - aom_highbd_jnt_sad16x16_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance16x16) + aom_highbd_dist_wtd_sad16x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16) HIGHBD_BFP( BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8, aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8, aom_highbd_8_sub_pixel_avg_variance16x8, - aom_highbd_sad16x8x4d_bits8, aom_highbd_jnt_sad16x8_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance16x8) + aom_highbd_sad16x8x4d_bits8, aom_highbd_dist_wtd_sad16x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8) HIGHBD_BFP( BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8, aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16, aom_highbd_8_sub_pixel_avg_variance8x16, - aom_highbd_sad8x16x4d_bits8, aom_highbd_jnt_sad8x16_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance8x16) - - HIGHBD_BFP(BLOCK_8X8, aom_highbd_sad8x8_bits8, - aom_highbd_sad8x8_avg_bits8, aom_highbd_8_variance8x8, - aom_highbd_8_sub_pixel_variance8x8, - aom_highbd_8_sub_pixel_avg_variance8x8, - aom_highbd_sad8x8x4d_bits8, aom_highbd_jnt_sad8x8_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance8x8) - - HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits8, - aom_highbd_sad8x4_avg_bits8, aom_highbd_8_variance8x4, - aom_highbd_8_sub_pixel_variance8x4, - aom_highbd_8_sub_pixel_avg_variance8x4, - aom_highbd_sad8x4x4d_bits8, aom_highbd_jnt_sad8x4_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance8x4) - - HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits8, - aom_highbd_sad4x8_avg_bits8, aom_highbd_8_variance4x8, - aom_highbd_8_sub_pixel_variance4x8, - 
aom_highbd_8_sub_pixel_avg_variance4x8, - aom_highbd_sad4x8x4d_bits8, aom_highbd_jnt_sad4x8_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance4x8) - - HIGHBD_BFP(BLOCK_4X4, aom_highbd_sad4x4_bits8, - aom_highbd_sad4x4_avg_bits8, aom_highbd_8_variance4x4, - aom_highbd_8_sub_pixel_variance4x4, - aom_highbd_8_sub_pixel_avg_variance4x4, - aom_highbd_sad4x4x4d_bits8, aom_highbd_jnt_sad4x4_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance4x4) + aom_highbd_sad8x16x4d_bits8, aom_highbd_dist_wtd_sad8x16_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16) + + HIGHBD_BFP( + BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8, + aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8, + aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x4d_bits8, + aom_highbd_dist_wtd_sad8x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8) + + HIGHBD_BFP( + BLOCK_8X4, aom_highbd_sad8x4_bits8, aom_highbd_sad8x4_avg_bits8, + aom_highbd_8_variance8x4, aom_highbd_8_sub_pixel_variance8x4, + aom_highbd_8_sub_pixel_avg_variance8x4, aom_highbd_sad8x4x4d_bits8, + aom_highbd_dist_wtd_sad8x4_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4) + + HIGHBD_BFP( + BLOCK_4X8, aom_highbd_sad4x8_bits8, aom_highbd_sad4x8_avg_bits8, + aom_highbd_8_variance4x8, aom_highbd_8_sub_pixel_variance4x8, + aom_highbd_8_sub_pixel_avg_variance4x8, aom_highbd_sad4x8x4d_bits8, + aom_highbd_dist_wtd_sad4x8_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8) HIGHBD_BFP( - BLOCK_128X128, aom_highbd_sad128x128_bits8, - aom_highbd_sad128x128_avg_bits8, aom_highbd_8_variance128x128, - aom_highbd_8_sub_pixel_variance128x128, - aom_highbd_8_sub_pixel_avg_variance128x128, - aom_highbd_sad128x128x4d_bits8, aom_highbd_jnt_sad128x128_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance128x128) + BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8, + aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4, + aom_highbd_8_sub_pixel_avg_variance4x4, 
aom_highbd_sad4x4x4d_bits8, + aom_highbd_dist_wtd_sad4x4_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4) + + HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8, + aom_highbd_sad128x128_avg_bits8, + aom_highbd_8_variance128x128, + aom_highbd_8_sub_pixel_variance128x128, + aom_highbd_8_sub_pixel_avg_variance128x128, + aom_highbd_sad128x128x4d_bits8, + aom_highbd_dist_wtd_sad128x128_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128) HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8, aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64, aom_highbd_8_sub_pixel_variance128x64, aom_highbd_8_sub_pixel_avg_variance128x64, aom_highbd_sad128x64x4d_bits8, - aom_highbd_jnt_sad128x64_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance128x64) + aom_highbd_dist_wtd_sad128x64_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64) HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8, aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128, aom_highbd_8_sub_pixel_variance64x128, aom_highbd_8_sub_pixel_avg_variance64x128, aom_highbd_sad64x128x4d_bits8, - aom_highbd_jnt_sad64x128_avg_bits8, - aom_highbd_8_jnt_sub_pixel_avg_variance64x128) + aom_highbd_dist_wtd_sad64x128_avg_bits8, + aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128) HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8, aom_highbd_8_masked_sub_pixel_variance128x128) @@ -1815,148 +1773,148 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_10_sub_pixel_variance64x16, aom_highbd_10_sub_pixel_avg_variance64x16, aom_highbd_sad64x16x4d_bits10, - aom_highbd_jnt_sad64x16_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance64x16); + aom_highbd_dist_wtd_sad64x16_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16); HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10, aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64, aom_highbd_10_sub_pixel_variance16x64, aom_highbd_10_sub_pixel_avg_variance16x64, 
aom_highbd_sad16x64x4d_bits10, - aom_highbd_jnt_sad16x64_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance16x64); + aom_highbd_dist_wtd_sad16x64_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64); HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10, aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8, aom_highbd_10_sub_pixel_variance32x8, aom_highbd_10_sub_pixel_avg_variance32x8, aom_highbd_sad32x8x4d_bits10, - aom_highbd_jnt_sad32x8_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance32x8); + aom_highbd_dist_wtd_sad32x8_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8); HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10, aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32, aom_highbd_10_sub_pixel_variance8x32, aom_highbd_10_sub_pixel_avg_variance8x32, aom_highbd_sad8x32x4d_bits10, - aom_highbd_jnt_sad8x32_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance8x32); + aom_highbd_dist_wtd_sad8x32_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32); HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10, aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4, aom_highbd_10_sub_pixel_variance16x4, aom_highbd_10_sub_pixel_avg_variance16x4, aom_highbd_sad16x4x4d_bits10, - aom_highbd_jnt_sad16x4_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance16x4); + aom_highbd_dist_wtd_sad16x4_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4); HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10, aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16, aom_highbd_10_sub_pixel_variance4x16, aom_highbd_10_sub_pixel_avg_variance4x16, aom_highbd_sad4x16x4d_bits10, - aom_highbd_jnt_sad4x16_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance4x16); + aom_highbd_dist_wtd_sad4x16_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16); HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10, aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16, aom_highbd_10_sub_pixel_variance32x16, 
aom_highbd_10_sub_pixel_avg_variance32x16, aom_highbd_sad32x16x4d_bits10, - aom_highbd_jnt_sad32x16_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance32x16); + aom_highbd_dist_wtd_sad32x16_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16); HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10, aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32, aom_highbd_10_sub_pixel_variance16x32, aom_highbd_10_sub_pixel_avg_variance16x32, aom_highbd_sad16x32x4d_bits10, - aom_highbd_jnt_sad16x32_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance16x32); + aom_highbd_dist_wtd_sad16x32_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32); HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10, aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32, aom_highbd_10_sub_pixel_variance64x32, aom_highbd_10_sub_pixel_avg_variance64x32, aom_highbd_sad64x32x4d_bits10, - aom_highbd_jnt_sad64x32_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance64x32); + aom_highbd_dist_wtd_sad64x32_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32); HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10, aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64, aom_highbd_10_sub_pixel_variance32x64, aom_highbd_10_sub_pixel_avg_variance32x64, aom_highbd_sad32x64x4d_bits10, - aom_highbd_jnt_sad32x64_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance32x64); + aom_highbd_dist_wtd_sad32x64_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64); HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10, aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32, aom_highbd_10_sub_pixel_variance32x32, aom_highbd_10_sub_pixel_avg_variance32x32, aom_highbd_sad32x32x4d_bits10, - aom_highbd_jnt_sad32x32_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance32x32); + aom_highbd_dist_wtd_sad32x32_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32); HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10, aom_highbd_sad64x64_avg_bits10, 
aom_highbd_10_variance64x64, aom_highbd_10_sub_pixel_variance64x64, aom_highbd_10_sub_pixel_avg_variance64x64, aom_highbd_sad64x64x4d_bits10, - aom_highbd_jnt_sad64x64_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance64x64); + aom_highbd_dist_wtd_sad64x64_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64); HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10, aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16, aom_highbd_10_sub_pixel_variance16x16, aom_highbd_10_sub_pixel_avg_variance16x16, aom_highbd_sad16x16x4d_bits10, - aom_highbd_jnt_sad16x16_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance16x16); + aom_highbd_dist_wtd_sad16x16_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16); HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10, aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8, aom_highbd_10_sub_pixel_variance16x8, aom_highbd_10_sub_pixel_avg_variance16x8, aom_highbd_sad16x8x4d_bits10, - aom_highbd_jnt_sad16x8_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance16x8); + aom_highbd_dist_wtd_sad16x8_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8); HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10, aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16, aom_highbd_10_sub_pixel_variance8x16, aom_highbd_10_sub_pixel_avg_variance8x16, aom_highbd_sad8x16x4d_bits10, - aom_highbd_jnt_sad8x16_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance8x16); + aom_highbd_dist_wtd_sad8x16_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16); HIGHBD_BFP( BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10, aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8, aom_highbd_10_sub_pixel_avg_variance8x8, - aom_highbd_sad8x8x4d_bits10, aom_highbd_jnt_sad8x8_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance8x8); + aom_highbd_sad8x8x4d_bits10, aom_highbd_dist_wtd_sad8x8_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8); HIGHBD_BFP( BLOCK_8X4, 
aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10, aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4, aom_highbd_10_sub_pixel_avg_variance8x4, - aom_highbd_sad8x4x4d_bits10, aom_highbd_jnt_sad8x4_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance8x4); + aom_highbd_sad8x4x4d_bits10, aom_highbd_dist_wtd_sad8x4_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4); HIGHBD_BFP( BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10, aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8, aom_highbd_10_sub_pixel_avg_variance4x8, - aom_highbd_sad4x8x4d_bits10, aom_highbd_jnt_sad4x8_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance4x8); + aom_highbd_sad4x8x4d_bits10, aom_highbd_dist_wtd_sad4x8_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8); HIGHBD_BFP( BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10, aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4, aom_highbd_10_sub_pixel_avg_variance4x4, - aom_highbd_sad4x4x4d_bits10, aom_highbd_jnt_sad4x4_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance4x4); + aom_highbd_sad4x4x4d_bits10, aom_highbd_dist_wtd_sad4x4_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4); HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10, aom_highbd_sad128x128_avg_bits10, @@ -1964,24 +1922,26 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_10_sub_pixel_variance128x128, aom_highbd_10_sub_pixel_avg_variance128x128, aom_highbd_sad128x128x4d_bits10, - aom_highbd_jnt_sad128x128_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance128x128); - - HIGHBD_BFP( - BLOCK_128X64, aom_highbd_sad128x64_bits10, - aom_highbd_sad128x64_avg_bits10, aom_highbd_10_variance128x64, - aom_highbd_10_sub_pixel_variance128x64, - aom_highbd_10_sub_pixel_avg_variance128x64, - aom_highbd_sad128x64x4d_bits10, aom_highbd_jnt_sad128x64_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance128x64); - - HIGHBD_BFP( - BLOCK_64X128, 
aom_highbd_sad64x128_bits10, - aom_highbd_sad64x128_avg_bits10, aom_highbd_10_variance64x128, - aom_highbd_10_sub_pixel_variance64x128, - aom_highbd_10_sub_pixel_avg_variance64x128, - aom_highbd_sad64x128x4d_bits10, aom_highbd_jnt_sad64x128_avg_bits10, - aom_highbd_10_jnt_sub_pixel_avg_variance64x128); + aom_highbd_dist_wtd_sad128x128_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128); + + HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10, + aom_highbd_sad128x64_avg_bits10, + aom_highbd_10_variance128x64, + aom_highbd_10_sub_pixel_variance128x64, + aom_highbd_10_sub_pixel_avg_variance128x64, + aom_highbd_sad128x64x4d_bits10, + aom_highbd_dist_wtd_sad128x64_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64); + + HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10, + aom_highbd_sad64x128_avg_bits10, + aom_highbd_10_variance64x128, + aom_highbd_10_sub_pixel_variance64x128, + aom_highbd_10_sub_pixel_avg_variance64x128, + aom_highbd_sad64x128x4d_bits10, + aom_highbd_dist_wtd_sad64x128_avg_bits10, + aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128); HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10, aom_highbd_10_masked_sub_pixel_variance128x128) @@ -2107,148 +2067,148 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_12_sub_pixel_variance64x16, aom_highbd_12_sub_pixel_avg_variance64x16, aom_highbd_sad64x16x4d_bits12, - aom_highbd_jnt_sad64x16_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance64x16); + aom_highbd_dist_wtd_sad64x16_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16); HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12, aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64, aom_highbd_12_sub_pixel_variance16x64, aom_highbd_12_sub_pixel_avg_variance16x64, aom_highbd_sad16x64x4d_bits12, - aom_highbd_jnt_sad16x64_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance16x64); + aom_highbd_dist_wtd_sad16x64_avg_bits12, + 
aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64); HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12, aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8, aom_highbd_12_sub_pixel_variance32x8, aom_highbd_12_sub_pixel_avg_variance32x8, aom_highbd_sad32x8x4d_bits12, - aom_highbd_jnt_sad32x8_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance32x8); + aom_highbd_dist_wtd_sad32x8_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8); HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12, aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32, aom_highbd_12_sub_pixel_variance8x32, aom_highbd_12_sub_pixel_avg_variance8x32, aom_highbd_sad8x32x4d_bits12, - aom_highbd_jnt_sad8x32_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance8x32); + aom_highbd_dist_wtd_sad8x32_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32); HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12, aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4, aom_highbd_12_sub_pixel_variance16x4, aom_highbd_12_sub_pixel_avg_variance16x4, aom_highbd_sad16x4x4d_bits12, - aom_highbd_jnt_sad16x4_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance16x4); + aom_highbd_dist_wtd_sad16x4_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4); HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12, aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16, aom_highbd_12_sub_pixel_variance4x16, aom_highbd_12_sub_pixel_avg_variance4x16, aom_highbd_sad4x16x4d_bits12, - aom_highbd_jnt_sad4x16_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance4x16); + aom_highbd_dist_wtd_sad4x16_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16); HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12, aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16, aom_highbd_12_sub_pixel_variance32x16, aom_highbd_12_sub_pixel_avg_variance32x16, aom_highbd_sad32x16x4d_bits12, - aom_highbd_jnt_sad32x16_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance32x16); + 
aom_highbd_dist_wtd_sad32x16_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16); HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12, aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32, aom_highbd_12_sub_pixel_variance16x32, aom_highbd_12_sub_pixel_avg_variance16x32, aom_highbd_sad16x32x4d_bits12, - aom_highbd_jnt_sad16x32_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance16x32); + aom_highbd_dist_wtd_sad16x32_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32); HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12, aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32, aom_highbd_12_sub_pixel_variance64x32, aom_highbd_12_sub_pixel_avg_variance64x32, aom_highbd_sad64x32x4d_bits12, - aom_highbd_jnt_sad64x32_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance64x32); + aom_highbd_dist_wtd_sad64x32_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32); HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12, aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64, aom_highbd_12_sub_pixel_variance32x64, aom_highbd_12_sub_pixel_avg_variance32x64, aom_highbd_sad32x64x4d_bits12, - aom_highbd_jnt_sad32x64_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance32x64); + aom_highbd_dist_wtd_sad32x64_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64); HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12, aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32, aom_highbd_12_sub_pixel_variance32x32, aom_highbd_12_sub_pixel_avg_variance32x32, aom_highbd_sad32x32x4d_bits12, - aom_highbd_jnt_sad32x32_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance32x32); + aom_highbd_dist_wtd_sad32x32_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32); HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12, aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64, aom_highbd_12_sub_pixel_variance64x64, aom_highbd_12_sub_pixel_avg_variance64x64, aom_highbd_sad64x64x4d_bits12, - 
aom_highbd_jnt_sad64x64_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance64x64); + aom_highbd_dist_wtd_sad64x64_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64); HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12, aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16, aom_highbd_12_sub_pixel_variance16x16, aom_highbd_12_sub_pixel_avg_variance16x16, aom_highbd_sad16x16x4d_bits12, - aom_highbd_jnt_sad16x16_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance16x16); + aom_highbd_dist_wtd_sad16x16_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16); HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12, aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8, aom_highbd_12_sub_pixel_variance16x8, aom_highbd_12_sub_pixel_avg_variance16x8, aom_highbd_sad16x8x4d_bits12, - aom_highbd_jnt_sad16x8_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance16x8); + aom_highbd_dist_wtd_sad16x8_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8); HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12, aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16, aom_highbd_12_sub_pixel_variance8x16, aom_highbd_12_sub_pixel_avg_variance8x16, aom_highbd_sad8x16x4d_bits12, - aom_highbd_jnt_sad8x16_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance8x16); + aom_highbd_dist_wtd_sad8x16_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16); HIGHBD_BFP( BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12, aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8, aom_highbd_12_sub_pixel_avg_variance8x8, - aom_highbd_sad8x8x4d_bits12, aom_highbd_jnt_sad8x8_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance8x8); + aom_highbd_sad8x8x4d_bits12, aom_highbd_dist_wtd_sad8x8_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8); HIGHBD_BFP( BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12, aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4, 
aom_highbd_12_sub_pixel_avg_variance8x4, - aom_highbd_sad8x4x4d_bits12, aom_highbd_jnt_sad8x4_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance8x4); + aom_highbd_sad8x4x4d_bits12, aom_highbd_dist_wtd_sad8x4_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4); HIGHBD_BFP( BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12, aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8, aom_highbd_12_sub_pixel_avg_variance4x8, - aom_highbd_sad4x8x4d_bits12, aom_highbd_jnt_sad4x8_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance4x8); + aom_highbd_sad4x8x4d_bits12, aom_highbd_dist_wtd_sad4x8_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8); HIGHBD_BFP( BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12, aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4, aom_highbd_12_sub_pixel_avg_variance4x4, - aom_highbd_sad4x4x4d_bits12, aom_highbd_jnt_sad4x4_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance4x4); + aom_highbd_sad4x4x4d_bits12, aom_highbd_dist_wtd_sad4x4_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4); HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12, aom_highbd_sad128x128_avg_bits12, @@ -2256,24 +2216,26 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) { aom_highbd_12_sub_pixel_variance128x128, aom_highbd_12_sub_pixel_avg_variance128x128, aom_highbd_sad128x128x4d_bits12, - aom_highbd_jnt_sad128x128_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance128x128); - - HIGHBD_BFP( - BLOCK_128X64, aom_highbd_sad128x64_bits12, - aom_highbd_sad128x64_avg_bits12, aom_highbd_12_variance128x64, - aom_highbd_12_sub_pixel_variance128x64, - aom_highbd_12_sub_pixel_avg_variance128x64, - aom_highbd_sad128x64x4d_bits12, aom_highbd_jnt_sad128x64_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance128x64); - - HIGHBD_BFP( - BLOCK_64X128, aom_highbd_sad64x128_bits12, - aom_highbd_sad64x128_avg_bits12, aom_highbd_12_variance64x128, - aom_highbd_12_sub_pixel_variance64x128, 
- aom_highbd_12_sub_pixel_avg_variance64x128, - aom_highbd_sad64x128x4d_bits12, aom_highbd_jnt_sad64x128_avg_bits12, - aom_highbd_12_jnt_sub_pixel_avg_variance64x128); + aom_highbd_dist_wtd_sad128x128_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128); + + HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12, + aom_highbd_sad128x64_avg_bits12, + aom_highbd_12_variance128x64, + aom_highbd_12_sub_pixel_variance128x64, + aom_highbd_12_sub_pixel_avg_variance128x64, + aom_highbd_sad128x64x4d_bits12, + aom_highbd_dist_wtd_sad128x64_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64); + + HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12, + aom_highbd_sad64x128_avg_bits12, + aom_highbd_12_variance64x128, + aom_highbd_12_sub_pixel_variance64x128, + aom_highbd_12_sub_pixel_avg_variance64x128, + aom_highbd_sad64x128x4d_bits12, + aom_highbd_dist_wtd_sad64x128_avg_bits12, + aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128); HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12, aom_highbd_12_masked_sub_pixel_variance128x128) @@ -2433,6 +2395,16 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { assert(IMPLIES(seq_params->profile <= PROFILE_1, seq_params->bit_depth <= AOM_BITS_10)); + memcpy(cpi->target_seq_level_idx, oxcf->target_seq_level_idx, + sizeof(cpi->target_seq_level_idx)); + cpi->keep_level_stats = 0; + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + if (cpi->target_seq_level_idx[i] < SEQ_LEVELS) { + cpi->keep_level_stats = 1; + break; + } + } + cm->timing_info_present = oxcf->timing_info_present; cm->timing_info.num_units_in_display_tick = oxcf->timing_info.num_units_in_display_tick; @@ -2541,6 +2513,8 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { // Superblock size should not be updated after the first key frame. 
if (!cpi->seq_params_locked) { set_sb_size(&cm->seq_params, select_sb_size(cpi)); + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) + seq_params->tier[i] = (oxcf->tier_mask >> i) & 1; } if (cpi->initial_width || sb_size != seq_params->sb_size) { @@ -2558,10 +2532,6 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { cpi->alt_ref_source = NULL; rc->is_src_frame_alt_ref = 0; - rc->is_bwd_ref_frame = 0; - rc->is_last_bipred_frame = 0; - rc->is_bipred_frame = 0; - set_tile_info(cpi); cpi->ext_refresh_frame_flags_pending = 0; @@ -2578,6 +2548,21 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) { } } +static void init_level_info(AV1LevelInfo *level_info) { + memset(level_info, 0, MAX_NUM_OPERATING_POINTS * sizeof(*level_info)); + for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) { + AV1LevelSpec *const level_spec = &level_info[i].level_spec; + level_spec->level = SEQ_LEVEL_MAX; + AV1LevelStats *const level_stats = &level_info[i].level_stats; + level_stats->min_cropped_tile_width = INT_MAX; + level_stats->min_cropped_tile_height = INT_MAX; + level_stats->min_frame_width = INT_MAX; + level_stats->min_frame_height = INT_MAX; + level_stats->tile_width_is_valid = 1; + level_stats->min_cr = 1e8; + } +} + AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BufferPool *const pool) { unsigned int i; @@ -2620,10 +2605,11 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc); cm->current_frame.frame_number = 0; + cm->current_frame_id = -1; cpi->seq_params_locked = 0; cpi->partition_search_skippable_frame = 0; cpi->tile_data = NULL; - cpi->last_show_frame_buf_idx = INVALID_IDX; + cpi->last_show_frame_buf = NULL; realloc_segmentation_maps(cpi); memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs)); @@ -2636,19 +2622,10 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, aom_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } -#if CONFIG_FP_MB_STATS - 
cpi->use_fp_mb_stats = 0; - if (cpi->use_fp_mb_stats) { - // a place holder used to store the first pass mb stats in the first pass - CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf, - aom_calloc(cm->MBs * sizeof(uint8_t), 1)); - } else { - cpi->twopass.frame_mb_stats_buf = NULL; - } -#endif - cpi->refresh_alt_ref_frame = 0; + init_level_info(cpi->level_info); + cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS cpi->b_calculate_blockiness = 1; @@ -2659,6 +2636,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, cpi->count = 0; cpi->bytes = 0; +#if CONFIG_SPEED_STATS + cpi->tx_search_count = 0; +#endif // CONFIG_SPEED_STATS if (cpi->b_calculate_psnr) { cpi->total_sq_error = 0; @@ -2707,19 +2687,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - const size_t psz = cpi->common.MBs * sizeof(uint8_t); - const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz); - - cpi->twopass.firstpass_mb_stats.mb_stats_start = - oxcf->firstpass_mb_stats_in.buf; - cpi->twopass.firstpass_mb_stats.mb_stats_end = - cpi->twopass.firstpass_mb_stats.mb_stats_start + - (ps - 1) * cpi->common.MBs * sizeof(uint8_t); - } -#endif - cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf; cpi->twopass.stats_in = cpi->twopass.stats_in_start; cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1]; @@ -2740,11 +2707,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, (int32_t *)aom_memalign( 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf))); -#if CONFIG_COLLECT_INTER_MODE_RD_STATS CHECK_MEM_ERROR( cm, cpi->td.mb.inter_modes_info, (InterModesInfo *)aom_malloc(sizeof(*cpi->td.mb.inter_modes_info))); -#endif for (int x = 0; x < 2; x++) for (int y = 0; y < 2; y++) @@ -2759,8 +2724,8 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, (int32_t *)aom_memalign( 
16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf))); - av1_set_speed_features_framesize_independent(cpi); - av1_set_speed_features_framesize_dependent(cpi); + av1_set_speed_features_framesize_independent(cpi, oxcf->speed); + av1_set_speed_features_framesize_dependent(cpi, oxcf->speed); for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) { int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2); @@ -2777,6 +2742,10 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, cpi->tpl_stats[frame].mi_cols = cm->mi_cols; } +#if CONFIG_COLLECT_PARTITION_STATS == 2 + av1_zero(cpi->partition_stats); +#endif + #define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].sdaf = SDAF; \ @@ -2789,103 +2758,109 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf, BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16, aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16, - aom_sad4x16x4d, aom_jnt_sad4x16_avg, aom_jnt_sub_pixel_avg_variance4x16) + aom_sad4x16x4d, aom_dist_wtd_sad4x16_avg, + aom_dist_wtd_sub_pixel_avg_variance4x16) BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4, aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4, - aom_sad16x4x4d, aom_jnt_sad16x4_avg, aom_jnt_sub_pixel_avg_variance16x4) + aom_sad16x4x4d, aom_dist_wtd_sad16x4_avg, + aom_dist_wtd_sub_pixel_avg_variance16x4) BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32, aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32, - aom_sad8x32x4d, aom_jnt_sad8x32_avg, aom_jnt_sub_pixel_avg_variance8x32) + aom_sad8x32x4d, aom_dist_wtd_sad8x32_avg, + aom_dist_wtd_sub_pixel_avg_variance8x32) BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8, aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8, - aom_sad32x8x4d, aom_jnt_sad32x8_avg, aom_jnt_sub_pixel_avg_variance32x8) + aom_sad32x8x4d, aom_dist_wtd_sad32x8_avg, + aom_dist_wtd_sub_pixel_avg_variance32x8) BFP(BLOCK_16X64, 
aom_sad16x64, aom_sad16x64_avg, aom_variance16x64, aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64, - aom_sad16x64x4d, aom_jnt_sad16x64_avg, - aom_jnt_sub_pixel_avg_variance16x64) + aom_sad16x64x4d, aom_dist_wtd_sad16x64_avg, + aom_dist_wtd_sub_pixel_avg_variance16x64) BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16, aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16, - aom_sad64x16x4d, aom_jnt_sad64x16_avg, - aom_jnt_sub_pixel_avg_variance64x16) + aom_sad64x16x4d, aom_dist_wtd_sad64x16_avg, + aom_dist_wtd_sub_pixel_avg_variance64x16) BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128, aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128, - aom_sad128x128x4d, aom_jnt_sad128x128_avg, - aom_jnt_sub_pixel_avg_variance128x128) + aom_sad128x128x4d, aom_dist_wtd_sad128x128_avg, + aom_dist_wtd_sub_pixel_avg_variance128x128) BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64, aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64, - aom_sad128x64x4d, aom_jnt_sad128x64_avg, - aom_jnt_sub_pixel_avg_variance128x64) + aom_sad128x64x4d, aom_dist_wtd_sad128x64_avg, + aom_dist_wtd_sub_pixel_avg_variance128x64) BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128, aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128, - aom_sad64x128x4d, aom_jnt_sad64x128_avg, - aom_jnt_sub_pixel_avg_variance64x128) + aom_sad64x128x4d, aom_dist_wtd_sad64x128_avg, + aom_dist_wtd_sub_pixel_avg_variance64x128) BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16, aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16, - aom_sad32x16x4d, aom_jnt_sad32x16_avg, - aom_jnt_sub_pixel_avg_variance32x16) + aom_sad32x16x4d, aom_dist_wtd_sad32x16_avg, + aom_dist_wtd_sub_pixel_avg_variance32x16) BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32, aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32, - aom_sad16x32x4d, 
aom_jnt_sad16x32_avg, - aom_jnt_sub_pixel_avg_variance16x32) + aom_sad16x32x4d, aom_dist_wtd_sad16x32_avg, + aom_dist_wtd_sub_pixel_avg_variance16x32) BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32, aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32, - aom_sad64x32x4d, aom_jnt_sad64x32_avg, - aom_jnt_sub_pixel_avg_variance64x32) + aom_sad64x32x4d, aom_dist_wtd_sad64x32_avg, + aom_dist_wtd_sub_pixel_avg_variance64x32) BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64, aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64, - aom_sad32x64x4d, aom_jnt_sad32x64_avg, - aom_jnt_sub_pixel_avg_variance32x64) + aom_sad32x64x4d, aom_dist_wtd_sad32x64_avg, + aom_dist_wtd_sub_pixel_avg_variance32x64) BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32, aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32, - aom_sad32x32x4d, aom_jnt_sad32x32_avg, - aom_jnt_sub_pixel_avg_variance32x32) + aom_sad32x32x4d, aom_dist_wtd_sad32x32_avg, + aom_dist_wtd_sub_pixel_avg_variance32x32) BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64, aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64, - aom_sad64x64x4d, aom_jnt_sad64x64_avg, - aom_jnt_sub_pixel_avg_variance64x64) + aom_sad64x64x4d, aom_dist_wtd_sad64x64_avg, + aom_dist_wtd_sub_pixel_avg_variance64x64) BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16, aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16, - aom_sad16x16x4d, aom_jnt_sad16x16_avg, - aom_jnt_sub_pixel_avg_variance16x16) + aom_sad16x16x4d, aom_dist_wtd_sad16x16_avg, + aom_dist_wtd_sub_pixel_avg_variance16x16) BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8, aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8, - aom_sad16x8x4d, aom_jnt_sad16x8_avg, aom_jnt_sub_pixel_avg_variance16x8) + aom_sad16x8x4d, aom_dist_wtd_sad16x8_avg, + aom_dist_wtd_sub_pixel_avg_variance16x8) BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, 
aom_variance8x16, aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16, - aom_sad8x16x4d, aom_jnt_sad8x16_avg, aom_jnt_sub_pixel_avg_variance8x16) + aom_sad8x16x4d, aom_dist_wtd_sad8x16_avg, + aom_dist_wtd_sub_pixel_avg_variance8x16) BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8, aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d, - aom_jnt_sad8x8_avg, aom_jnt_sub_pixel_avg_variance8x8) + aom_dist_wtd_sad8x8_avg, aom_dist_wtd_sub_pixel_avg_variance8x8) BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4, aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d, - aom_jnt_sad8x4_avg, aom_jnt_sub_pixel_avg_variance8x4) + aom_dist_wtd_sad8x4_avg, aom_dist_wtd_sub_pixel_avg_variance8x4) BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8, aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d, - aom_jnt_sad4x8_avg, aom_jnt_sub_pixel_avg_variance4x8) + aom_dist_wtd_sad4x8_avg, aom_dist_wtd_sub_pixel_avg_variance4x8) BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4, aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d, - aom_jnt_sad4x4_avg, aom_jnt_sub_pixel_avg_variance4x4) + aom_dist_wtd_sad4x4_avg, aom_dist_wtd_sub_pixel_avg_variance4x4) #define OBFP(BT, OSDF, OVF, OSVF) \ cpi->fn_ptr[BT].osdf = OSDF; \ @@ -3083,6 +3058,17 @@ void av1_remove_compressor(AV1_COMP *cpi) { fclose(f); } #endif // CONFIG_INTERNAL_STATS +#if CONFIG_SPEED_STATS + if (cpi->oxcf.pass != 1) { + fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count); + } +#endif // CONFIG_SPEED_STATS + +#if CONFIG_COLLECT_PARTITION_STATS == 2 + if (cpi->oxcf.pass != 1) { + av1_print_partition_stats(&cpi->partition_stats); + } +#endif } for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) { @@ -3090,7 +3076,7 @@ void av1_remove_compressor(AV1_COMP *cpi) { cpi->tpl_stats[frame].is_valid = 0; } - for (t = 0; t < cpi->num_workers; ++t) { + for (t = cpi->num_workers - 1; t >= 0; --t) { AVxWorker 
*const worker = &cpi->workers[t]; EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; @@ -3099,7 +3085,7 @@ void av1_remove_compressor(AV1_COMP *cpi) { // Deallocate allocated thread data. if (cpi->row_mt == 1) aom_free(thread_data->td->tctx); - if (t < cpi->num_workers - 1) { + if (t > 0) { aom_free(thread_data->td->palette_buffer); aom_free(thread_data->td->tmp_conv_dst); for (int j = 0; j < 2; ++j) { @@ -3109,9 +3095,7 @@ void av1_remove_compressor(AV1_COMP *cpi) { aom_free(thread_data->td->left_pred_buf); aom_free(thread_data->td->wsrc_buf); -#if CONFIG_COLLECT_INTER_MODE_RD_STATS aom_free(thread_data->td->inter_modes_info); -#endif for (int x = 0; x < 2; x++) { for (int y = 0; y < 2; y++) { aom_free(thread_data->td->hash_value_buffer[x][y]); @@ -3148,12 +3132,6 @@ void av1_remove_compressor(AV1_COMP *cpi) { aom_free(cpi->mbgraph_stats[i].mb_stats); } -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - aom_free(cpi->twopass.frame_mb_stats_buf); - cpi->twopass.frame_mb_stats_buf = NULL; - } -#endif #if CONFIG_INTERNAL_STATS aom_free(cpi->ssim_vars); cpi->ssim_vars = NULL; @@ -3179,7 +3157,7 @@ static void generate_psnr_packet(AV1_COMP *cpi) { struct aom_codec_cx_pkt pkt; int i; PSNR_STATS psnr; - aom_calc_highbd_psnr(cpi->source, cpi->common.frame_to_show, &psnr, + aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr, cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth); for (i = 0; i < 4; ++i) { @@ -3198,15 +3176,6 @@ int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) { return 0; } -void av1_update_reference(AV1_COMP *cpi, int ref_frame_upd_flags) { - cpi->ext_refresh_last_frame = (ref_frame_upd_flags & AOM_LAST_FLAG) != 0; - cpi->ext_refresh_golden_frame = (ref_frame_upd_flags & AOM_GOLD_FLAG) != 0; - cpi->ext_refresh_alt_ref_frame = (ref_frame_upd_flags & AOM_ALT_FLAG) != 0; - cpi->ext_refresh_bwd_ref_frame = (ref_frame_upd_flags & AOM_BWD_FLAG) != 0; - cpi->ext_refresh_alt2_ref_frame = (ref_frame_upd_flags & AOM_ALT2_FLAG) != 
0; - cpi->ext_refresh_frame_flags_pending = 1; -} - int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) { AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); @@ -3269,62 +3238,6 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { } #endif -static void check_show_existing_frame(AV1_COMP *cpi) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - AV1_COMMON *const cm = &cpi->common; - const FRAME_UPDATE_TYPE next_frame_update_type = - gf_group->update_type[gf_group->index]; -#if USE_SYMM_MULTI_LAYER - const int which_arf = (cpi->new_bwdref_update_rule == 1) - ? gf_group->arf_update_idx[gf_group->index] > 0 - : gf_group->arf_update_idx[gf_group->index]; -#else - const int which_arf = gf_group->arf_update_idx[gf_group->index]; -#endif - - if (cm->show_existing_frame == 1) { - cm->show_existing_frame = 0; - } else if (cpi->rc.is_last_bipred_frame) { -#if USE_SYMM_MULTI_LAYER - // NOTE: When new structure is used, every bwdref will have one overlay - // frame. Therefore, there is no need to find out which frame to - // show in advance. - if (cpi->new_bwdref_update_rule == 0) { -#endif - // NOTE: If the current frame is a last bi-predictive frame, it is - // needed next to show the BWDREF_FRAME, which is pointed by - // the last_fb_idxes[0] after reference frame buffer update - cpi->rc.is_last_bipred_frame = 0; - cm->show_existing_frame = 1; - cpi->existing_fb_idx_to_show = cpi->remapped_ref_idx[0]; -#if USE_SYMM_MULTI_LAYER - } -#endif - } else if (cpi->is_arf_filter_off[which_arf] && - (next_frame_update_type == OVERLAY_UPDATE || - next_frame_update_type == INTNL_OVERLAY_UPDATE)) { -#if USE_SYMM_MULTI_LAYER - const int bwdref_to_show = - (cpi->new_bwdref_update_rule == 1) ? 
BWDREF_FRAME : ALTREF2_FRAME; -#else - const int bwdref_to_show = ALTREF2_FRAME; -#endif - // Other parameters related to OVERLAY_UPDATE will be taken care of - // in av1_rc_get_second_pass_params(cpi) - cm->show_existing_frame = 1; - cpi->rc.is_src_frame_alt_ref = 1; - cpi->existing_fb_idx_to_show = - (next_frame_update_type == OVERLAY_UPDATE) - ? get_ref_frame_map_idx(cpi, ALTREF_FRAME) - : get_ref_frame_map_idx(cpi, bwdref_to_show); -#if USE_SYMM_MULTI_LAYER - if (cpi->new_bwdref_update_rule == 0) -#endif - cpi->is_arf_filter_off[which_arf] = 0; - } - cpi->rc.is_src_frame_ext_arf = 0; -} - #ifdef OUTPUT_YUV_REC void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) { uint8_t *src = s->y_buffer; @@ -3433,379 +3346,6 @@ static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q, return force_recode; } -#define DUMP_REF_FRAME_IMAGES 0 - -#if DUMP_REF_FRAME_IMAGES == 1 -static int dump_one_image(AV1_COMMON *cm, - const YV12_BUFFER_CONFIG *const ref_buf, - char *file_name) { - int h; - FILE *f_ref = NULL; - - if (ref_buf == NULL) { - printf("Frame data buffer is NULL.\n"); - return AOM_CODEC_MEM_ERROR; - } - - if ((f_ref = fopen(file_name, "wb")) == NULL) { - printf("Unable to open file %s to write.\n", file_name); - return AOM_CODEC_MEM_ERROR; - } - - // --- Y --- - for (h = 0; h < cm->height; ++h) { - fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref); - } - // --- U --- - for (h = 0; h < (cm->height >> 1); ++h) { - fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), - f_ref); - } - // --- V --- - for (h = 0; h < (cm->height >> 1); ++h) { - fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1), - f_ref); - } - - fclose(f_ref); - - return AOM_CODEC_OK; -} - -static void dump_ref_frame_images(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - MV_REFERENCE_FRAME ref_frame; - - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - char file_name[256] = 
""; - snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv", - cm->current_frame.frame_number, ref_frame); - dump_one_image(cm, get_ref_frame_buffer(cpi, ref_frame), file_name); - } -} -#endif // DUMP_REF_FRAME_IMAGES == 1 - -// This function is used to shift the virtual indices of last reference frames -// as follows: -// LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME -// when the LAST_FRAME is updated. -static INLINE void shift_last_ref_frames(AV1_COMP *cpi) { - // TODO(isbs): shift the scaled indices as well - for (int ref_frame = LAST3_FRAME; ref_frame > LAST_FRAME; --ref_frame) { - const int ref_idx = ref_frame - LAST_FRAME; - cpi->remapped_ref_idx[ref_idx] = cpi->remapped_ref_idx[ref_idx - 1]; - - if (!cpi->rc.is_src_frame_alt_ref) { - memcpy(cpi->interp_filter_selected[ref_frame], - cpi->interp_filter_selected[ref_frame - 1], - sizeof(cpi->interp_filter_selected[ref_frame - 1])); - } - } -} - -#if USE_SYMM_MULTI_LAYER -// This function is used to shift the virtual indices of bwd reference -// frames as follows: -// BWD_REF -> ALT2_REF -> EXT_REF -// to clear a space to store the closest bwdref -static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) { - // TODO(isbs): shift the scaled indices as well - static const int ordered_bwd[3] = { BWDREF_FRAME, ALTREF2_FRAME, - EXTREF_FRAME }; - - for (int i = 2; i > 0; --i) { - // [0] is allocated to the current coded frame, i.e. bwdref - memcpy(cpi->interp_filter_selected[ordered_bwd[i]], - cpi->interp_filter_selected[ordered_bwd[i - 1]], - sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1]])); - - cpi->remapped_ref_idx[ordered_bwd[i] - LAST_FRAME] = - cpi->remapped_ref_idx[ordered_bwd[i - 1] - LAST_FRAME]; - } -} - -// This function is used to shift the virtual indices of bwd reference -// frames as follows: -// BWD_REF <- ALT2_REF <- EXT_REF -// to update the bwd reference frame for coding the next frame. 
-static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) { - // TODO(isbs): shift the scaled indices as well - static const int ordered_bwd[3] = { BWDREF_FRAME, ALTREF2_FRAME, - EXTREF_FRAME }; - - for (int i = 0; i < 2; ++i) { - // [0] is allocated to the current coded frame, i.e. bwdref - memcpy(cpi->interp_filter_selected[ordered_bwd[i]], - cpi->interp_filter_selected[ordered_bwd[i + 1]], - sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1]])); - - cpi->remapped_ref_idx[ordered_bwd[i] - LAST_FRAME] = - cpi->remapped_ref_idx[ordered_bwd[i + 1] - LAST_FRAME]; - } -} -#endif // USE_SYMM_MULTI_LAYER - -static void update_reference_frames(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - - // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., - // for the purpose to verify no mismatch between encoder and decoder. - if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx; - - // In the case of show_existing frame, we will not send fresh flag - // to decoder. Any change in the reference frame buffer can be done by - // switching the virtual indices. - if (cm->show_existing_frame) { - // If we are not indicating to the decoder that this frame is - // a show_existing_frame, which occurs in error_resilient mode, - // we still want to refresh the LAST_FRAME when the current frame - // was the source of an ext_arf. - cpi->refresh_last_frame = - !encode_show_existing_frame(cm) && cpi->rc.is_src_frame_ext_arf; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - - cpi->rc.is_bwd_ref_frame = 0; - cpi->rc.is_last_bipred_frame = 0; - cpi->rc.is_bipred_frame = 0; - } - - BufferPool *const pool = cm->buffer_pool; - - // At this point the new frame has been encoded. - // If any buffer copy / swapping is signaled it should be done here. - - // Only update all of the reference buffers if a KEY_FRAME is also a - // show_frame. 
This ensures a fwd keyframe does not update all of the buffers - if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) || - frame_is_sframe(cm)) { - for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) { - assign_frame_buffer(pool->frame_bufs, - &cm->ref_frame_map[cpi->remapped_ref_idx[ref_frame]], - cm->new_fb_idx); - } - return; - } - - if (av1_preserve_existing_gf(cpi)) { - // We have decided to preserve the previously existing golden frame as our - // new ARF frame. However, in the short term in function - // av1_bitstream.c::get_refresh_mask() we left it in the GF slot and, if - // we're updating the GF with the current decoded frame, we save it to the - // ARF slot instead. - // We now have to update the ARF with the current frame and swap gld_fb_idx - // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF - // slot and, if we're updating the GF, the current frame becomes the new GF. - int tmp; - - // ARF in general is a better reference than overlay. We shouldkeep ARF as - // reference instead of replacing it with overlay. - - if (!cpi->preserve_arf_as_gld) { - assign_frame_buffer( - pool->frame_bufs, - &cm->ref_frame_map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)], - cm->new_fb_idx); - } - - tmp = get_ref_frame_map_idx(cpi, ALTREF_FRAME); - cpi->remapped_ref_idx[ALTREF_FRAME - 1] = - get_ref_frame_map_idx(cpi, GOLDEN_FRAME); - cpi->remapped_ref_idx[GOLDEN_FRAME - 1] = tmp; - - // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to - // cpi->interp_filter_selected[GOLDEN_FRAME]? - } else if (cpi->rc.is_src_frame_ext_arf && encode_show_existing_frame(cm)) { -#if CONFIG_DEBUG - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE); -#endif -#if USE_SYMM_MULTI_LAYER - const int bwdref_to_show = - (cpi->new_bwdref_update_rule == 1) ? 
BWDREF_FRAME : ALTREF2_FRAME; -#else - const int bwdref_to_show = ALTREF2_FRAME; -#endif - // Deal with the special case for showing existing internal ALTREF_FRAME - // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME - // by updating the virtual indices. - const int last3_remapped_idx = get_ref_frame_map_idx(cpi, LAST3_FRAME); - shift_last_ref_frames(cpi); - - cpi->remapped_ref_idx[LAST_FRAME - 1] = - get_ref_frame_map_idx(cpi, bwdref_to_show); - - memcpy(cpi->interp_filter_selected[LAST_FRAME], - cpi->interp_filter_selected[bwdref_to_show], - sizeof(cpi->interp_filter_selected[bwdref_to_show])); -#if USE_SYMM_MULTI_LAYER - if (cpi->new_bwdref_update_rule == 1) { - lshift_bwd_ref_frames(cpi); - // pass outdated forward reference frame (previous LAST3) to the - // spared space - cpi->remapped_ref_idx[EXTREF_FRAME - 1] = last3_remapped_idx; - } else { -#endif - cpi->remapped_ref_idx[bwdref_to_show - 1] = last3_remapped_idx; -#if USE_SYMM_MULTI_LAYER - } -#endif - } else { /* For non key/golden frames */ - // === ALTREF_FRAME === - if (cpi->refresh_alt_ref_frame) { - int arf_idx = get_ref_frame_map_idx(cpi, ALTREF_FRAME); - assign_frame_buffer(pool->frame_bufs, &cm->ref_frame_map[arf_idx], - cm->new_fb_idx); - - memcpy(cpi->interp_filter_selected[ALTREF_FRAME], - cpi->interp_filter_selected[0], - sizeof(cpi->interp_filter_selected[0])); - } - - // === GOLDEN_FRAME === - if (cpi->refresh_golden_frame) { - assign_frame_buffer( - pool->frame_bufs, - &cm->ref_frame_map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)], - cm->new_fb_idx); - - memcpy(cpi->interp_filter_selected[GOLDEN_FRAME], - cpi->interp_filter_selected[0], - sizeof(cpi->interp_filter_selected[0])); - } - - // === BWDREF_FRAME === - if (cpi->refresh_bwd_ref_frame) { -#if USE_SYMM_MULTI_LAYER - if (cpi->new_bwdref_update_rule) { - // We shift the backward reference frame as follows: - // BWDREF -> ALTREF2 -> EXTREF - // and assign the newly coded frame to BWDREF so that it always - // keeps 
the nearest future frame - int tmp = get_ref_frame_map_idx(cpi, EXTREF_FRAME); - assign_frame_buffer(pool->frame_bufs, &cm->ref_frame_map[tmp], - cm->new_fb_idx); - - rshift_bwd_ref_frames(cpi); - cpi->remapped_ref_idx[BWDREF_FRAME - 1] = tmp; - } else { -#endif // USE_SYMM_MULTI_LAYER - assign_frame_buffer( - pool->frame_bufs, - &cm->ref_frame_map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)], - cm->new_fb_idx); -#if USE_SYMM_MULTI_LAYER - } -#endif - memcpy(cpi->interp_filter_selected[BWDREF_FRAME], - cpi->interp_filter_selected[0], - sizeof(cpi->interp_filter_selected[0])); - } - - // === ALTREF2_FRAME === - if (cpi->refresh_alt2_ref_frame) { - assign_frame_buffer( - pool->frame_bufs, - &cm->ref_frame_map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)], - cm->new_fb_idx); - - memcpy(cpi->interp_filter_selected[ALTREF2_FRAME], - cpi->interp_filter_selected[0], - sizeof(cpi->interp_filter_selected[0])); - } - } - - if (cpi->refresh_last_frame) { - // NOTE(zoeliu): We have two layers of mapping (1) from the per-frame - // reference to the reference frame buffer virtual index; and then (2) from - // the virtual index to the reference frame buffer physical index: - // - // LAST_FRAME, ..., EXTREF_FRAME - // | | - // v v - // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1] - // | | - // v v - // ref_frame_map[], ..., ref_frame_map[] - // - // When refresh_last_frame is set, it is intended to retire LAST3_FRAME, - // have the other 2 LAST reference frames shifted as follows: - // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME - // , and then have LAST_FRAME refreshed by the newly coded frame. - // - // To fulfill it, the decoder will be notified to execute following 2 steps: - // - // (a) To change ref_frame_map[] and have the virtual index of LAST3_FRAME - // to point to the newly coded frame, i.e. 
- // ref_frame_map[lst_fb_idexes[2]] => new_fb_idx; - // - // (b) To change the 1st layer mapping to have LAST_FRAME mapped to the - // original virtual index of LAST3_FRAME and have the other mappings - // shifted as follows: - // LAST_FRAME, LAST2_FRAME, LAST3_FRAME - // | | | - // v v v - // remapped_ref_idx[2], remapped_ref_idx[0], remapped_ref_idx[1] - assign_frame_buffer( - pool->frame_bufs, - &cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST3_FRAME)], - cm->new_fb_idx); - - int last3_remapped_idx = get_ref_frame_map_idx(cpi, LAST3_FRAME); - - shift_last_ref_frames(cpi); - cpi->remapped_ref_idx[LAST_FRAME - 1] = last3_remapped_idx; - - assert(!encode_show_existing_frame(cm)); - memcpy(cpi->interp_filter_selected[LAST_FRAME], - cpi->interp_filter_selected[0], - sizeof(cpi->interp_filter_selected[0])); - - // If the new structure is used, we will always have overlay frames coupled - // with bwdref frames. Therefore, we won't have to perform this update - // in advance (we do this update when the overlay frame shows up). -#if USE_SYMM_MULTI_LAYER - if (cpi->new_bwdref_update_rule == 0 && cpi->rc.is_last_bipred_frame) { -#else - if (cpi->rc.is_last_bipred_frame) { -#endif - // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the - // LAST3_FRAME by updating the virtual indices. - // - // NOTE: The source frame for BWDREF does not have a holding position as - // the OVERLAY frame for ALTREF's. Hence, to resolve the reference - // virtual index reshuffling for BWDREF, the encoder always - // specifies a LAST_BIPRED right before BWDREF and completes the - // reshuffling job accordingly. 
- last3_remapped_idx = get_ref_frame_map_idx(cpi, LAST3_FRAME); - - shift_last_ref_frames(cpi); - cpi->remapped_ref_idx[LAST_FRAME - 1] = - get_ref_frame_map_idx(cpi, BWDREF_FRAME); - cpi->remapped_ref_idx[BWDREF_FRAME - 1] = last3_remapped_idx; - - memcpy(cpi->interp_filter_selected[LAST_FRAME], - cpi->interp_filter_selected[BWDREF_FRAME], - sizeof(cpi->interp_filter_selected[BWDREF_FRAME])); - } - } - -#if DUMP_REF_FRAME_IMAGES == 1 - // Dump out all reference frame images. - dump_ref_frame_images(cpi); -#endif // DUMP_REF_FRAME_IMAGES -} - -static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) { - assert(buffer_idx != INVALID_IDX); - RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; - ensure_mv_buffer(new_fb_ptr, cm); - new_fb_ptr->width = cm->width; - new_fb_ptr->height = cm->height; -} - static void scale_references(AV1_COMP *cpi) { AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); @@ -3820,68 +3360,79 @@ static void scale_references(AV1_COMP *cpi) { if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) { BufferPool *const pool = cm->buffer_pool; const YV12_BUFFER_CONFIG *const ref = - get_ref_frame_buffer(cpi, ref_frame); + get_ref_frame_yv12_buf(cm, ref_frame); if (ref == NULL) { - cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; + cpi->scaled_ref_buf[ref_frame - 1] = NULL; continue; } if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { - RefCntBuffer *new_fb_ptr = NULL; + // Replace the reference buffer with a copy having a thicker border, + // if the reference buffer is higher resolution than the current + // frame, and the border is thin. 
+ if ((ref->y_crop_width > cm->width || + ref->y_crop_height > cm->height) && + ref->border < AOM_BORDER_IN_PIXELS) { + RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame); + if (aom_yv12_realloc_with_new_border( + &ref_fb->buf, AOM_BORDER_IN_PIXELS, cm->byte_alignment, + num_planes) != 0) { + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + } + } int force_scaling = 0; - int new_fb = cpi->scaled_ref_idx[ref_frame - 1]; - if (new_fb == INVALID_IDX) { - new_fb = get_free_fb(cm); - if (new_fb == INVALID_IDX) + RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1]; + if (new_fb == NULL) { + const int new_fb_idx = get_free_fb(cm); + if (new_fb_idx == INVALID_IDX) { aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Unable to find free frame buffer"); + } force_scaling = 1; + new_fb = &pool->frame_bufs[new_fb_idx]; } - new_fb_ptr = &pool->frame_bufs[new_fb]; - if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width || - new_fb_ptr->buf.y_crop_height != cm->height) { + + if (force_scaling || new_fb->buf.y_crop_width != cm->width || + new_fb->buf.y_crop_height != cm->height) { if (aom_realloc_frame_buffer( - &new_fb_ptr->buf, cm->width, cm->height, + &new_fb->buf, cm->width, cm->height, cm->seq_params.subsampling_x, cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) { if (force_scaling) { // Release the reference acquired in the get_free_fb() call above. 
- --new_fb_ptr->ref_count; + --new_fb->ref_count; } aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } av1_resize_and_extend_frame( - ref, &new_fb_ptr->buf, (int)cm->seq_params.bit_depth, num_planes); - cpi->scaled_ref_idx[ref_frame - 1] = new_fb; + ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes); + cpi->scaled_ref_buf[ref_frame - 1] = new_fb; alloc_frame_mvs(cm, new_fb); } } else { - const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); - RefCntBuffer *const buf = &pool->frame_bufs[buf_idx]; + RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame); buf->buf.y_crop_width = ref->y_crop_width; buf->buf.y_crop_height = ref->y_crop_height; - cpi->scaled_ref_idx[ref_frame - 1] = buf_idx; + cpi->scaled_ref_buf[ref_frame - 1] = buf; ++buf->ref_count; } } else { - if (cpi->oxcf.pass != 0) cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; + if (cpi->oxcf.pass != 0) cpi->scaled_ref_buf[ref_frame - 1] = NULL; } } } static void release_scaled_references(AV1_COMP *cpi) { - AV1_COMMON *cm = &cpi->common; - int i; // TODO(isbs): only refresh the necessary frames, rather than all of them - for (i = 0; i < INTER_REFS_PER_FRAME; ++i) { - const int idx = cpi->scaled_ref_idx[i]; - if (idx != INVALID_IDX) { - RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; + for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { + RefCntBuffer *const buf = cpi->scaled_ref_buf[i]; + if (buf != NULL) { --buf->ref_count; - cpi->scaled_ref_idx[i] = INVALID_IDX; + cpi->scaled_ref_buf[i] = NULL; } } } @@ -3911,6 +3462,71 @@ static void set_mv_search_params(AV1_COMP *cpi) { } } +static void set_screen_content_options(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + + if (cm->seq_params.force_screen_content_tools != 2) { + cm->allow_screen_content_tools = cm->allow_intrabc = + cm->seq_params.force_screen_content_tools; + return; + } + + if (cpi->oxcf.content == AOM_CONTENT_SCREEN) { + cm->allow_screen_content_tools = cm->allow_intrabc = 1; + 
return; + } + + // Estimate if the source frame is screen content, based on the portion of + // blocks that have few luma colors. + const uint8_t *src = cpi->source->y_buffer; + assert(src != NULL); + const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH; + const int stride = cpi->source->y_stride; + const int width = cpi->source->y_width; + const int height = cpi->source->y_height; + const int bd = cm->seq_params.bit_depth; + const int blk_w = 16; + const int blk_h = 16; + // These threshold values are selected experimentally. + const int color_thresh = 4; + const unsigned int var_thresh = 0; + // Counts of blocks with no more than color_thresh colors. + int counts_1 = 0; + // Counts of blocks with no more than color_thresh colors and variance larger + // than var_thresh. + int counts_2 = 0; + + for (int r = 0; r + blk_h <= height; r += blk_h) { + for (int c = 0; c + blk_w <= width; c += blk_w) { + int count_buf[1 << 12]; // Maximum (1 << 12) color levels. + const uint8_t *const this_src = src + r * stride + c; + const int n_colors = + use_hbd ? av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd, + count_buf) + : av1_count_colors(this_src, stride, blk_w, blk_h, count_buf); + if (n_colors > 1 && n_colors <= color_thresh) { + ++counts_1; + struct buf_2d buf; + buf.stride = stride; + buf.buf = (uint8_t *)this_src; + const unsigned int var = + use_hbd + ? av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16, bd) + : av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16); + if (var > var_thresh) ++counts_2; + } + } + } + + // The threshold values are selected experimentally. + cm->allow_screen_content_tools = + counts_1 * blk_h * blk_w * 10 > width * height; + // IntraBC would force loop filters off, so we use more strict rules that also + // requires that the block has high variance. 
+ cm->allow_intrabc = cm->allow_screen_content_tools && + counts_2 * blk_h * blk_w * 15 > width * height; +} + static void set_size_independent_vars(AV1_COMP *cpi) { int i; AV1_COMMON *cm = &cpi->common; @@ -3918,25 +3534,14 @@ static void set_size_independent_vars(AV1_COMP *cpi) { cm->global_motion[i] = default_warp_params; } cpi->global_motion_search_done = 0; - av1_set_speed_features_framesize_independent(cpi); + + if (frame_is_intra_only(cm)) set_screen_content_options(cpi); + cpi->is_screen_content_type = (cm->allow_screen_content_tools != 0); + + av1_set_speed_features_framesize_independent(cpi, cpi->speed); av1_set_rd_speed_thresholds(cpi); - av1_set_rd_speed_thresholds_sub8x8(cpi); cm->interp_filter = SWITCHABLE; cm->switchable_motion_mode = 1; - - if (frame_is_intra_only(cm)) { - if (cm->seq_params.force_screen_content_tools == 2) { - cm->allow_screen_content_tools = - cpi->oxcf.content == AOM_CONTENT_SCREEN || - is_screen_content(cpi->source->y_buffer, - cpi->source->flags & YV12_FLAG_HIGHBITDEPTH, - cm->seq_params.bit_depth, cpi->source->y_stride, - cpi->source->y_width, cpi->source->y_height); - } else { - cm->allow_screen_content_tools = - cm->seq_params.force_screen_content_tools; - } - } } static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, @@ -3945,7 +3550,7 @@ static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, const AV1EncoderConfig *const oxcf = &cpi->oxcf; // Setup variables that depend on the dimensions of the frame. - av1_set_speed_features_framesize_dependent(cpi); + av1_set_speed_features_framesize_dependent(cpi, cpi->speed); // Decide q and q bounds. *q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, bottom_index, @@ -3966,11 +3571,17 @@ static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index, static void init_motion_estimation(AV1_COMP *cpi) { int y_stride = cpi->scaled_source.y_stride; + int y_stride_src = (cpi->oxcf.resize_mode || cpi->oxcf.superres_mode) + ? 
y_stride + : cpi->lookahead->buf->img.y_stride; if (cpi->sf.mv.search_method == NSTEP) { - av1_init3smotion_compensation(&cpi->ss_cfg, y_stride); + av1_init3smotion_compensation(&cpi->ss_cfg[SS_CFG_SRC], y_stride); + av1_init3smotion_compensation(&cpi->ss_cfg[SS_CFG_LOOKAHEAD], y_stride_src); } else if (cpi->sf.mv.search_method == DIAMOND) { - av1_init_dsmotion_compensation(&cpi->ss_cfg, y_stride); + av1_init_dsmotion_compensation(&cpi->ss_cfg[SS_CFG_SRC], y_stride); + av1_init_dsmotion_compensation(&cpi->ss_cfg[SS_CFG_LOOKAHEAD], + y_stride_src); } } @@ -3999,10 +3610,9 @@ static void init_ref_frame_bufs(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; int i; BufferPool *const pool = cm->buffer_pool; - cm->new_fb_idx = INVALID_IDX; cm->cur_frame = NULL; for (i = 0; i < REF_FRAMES; ++i) { - cm->ref_frame_map[i] = INVALID_IDX; + cm->ref_frame_map[i] = NULL; } for (i = 0; i < FRAME_BUFFERS; ++i) { pool->frame_bufs[i].ref_count = 0; @@ -4064,7 +3674,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) { return 0; } -static void set_frame_size(AV1_COMP *cpi, int width, int height) { +void av1_set_frame_size(AV1_COMP *cpi, int width, int height) { AV1_COMMON *const cm = &cpi->common; const SequenceHeader *const seq_params = &cm->seq_params; const int num_planes = av1_num_planes(cm); @@ -4083,7 +3693,7 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) { av1_set_target_rate(cpi, cm->width, cm->height); } - alloc_frame_mvs(cm, cm->new_fb_idx); + alloc_frame_mvs(cm, cm->cur_frame); // Allocate above context buffers if (cm->num_allocated_above_context_planes < av1_num_planes(cm) || @@ -4099,7 +3709,7 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) { if (aom_realloc_frame_buffer( &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL)) + cpi->oxcf.border_in_pixels, cm->byte_alignment, 
NULL, NULL, NULL)) aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); @@ -4116,20 +3726,13 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) { init_motion_estimation(cpi); for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - RefBuffer *const ref_buf = - &cm->current_frame.frame_refs[ref_frame - LAST_FRAME]; - const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); - - if (buf_idx != INVALID_IDX) { - RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[buf_idx]; - ref_buf->buf = buf; - av1_setup_scale_factors_for_frame(&ref_buf->sf, buf->buf.y_crop_width, + RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf != NULL) { + struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame); + av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width, buf->buf.y_crop_height, cm->width, cm->height); - if (av1_is_scaled(&ref_buf->sf)) - aom_extend_frame_borders(&buf->buf, num_planes); - } else { - ref_buf->buf = NULL; + if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes); } } @@ -4161,24 +3764,33 @@ static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) { return new_denom; } -#define ENERGY_BY_Q2_THRESH 0.015 +#define ENERGY_BY_Q2_THRESH 0.01 +#define ENERGY_BY_AC_THRESH 0.2 static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy, - double thresh) { + double threshq, + double threshp) { const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8); - const double threshq2 = thresh * q * q; + const double tq = threshq * q * q; + const double tp = threshp * energy[1]; + const double thresh = AOMMIN(tq, tp); int k; - for (k = 8; k > 0; --k) { - if (energy[k - 1] > threshq2) break; + for (k = 16; k > 8; --k) { + if (energy[k - 1] > thresh) break; } - return 2 * SCALE_NUMERATOR - k; + return 3 * SCALE_NUMERATOR - k; } static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex) { - double energy[8]; + double 
energy[16]; analyze_hor_freq(cpi, energy); - return get_superres_denom_from_qindex_energy(qindex, energy, - ENERGY_BY_Q2_THRESH); + /* + printf("\nenergy = ["); + for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]); + printf("]\n"); + */ + return get_superres_denom_from_qindex_energy( + qindex, energy, ENERGY_BY_Q2_THRESH, ENERGY_BY_AC_THRESH); } static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { @@ -4216,25 +3828,31 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) { const int qthresh = (frame_is_intra_only(&cpi->common)) ? oxcf->superres_kf_qthresh : oxcf->superres_qthresh; - if (q < qthresh) { + if (q <= qthresh) { new_denom = SCALE_NUMERATOR; } else { - // TODO(debargha): Experiment with the variant below. - // new_denom = get_superres_denom_for_qindex(cpi, q); - uint8_t max_denom = get_superres_denom_for_qindex(cpi, MAXQ); - if (max_denom == SCALE_NUMERATOR) { - new_denom = max_denom; - break; - } else { - const uint8_t q_denom_step = - max_denom - SCALE_NUMERATOR == 0 - ? 255 - : (MAXQ - qthresh + 1 + max_denom - SCALE_NUMERATOR - 1) / - (max_denom - SCALE_NUMERATOR); - const uint8_t additional_denom = - (q - qthresh + 1 + q_denom_step - 1) / q_denom_step; - new_denom = AOMMIN(SCALE_NUMERATOR + additional_denom, max_denom); - } + new_denom = get_superres_denom_for_qindex(cpi, q); + } + break; + } + case SUPERRES_AUTO: { + // Don't use when screen content tools are used. + if (cpi->common.allow_screen_content_tools) break; + // Don't use for inter frames. + if (!frame_is_intra_only(&cpi->common)) break; + // Don't use for keyframes that can be used as references. + if (cpi->rc.frames_to_key != 1) break; + + // Now decide the use of superres based on 'q'. 
+ int bottom_index, top_index; + const int q = av1_rc_pick_q_and_bounds( + cpi, cpi->oxcf.width, cpi->oxcf.height, &bottom_index, &top_index); + + const int qthresh = 128; + if (q <= qthresh) { + new_denom = SCALE_NUMERATOR; + } else { + new_denom = get_superres_denom_for_qindex(cpi, q); } break; } @@ -4311,7 +3929,7 @@ static int validate_size_scales(RESIZE_MODE resize_mode, } // Calculates resize and superres params for next frame -size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) { +static size_params_type calculate_next_size_params(AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR }; int resize_denom; @@ -4334,7 +3952,8 @@ size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) { return rsz; } -static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) { +static void setup_frame_size_from_params(AV1_COMP *cpi, + const size_params_type *rsz) { int encode_width = rsz->resize_width; int encode_height = rsz->resize_height; @@ -4344,12 +3963,17 @@ static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) { cm->superres_scale_denominator = rsz->superres_denom; av1_calculate_scaled_superres_size(&encode_width, &encode_height, rsz->superres_denom); - set_frame_size(cpi, encode_width, encode_height); + av1_set_frame_size(cpi, encode_width, encode_height); } -static void setup_frame_size(AV1_COMP *cpi) { - size_params_type rsz = av1_calculate_next_size_params(cpi); +void av1_setup_frame_size(AV1_COMP *cpi) { + AV1_COMMON *cm = &cpi->common; + // Reset superres params from previous frame. 
+ cm->superres_scale_denominator = SCALE_NUMERATOR; + const size_params_type rsz = calculate_next_size_params(cpi); setup_frame_size_from_params(cpi, &rsz); + + assert(is_min_tile_width_satisfied(cm)); } static void superres_post_encode(AV1_COMP *cpi) { @@ -4398,237 +4022,431 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { assert(IMPLIES(is_lossless_requested(&cpi->oxcf), cm->coded_lossless && cm->all_lossless)); - const int no_loopfilter = cm->coded_lossless || cm->large_scale_tile; - const int no_cdef = - !cm->seq_params.enable_cdef || cm->coded_lossless || cm->large_scale_tile; - const int no_restoration = !cm->seq_params.enable_restoration || - cm->all_lossless || cm->large_scale_tile; + const int use_loopfilter = !cm->coded_lossless && !cm->large_scale_tile; + const int use_cdef = cm->seq_params.enable_cdef && !cm->coded_lossless && + !cm->large_scale_tile; + const int use_restoration = cm->seq_params.enable_restoration && + !cm->all_lossless && !cm->large_scale_tile; struct loopfilter *lf = &cm->lf; - if (no_loopfilter) { - lf->filter_level[0] = 0; - lf->filter_level[1] = 0; - } else { - struct aom_usec_timer timer; - +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loop_filter_time); +#endif + if (use_loopfilter) { aom_clear_system_state(); - - aom_usec_timer_start(&timer); - av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_pick); - - aom_usec_timer_mark(&timer); - cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer); + } else { + lf->filter_level[0] = 0; + lf->filter_level[1] = 0; } if (lf->filter_level[0] || lf->filter_level[1]) { if (cpi->num_workers > 1) - av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0, + av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0, #if LOOP_FILTER_BITMASK 0, #endif cpi->workers, cpi->num_workers, &cpi->lf_row_sync); else - av1_loop_filter_frame(cm->frame_to_show, cm, xd, + av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd, #if LOOP_FILTER_BITMASK 0, #endif 0, 
num_planes, 0); } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loop_filter_time); +#endif - if (!no_restoration) - av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 0); + if (use_restoration) + av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0); - if (no_cdef) { - cm->cdef_info.cdef_bits = 0; - cm->cdef_info.cdef_strengths[0] = 0; - cm->cdef_info.nb_cdef_strengths = 1; - cm->cdef_info.cdef_uv_strengths[0] = 0; - } else { + if (use_cdef) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, cdef_time); +#endif // Find CDEF parameters - av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd, + av1_cdef_search(&cm->cur_frame->buf, cpi->source, cm, xd, cpi->sf.fast_cdef_search); // Apply the filter - av1_cdef_frame(cm->frame_to_show, cm, xd); + av1_cdef_frame(&cm->cur_frame->buf, cm, xd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, cdef_time); +#endif + } else { + cm->cdef_info.cdef_bits = 0; + cm->cdef_info.cdef_strengths[0] = 0; + cm->cdef_info.nb_cdef_strengths = 1; + cm->cdef_info.cdef_uv_strengths[0] = 0; } superres_post_encode(cpi); - if (no_restoration) { - cm->rst_info[0].frame_restoration_type = RESTORE_NONE; - cm->rst_info[1].frame_restoration_type = RESTORE_NONE; - cm->rst_info[2].frame_restoration_type = RESTORE_NONE; - } else { - av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 1); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loop_restoration_time); +#endif + if (use_restoration) { + av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1); av1_pick_filter_restoration(cpi->source, cpi); if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE || cm->rst_info[1].frame_restoration_type != RESTORE_NONE || cm->rst_info[2].frame_restoration_type != RESTORE_NONE) { if (cpi->num_workers > 1) - av1_loop_restoration_filter_frame_mt(cm->frame_to_show, cm, 0, + av1_loop_restoration_filter_frame_mt(&cm->cur_frame->buf, cm, 0, cpi->workers, cpi->num_workers, 
&cpi->lr_row_sync, &cpi->lr_ctxt); else - av1_loop_restoration_filter_frame(cm->frame_to_show, cm, 0, + av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0, &cpi->lr_ctxt); } + } else { + cm->rst_info[0].frame_restoration_type = RESTORE_NONE; + cm->rst_info[1].frame_restoration_type = RESTORE_NONE; + cm->rst_info[2].frame_restoration_type = RESTORE_NONE; } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loop_restoration_time); +#endif } -static int encode_without_recode_loop(AV1_COMP *cpi) { +static void fix_interp_filter(InterpFilter *const interp_filter, + const FRAME_COUNTS *const counts) { + if (*interp_filter == SWITCHABLE) { + // Check to see if only one of the filters is actually used + int count[SWITCHABLE_FILTERS] = { 0 }; + int num_filters_used = 0; + for (int i = 0; i < SWITCHABLE_FILTERS; ++i) { + for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) + count[i] += counts->switchable_interp[j][i]; + num_filters_used += (count[i] > 0); + } + if (num_filters_used == 1) { + // Only one filter is used. So set the filter at frame level + for (int i = 0; i < SWITCHABLE_FILTERS; ++i) { + if (count[i]) { + if (i == EIGHTTAP_REGULAR) *interp_filter = i; + break; + } + } + } + } +} + +static void finalize_encoded_frame(AV1_COMP *const cpi) { AV1_COMMON *const cm = &cpi->common; - int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. 
+ CurrentFrame *const current_frame = &cm->current_frame; - aom_clear_system_state(); + if (!cm->seq_params.reduced_still_picture_hdr && + encode_show_existing_frame(cm)) { + RefCntBuffer *const frame_to_show = + cm->ref_frame_map[cpi->existing_fb_idx_to_show]; - set_size_independent_vars(cpi); + if (frame_to_show == NULL) { + aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM, + "Buffer does not contain a reconstructed frame"); + } + assert(frame_to_show->ref_count > 0); + assign_frame_buffer_p(&cm->cur_frame, frame_to_show); + } - setup_frame_size(cpi); + if (!encode_show_existing_frame(cm) && + cm->seq_params.film_grain_params_present && + (cm->show_frame || cm->showable_frame)) { + // Copy the current frame's film grain params to the its corresponding + // RefCntBuffer slot. + cm->cur_frame->film_grain_params = cm->film_grain_params; - assert(cm->width == cpi->scaled_source.y_crop_width); - assert(cm->height == cpi->scaled_source.y_crop_height); + // We must update the parameters if this is not an INTER_FRAME + if (current_frame->frame_type != INTER_FRAME) + cm->cur_frame->film_grain_params.update_parameters = 1; - set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + // Iterate the random seed for the next frame. 
+ cm->film_grain_params.random_seed += 3381; + if (cm->film_grain_params.random_seed == 0) + cm->film_grain_params.random_seed = 7391; + } - cpi->source = - av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source); - if (cpi->unscaled_last_source != NULL) - cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, - &cpi->scaled_last_source); - cpi->source->buf_8bit_valid = 0; - if (frame_is_intra_only(cm) == 0) { - scale_references(cpi); + // Initialise all tiles' contexts from the global frame context + for (int tile_col = 0; tile_col < cm->tile_cols; tile_col++) { + for (int tile_row = 0; tile_row < cm->tile_rows; tile_row++) { + const int tile_idx = tile_row * cm->tile_cols + tile_col; + cpi->tile_data[tile_idx].tctx = *cm->fc; + } } - av1_set_quantizer(cm, q); - setup_frame(cpi); - suppress_active_map(cpi); + fix_interp_filter(&cm->interp_filter, cpi->td.counts); +} + +static int get_regulated_q_overshoot(AV1_COMP *const cpi, int q_low, int q_high, + int top_index, int bottom_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + + int q_regulated = + av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index), cm->width, cm->height); - // Variance adaptive and in frame q adjustment experiments are mutually - // exclusive. 
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) { - av1_vaq_frame_setup(cpi); - } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { - av1_setup_in_frame_q_adj(cpi); - } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - av1_cyclic_refresh_setup(cpi); + int retries = 0; + while (q_regulated < q_low && retries < 10) { + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + q_regulated = + av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + AOMMAX(q_high, top_index), cm->width, cm->height); + retries++; } - apply_active_map(cpi); - if (cm->seg.enabled) { - if (!cm->seg.update_data && cm->prev_frame) { - segfeatures_copy(&cm->seg, &cm->prev_frame->seg); + return q_regulated; +} + +static int get_regulated_q_undershoot(AV1_COMP *const cpi, int q_high, + int top_index, int bottom_index) { + const AV1_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + top_index, cm->width, cm->height); + + int retries = 0; + while (q_regulated > q_high && retries < 10) { + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, + top_index, cm->width, cm->height); + retries++; + } + return q_regulated; +} + +// Called after encode_with_recode_loop() has just encoded a frame and packed +// its bitstream. This function works out whether we under- or over-shot +// our bitrate target and adjusts q as appropriate. 
Also decides whether +// or not we should do another recode loop, indicated by *loop +static void recode_loop_update_q(AV1_COMP *const cpi, int *const loop, + int *const q, int *const q_low, + int *const q_high, const int top_index, + const int bottom_index, + int *const undershoot_seen, + int *const overshoot_seen, + const int loop_at_this_size) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + + int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0; + av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, + &frame_under_shoot_limit, + &frame_over_shoot_limit); + if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; + + if ((cm->current_frame.frame_type == KEY_FRAME) && + rc->this_key_frame_forced && + (rc->projected_frame_size < rc->max_frame_bandwidth)) { + int last_q = *q; + int64_t kf_err; + + int64_t high_err_target = cpi->ambient_err; + int64_t low_err_target = cpi->ambient_err >> 1; + + if (cm->seq_params.use_highbitdepth) { + kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); } else { - calculate_segdata(&cm->seg); + kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); + } + // Prevent possible divide by zero error below for perfect KF + kf_err += !kf_err; + + // The key frame is not good enough or we can afford + // to make it better without undue risk of popping. + if ((kf_err > high_err_target && + rc->projected_frame_size <= frame_over_shoot_limit) || + (kf_err > low_err_target && + rc->projected_frame_size <= frame_under_shoot_limit)) { + // Lower q_high + *q_high = *q > *q_low ? *q - 1 : *q_low; + + // Adjust Q + *q = (int)((*q * high_err_target) / kf_err); + *q = AOMMIN(*q, (*q_high + *q_low) >> 1); + } else if (kf_err < low_err_target && + rc->projected_frame_size >= frame_under_shoot_limit) { + // The key frame is much better than the previous frame + // Raise q_low + *q_low = *q < *q_high ? 
*q + 1 : *q_high; + + // Adjust Q + *q = (int)((*q * low_err_target) / kf_err); + *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1); } - } else { - memset(&cm->seg, 0, sizeof(cm->seg)); - } - segfeatures_copy(&cm->cur_frame->seg, &cm->seg); - // transform / motion compensation build reconstruction frame - av1_encode_frame(cpi); + // Clamp Q to upper and lower limits: + *q = clamp(*q, *q_low, *q_high); + + *loop = *q != last_q; + } else if (recode_loop_test(cpi, frame_over_shoot_limit, + frame_under_shoot_limit, *q, + AOMMAX(*q_high, top_index), bottom_index)) { + // Is the projected frame size out of range and are we allowed + // to attempt to recode. + int last_q = *q; + + // Frame size out of permitted range: + // Update correction factor & compute new Q to try... + // Frame is too large + if (rc->projected_frame_size > rc->this_frame_target) { + // Special case if the projected size is > the max allowed. + if (rc->projected_frame_size >= rc->max_frame_bandwidth) + *q_high = rc->worst_quality; + + // Raise Qlow as to at least the current value + *q_low = *q < *q_high ? *q + 1 : *q_high; + + if (*undershoot_seen || loop_at_this_size > 2 || + (loop_at_this_size == 2 && !frame_is_intra_only(cm))) { + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + + *q = (*q_high + *q_low + 1) / 2; + } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) { + const int q_mid = (*q_high + *q_low + 1) / 2; + const int q_regulated = get_regulated_q_overshoot( + cpi, *q_low, *q_high, top_index, bottom_index); + // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth + // transition between loop_at_this_size < 2 and loop_at_this_size > 2. + *q = (q_mid + q_regulated + 1) / 2; + } else { + *q = get_regulated_q_overshoot(cpi, *q_low, *q_high, top_index, + bottom_index); + } - // Update some stats from cyclic refresh, and check if we should not update - // golden reference, for 1 pass CBR. 
- if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && - cm->current_frame.frame_type != KEY_FRAME && - (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR)) - av1_cyclic_refresh_check_golden_update(cpi); + *overshoot_seen = 1; + } else { + // Frame is too small + *q_high = *q > *q_low ? *q - 1 : *q_low; + + if (*overshoot_seen || loop_at_this_size > 2 || + (loop_at_this_size == 2 && !frame_is_intra_only(cm))) { + av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); + *q = (*q_high + *q_low) / 2; + } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) { + const int q_mid = (*q_high + *q_low) / 2; + const int q_regulated = + get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index); + // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth + // transition between loop_at_this_size < 2 and loop_at_this_size > 2. + *q = (q_mid + q_regulated) / 2; + + // Special case reset for qlow for constrained quality. + // This should only trigger where there is very substantial + // undershoot on a frame and the auto cq level is above + // the user passsed in value. + if (cpi->oxcf.rc_mode == AOM_CQ && q_regulated < *q_low) { + *q_low = *q; + } + } else { + *q = get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index); + + // Special case reset for qlow for constrained quality. + // This should only trigger where there is very substantial + // undershoot on a frame and the auto cq level is above + // the user passsed in value. + if (cpi->oxcf.rc_mode == AOM_CQ && *q < *q_low) { + *q_low = *q; + } + } - // Update the skip mb flag probabilities based on the distribution - // seen in the last encoder iteration. 
- // update_base_skip_probs(cpi); - aom_clear_system_state(); - return AOM_CODEC_OK; + *undershoot_seen = 1; + } + + // Clamp Q to upper and lower limits: + *q = clamp(*q, *q_low, *q_high); + + *loop = (*q != last_q); + } else { + *loop = 0; + } } static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; - int bottom_index, top_index; - int loop_count = 0; - int loop_at_this_size = 0; - int loop = 0; - int overshoot_seen = 0; - int undershoot_seen = 0; - int frame_over_shoot_limit; - int frame_under_shoot_limit; - int q = 0, q_low = 0, q_high = 0; + const int allow_recode = cpi->sf.recode_loop != DISALLOW_RECODE; set_size_independent_vars(cpi); cpi->source->buf_8bit_valid = 0; - aom_clear_system_state(); + av1_setup_frame_size(cpi); - setup_frame_size(cpi); + int top_index = 0, bottom_index = 0; + int q = 0, q_low = 0, q_high = 0; set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + q_low = bottom_index; + q_high = top_index; + // Loop variables + int loop_count = 0; + int loop_at_this_size = 0; + int loop = 0; + int overshoot_seen = 0; + int undershoot_seen = 0; + +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame:"); +#endif do { aom_clear_system_state(); - if (loop_count == 0) { - // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. - set_mv_search_params(cpi); - - // Reset the loop state for new frame size. - overshoot_seen = 0; - undershoot_seen = 0; - - q_low = bottom_index; - q_high = top_index; - - loop_at_this_size = 0; - - // Decide frame size bounds first time through. 
- av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target, - &frame_under_shoot_limit, - &frame_over_shoot_limit); - } - // if frame was scaled calculate global_motion_search again if already // done - if (loop_count > 0 && cpi->source && cpi->global_motion_search_done) + if (loop_count > 0 && cpi->source && cpi->global_motion_search_done) { if (cpi->source->y_crop_width != cm->width || - cpi->source->y_crop_height != cm->height) + cpi->source->y_crop_height != cm->height) { cpi->global_motion_search_done = 0; + } + } cpi->source = av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source); - if (cpi->unscaled_last_source != NULL) + if (cpi->unscaled_last_source != NULL) { cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source, &cpi->scaled_last_source); + } - if (frame_is_intra_only(cm) == 0) { + if (!frame_is_intra_only(cm)) { if (loop_count > 0) { release_scaled_references(cpi); } scale_references(cpi); } av1_set_quantizer(cm, q); + av1_init_quantizer(cpi); + + av1_set_variance_partition_thresholds(cpi, q, 0); + // printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n", // cm->current_frame.frame_number, cm->show_frame, q, // cm->current_frame.frame_type, cm->superres_scale_denominator); - if (loop_count == 0) setup_frame(cpi); - - // Base q-index may have changed, so we need to assign proper default coef - // probs before every iteration. - if (cm->primary_ref_frame == PRIMARY_REF_NONE || - cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) { + if (loop_count == 0) { + setup_frame(cpi); + } else if (get_primary_ref_frame_buf(cm) == NULL) { + // Base q-index may have changed, so we need to assign proper default coef + // probs before every iteration. av1_default_coef_probs(cm); av1_setup_frame_contexts(cm); } - // Variance adaptive and in frame q adjustment experiments are mutually - // exclusive. 
if (cpi->oxcf.aq_mode == VARIANCE_AQ) { av1_vaq_frame_setup(cpi); } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { av1_setup_in_frame_q_adj(cpi); + } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !allow_recode) { + suppress_active_map(cpi); + av1_cyclic_refresh_setup(cpi); + apply_active_map(cpi); } + if (cm->seg.enabled) { if (!cm->seg.update_data && cm->prev_frame) { segfeatures_copy(&cm->seg, &cm->prev_frame->seg); @@ -4640,13 +4458,15 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { } segfeatures_copy(&cm->cur_frame->seg, &cm->seg); + if (allow_recode) save_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_encode_frame_time); +#endif // transform / motion compensation build reconstruction frame - save_coding_context(cpi); av1_encode_frame(cpi); - - // Update the skip mb flag probabilities based on the distribution - // seen in the last encoder iteration. - // update_base_skip_probs(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_encode_frame_time); +#endif aom_clear_system_state(); @@ -4656,141 +4476,20 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) { restore_coding_context(cpi); - if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK) + finalize_encoded_frame(cpi); + int largest_tile_id = 0; // Output from bitstream: unused here + if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK) return AOM_CODEC_ERROR; rc->projected_frame_size = (int)(*size) << 3; restore_coding_context(cpi); - - if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } - if (cpi->oxcf.rc_mode == AOM_Q) { - loop = 0; - } else { - if ((cm->current_frame.frame_type == KEY_FRAME) && - rc->this_key_frame_forced && - (rc->projected_frame_size < rc->max_frame_bandwidth)) { - int last_q = q; - int64_t kf_err; - - int64_t high_err_target = cpi->ambient_err; - int64_t low_err_target = 
cpi->ambient_err >> 1; - - if (cm->seq_params.use_highbitdepth) { - kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf); - } else { - kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf); - } - // Prevent possible divide by zero error below for perfect KF - kf_err += !kf_err; - - // The key frame is not good enough or we can afford - // to make it better without undue risk of popping. - if ((kf_err > high_err_target && - rc->projected_frame_size <= frame_over_shoot_limit) || - (kf_err > low_err_target && - rc->projected_frame_size <= frame_under_shoot_limit)) { - // Lower q_high - q_high = q > q_low ? q - 1 : q_low; - - // Adjust Q - q = (int)((q * high_err_target) / kf_err); - q = AOMMIN(q, (q_high + q_low) >> 1); - } else if (kf_err < low_err_target && - rc->projected_frame_size >= frame_under_shoot_limit) { - // The key frame is much better than the previous frame - // Raise q_low - q_low = q < q_high ? q + 1 : q_high; - - // Adjust Q - q = (int)((q * low_err_target) / kf_err); - q = AOMMIN(q, (q_high + q_low + 1) >> 1); - } - - // Clamp Q to upper and lower limits: - q = clamp(q, q_low, q_high); - - loop = q != last_q; - } else if (recode_loop_test(cpi, frame_over_shoot_limit, - frame_under_shoot_limit, q, - AOMMAX(q_high, top_index), bottom_index)) { - // Is the projected frame size out of range and are we allowed - // to attempt to recode. - int last_q = q; - int retries = 0; - - // Frame size out of permitted range: - // Update correction factor & compute new Q to try... - // Frame is too large - if (rc->projected_frame_size > rc->this_frame_target) { - // Special case if the projected size is > the max allowed. - if (rc->projected_frame_size >= rc->max_frame_bandwidth) - q_high = rc->worst_quality; - - // Raise Qlow as to at least the current value - q_low = q < q_high ? 
q + 1 : q_high; - - if (undershoot_seen || loop_at_this_size > 1) { - // Update rate_correction_factor unless - av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); - - q = (q_high + q_low + 1) / 2; - } else { - // Update rate_correction_factor unless - av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); - - q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - AOMMAX(q_high, top_index), cm->width, - cm->height); - - while (q < q_low && retries < 10) { - av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); - q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - AOMMAX(q_high, top_index), cm->width, - cm->height); - retries++; - } - } - - overshoot_seen = 1; - } else { - // Frame is too small - q_high = q > q_low ? q - 1 : q_low; - - if (overshoot_seen || loop_at_this_size > 1) { - av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); - q = (q_high + q_low) / 2; - } else { - av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); - q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index, cm->width, cm->height); - // Special case reset for qlow for constrained quality. - // This should only trigger where there is very substantial - // undershoot on a frame and the auto cq level is above - // the user passsed in value. 
- if (cpi->oxcf.rc_mode == AOM_CQ && q < q_low) { - q_low = q; - } - - while (q > q_high && retries < 10) { - av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height); - q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index, cm->width, cm->height); - retries++; - } - } - - undershoot_seen = 1; - } - - // Clamp Q to upper and lower limits: - q = clamp(q, q_low, q_high); - - loop = (q != last_q); - } else { - loop = 0; - } + if (allow_recode && cpi->oxcf.rc_mode != AOM_Q) { + // Update q and decide whether to do a recode loop + recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index, + bottom_index, &undershoot_seen, &overshoot_seen, + loop_at_this_size); } // Special case for overlay frame. @@ -4798,8 +4497,9 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { rc->projected_frame_size < rc->max_frame_bandwidth) loop = 0; - if (!cpi->sf.gm_disable_recode) { - if (recode_loop_test_global_motion(cpi)) loop = 1; + if (allow_recode && !cpi->sf.gm_disable_recode && + recode_loop_test_global_motion(cpi)) { + loop = 1; } if (loop) { @@ -4810,127 +4510,14 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) { ++cpi->tot_recode_hits; #endif } +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif } while (loop); return AOM_CODEC_OK; } -static int get_ref_frame_flags(const AV1_COMP *cpi) { - const int *const map = cpi->common.ref_frame_map; - - // No.1 Priority: LAST_FRAME - const int last2_is_last = - map[cpi->remapped_ref_idx[1]] == map[cpi->remapped_ref_idx[0]]; - const int last3_is_last = - map[cpi->remapped_ref_idx[2]] == map[cpi->remapped_ref_idx[0]]; - const int gld_is_last = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] == - map[cpi->remapped_ref_idx[0]]; - const int bwd_is_last = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] == - map[cpi->remapped_ref_idx[0]]; - const int alt2_is_last = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] == - 
map[cpi->remapped_ref_idx[0]]; - const int alt_is_last = map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)] == - map[cpi->remapped_ref_idx[0]]; - - // No.2 Priority: ALTREF_FRAME - const int last2_is_alt = map[cpi->remapped_ref_idx[1]] == - map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)]; - const int last3_is_alt = map[cpi->remapped_ref_idx[2]] == - map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)]; - const int gld_is_alt = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] == - map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)]; - const int bwd_is_alt = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] == - map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)]; - const int alt2_is_alt = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] == - map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)]; - - // No.3 Priority: LAST2_FRAME - const int last3_is_last2 = - map[cpi->remapped_ref_idx[2]] == map[cpi->remapped_ref_idx[1]]; - const int gld_is_last2 = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] == - map[cpi->remapped_ref_idx[1]]; - const int bwd_is_last2 = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] == - map[cpi->remapped_ref_idx[1]]; - const int alt2_is_last2 = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] == - map[cpi->remapped_ref_idx[1]]; - - // No.4 Priority: LAST3_FRAME - const int gld_is_last3 = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] == - map[cpi->remapped_ref_idx[2]]; - const int bwd_is_last3 = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] == - map[cpi->remapped_ref_idx[2]]; - const int alt2_is_last3 = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] == - map[cpi->remapped_ref_idx[2]]; - - // No.5 Priority: GOLDEN_FRAME - const int bwd_is_gld = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] == - map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)]; - const int alt2_is_gld = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] == - map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)]; - - // No.6 Priority: BWDREF_FRAME - const int alt2_is_bwd = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] == - 
map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)]; - - // No.7 Priority: ALTREF2_FRAME - - // After av1_apply_encoding_flags() is called, cpi->ref_frame_flags might be - // adjusted according to external encoder flags. - int flags = cpi->ext_ref_frame_flags; - - if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG; - - if (alt_is_last) flags &= ~AOM_ALT_FLAG; - - if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG; - - if (last3_is_last || last3_is_alt || last3_is_last2) flags &= ~AOM_LAST3_FLAG; - - if (gld_is_last || gld_is_alt || gld_is_last2 || gld_is_last3) - flags &= ~AOM_GOLD_FLAG; - - if ((bwd_is_last || bwd_is_alt || bwd_is_last2 || bwd_is_last3 || - bwd_is_gld) && - (flags & AOM_BWD_FLAG)) - flags &= ~AOM_BWD_FLAG; - - if ((alt2_is_last || alt2_is_alt || alt2_is_last2 || alt2_is_last3 || - alt2_is_gld || alt2_is_bwd) && - (flags & AOM_ALT2_FLAG)) - flags &= ~AOM_ALT2_FLAG; - - return flags; -} - -static void set_ext_overrides(AV1_COMP *cpi) { - // Overrides the defaults with the externally supplied values with - // av1_update_reference() and av1_update_entropy() calls - // Note: The overrides are valid only for the next frame passed - // to encode_frame_to_data_rate() function - if (cpi->ext_use_s_frame) cpi->common.current_frame.frame_type = S_FRAME; - cpi->common.force_primary_ref_none = cpi->ext_use_primary_ref_none; - - if (cpi->ext_refresh_frame_context_pending) { - cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context; - cpi->ext_refresh_frame_context_pending = 0; - } - if (cpi->ext_refresh_frame_flags_pending) { - cpi->refresh_last_frame = cpi->ext_refresh_last_frame; - cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame; - cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame; - cpi->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame; - cpi->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame; - cpi->ext_refresh_frame_flags_pending = 0; - } - cpi->common.allow_ref_frame_mvs = 
cpi->ext_use_ref_frame_mvs; - // A keyframe is already error resilient and keyframes with - // error_resilient_mode interferes with the use of show_existing_frame - // when forward reference keyframes are enabled. - cpi->common.error_resilient_mode = - cpi->ext_use_error_resilient && - cpi->common.current_frame.frame_type != KEY_FRAME; -} - #define DUMP_RECON_FRAMES 0 #if DUMP_RECON_FRAMES == 1 @@ -4938,7 +4525,7 @@ static void set_ext_overrides(AV1_COMP *cpi) { static void dump_filtered_recon_frames(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; const CurrentFrame *const current_frame = &cm->current_frame; - const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show; + const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf; if (recon_buf == NULL) { printf("Frame %d is not ready.\n", current_frame->frame_number); @@ -4960,12 +4547,10 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) { current_frame->frame_number, current_frame->order_hint, cm->show_frame, cm->show_existing_frame); for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - RefBuffer *buf = &cm->current_frame.frame_refs[ref_frame - LAST_FRAME]; - const int ref_offset = (buf->buf) ? (int)buf->buf->order_hint : -1; - printf(" %d(%c-%d-%4.2f)", ref_offset, - (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N', - (buf->buf) ? (int)buf->buf->frame_rf_level : -1, - (buf->buf) ? rate_factor_deltas[buf->buf->frame_rf_level] : -1); + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + const int ref_offset = buf != NULL ? (int)buf->order_hint : -1; + printf(" %d(%c)", ref_offset, + (cpi->ref_frame_flags & flag_list[ref_frame]) ? 
'Y' : 'N'); } printf(" ]\n"); @@ -4993,25 +4578,18 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) { printf( "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, " "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, " - "refresh_alt_ref_frame=%d, rf_level=%d, " + "refresh_alt_ref_frame=%d, " "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n", current_frame->frame_number, cpi->twopass.gf_group.index, cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index], current_frame->order_hint, cm->show_frame, cm->show_existing_frame, cpi->rc.source_alt_ref_active, cpi->refresh_alt_ref_frame, - cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index], recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height); #if 0 int ref_frame; printf("get_ref_frame_map_idx: ["); for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) - printf(" %d", get_ref_frame_map_idx(cpi, ref_frame)); - printf(" ]\n"); - printf("cm->new_fb_idx = %d\n", cm->new_fb_idx); - printf("cm->ref_frame_map = ["); - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - printf(" %d", cm->ref_frame_map[ref_frame - LAST_FRAME]); - } + printf(" %d", get_ref_frame_map_idx(cm, ref_frame)); printf(" ]\n"); #endif // 0 @@ -5035,31 +4613,209 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) { } #endif // DUMP_RECON_FRAMES -static INLINE int is_frame_droppable(AV1_COMP *cpi) { - return !(cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame || - cpi->refresh_bwd_ref_frame || cpi->refresh_golden_frame || - cpi->refresh_last_frame); +static int get_interp_filter_selected(const AV1_COMMON *const cm, + MV_REFERENCE_FRAME ref, + InterpFilters ifilter) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref); + if (buf == NULL) return 0; + return buf->interp_filter_selected[ifilter]; +} + +static int setup_interp_filter_search_mask(AV1_COMP *cpi) { + const AV1_COMMON *const cm = &cpi->common; + int ref_total[REF_FRAMES] = 
{ 0 }; + + if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame) + return 0; + + for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) { + for (InterpFilters ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; + ++ifilter) { + ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter); + } + } + int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] + + ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] + + ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]); + + int mask = 0; + for (InterpFilters ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP; + ++ifilter) { + int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30; + if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) { + int filter_score = + get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 + + get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 + + get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 + + get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10; + if (filter_score < ref_total_total) mask |= 1 << ifilter; + } + } + return mask; +} + +static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, + const YV12_BUFFER_CONFIG *last_picture, + hash_table *last_hash_table) { + aom_clear_system_state(); + // check use hash ME + int k; + uint32_t hash_value_1; + uint32_t hash_value_2; + + const int block_size = 8; + const double threshold_current = 0.8; + const double threshold_average = 0.95; + const int max_history_size = 32; + int T = 0; // total block + int C = 0; // match with collocated block + int S = 0; // smooth region but not match with collocated block + int M = 0; // match with other block + + const int pic_width = cur_picture->y_width; + const int pic_height = cur_picture->y_height; + for (int i = 0; i + block_size <= pic_height; i 
+= block_size) { + for (int j = 0; j + block_size <= pic_width; j += block_size) { + const int x_pos = j; + const int y_pos = i; + int match = 1; + T++; + + // check whether collocated block match with current + uint8_t *p_cur = cur_picture->y_buffer; + uint8_t *p_ref = last_picture->y_buffer; + int stride_cur = cur_picture->y_stride; + int stride_ref = last_picture->y_stride; + p_cur += (y_pos * stride_cur + x_pos); + p_ref += (y_pos * stride_ref + x_pos); + + if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) { + uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur); + uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref); + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p16_cur[tmpX] != p16_ref[tmpX]) { + match = 0; + } + } + p16_cur += stride_cur; + p16_ref += stride_ref; + } + } else { + for (int tmpY = 0; tmpY < block_size && match; tmpY++) { + for (int tmpX = 0; tmpX < block_size && match; tmpX++) { + if (p_cur[tmpX] != p_ref[tmpX]) { + match = 0; + } + } + p_cur += stride_cur; + p_ref += stride_ref; + } + } + + if (match) { + C++; + continue; + } + + if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos, + y_pos) || + av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) { + S++; + continue; + } + + av1_get_block_hash_value( + cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur, + block_size, &hash_value_1, &hash_value_2, + (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb); + // Hashing does not work for highbitdepth currently. + // TODO(Roger): Make it work for highbitdepth. 
+ if (av1_use_hash_me(&cpi->common)) { + if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) { + M++; + } + } + } + } + + assert(T > 0); + double csm_rate = ((double)(C + S + M)) / ((double)(T)); + double m_rate = ((double)(M)) / ((double)(T)); + + cpi->csm_rate_array[cpi->rate_index] = csm_rate; + cpi->m_rate_array[cpi->rate_index] = m_rate; + + cpi->rate_index = (cpi->rate_index + 1) % max_history_size; + cpi->rate_size++; + cpi->rate_size = AOMMIN(cpi->rate_size, max_history_size); + + if (csm_rate < threshold_current) { + return 0; + } + + if (C == T) { + return 1; + } + + double csm_average = 0.0; + double m_average = 0.0; + + for (k = 0; k < cpi->rate_size; k++) { + csm_average += cpi->csm_rate_array[k]; + m_average += cpi->m_rate_array[k]; + } + csm_average /= cpi->rate_size; + m_average /= cpi->rate_size; + + if (csm_average < threshold_average) { + return 0; + } + + if (M > (T - C - S) / 3) { + return 1; + } + + if (csm_rate > 0.99 && m_rate > 0.01) { + return 1; + } + + if (csm_average + m_average > 1.01) { + return 1; + } + + return 0; } -static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, - int skip_adapt, - unsigned int *frame_flags) { +// Refresh reference frame buffers according to refresh_frame_flags. +static void refresh_reference_frames(AV1_COMP *cpi) { + AV1_COMMON *const cm = &cpi->common; + // All buffers are refreshed for shown keyframes and S-frames. 
+ + for (int ref_frame = 0; ref_frame < REF_FRAMES; ref_frame++) { + if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) { + assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame); + } + } +} + +static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, + uint8_t *dest) { AV1_COMMON *const cm = &cpi->common; SequenceHeader *const seq_params = &cm->seq_params; CurrentFrame *const current_frame = &cm->current_frame; const AV1EncoderConfig *const oxcf = &cpi->oxcf; struct segmentation *const seg = &cm->seg; - set_ext_overrides(cpi); - aom_clear_system_state(); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_to_data_rate_time); +#endif // frame type has been decided outside of this function call - cm->cur_frame->intra_only = frame_is_intra_only(cm); cm->cur_frame->frame_type = current_frame->frame_type; - // S_FRAMEs are always error resilient - cm->error_resilient_mode |= frame_is_sframe(cm); - cm->large_scale_tile = cpi->oxcf.large_scale_tile; cm->single_tile_decoding = cpi->oxcf.single_tile_decoding; @@ -5072,34 +4828,20 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, cm->allow_warped_motion = cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm); - // Reset the frame packet stamp index. - if (current_frame->frame_type == KEY_FRAME && cm->show_frame) - current_frame->frame_number = 0; + cm->last_frame_type = current_frame->frame_type; + if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search) + cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi); - // NOTE: - // (1) Move the setup of the ref_frame_flags upfront as it would be - // determined by the current frame properties; - // (2) The setup of the ref_frame_flags applies to both - // show_existing_frame's - // and the other cases. 
- if (current_frame->frame_number > 0) - cpi->ref_frame_flags = get_ref_frame_flags(cpi); + cpi->two_pass_partition_search = cpi->sf.two_pass_partition_search && + !cpi->partition_search_skippable_frame; if (encode_show_existing_frame(cm)) { - // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current - // BWDREF_FRAME in the reference frame buffer. - if (current_frame->frame_type == KEY_FRAME) { - cm->reset_decoder_state = 1; - } else { - current_frame->frame_type = INTER_FRAME; - } - cm->show_frame = 1; - cpi->frame_flags = *frame_flags; - restore_coding_context(cpi); + finalize_encoded_frame(cpi); // Build the bitstream - if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK) + int largest_tile_id = 0; // Output from bitstream: unused here + if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK) return AOM_CODEC_ERROR; if (seq_params->frame_id_numbers_present_flag && @@ -5112,40 +4854,16 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, cpi->seq_params_locked = 1; - // Set up frame to show to get ready for stats collection. - cm->frame_to_show = &cm->cur_frame->buf; - - // Update current frame offset. - current_frame->order_hint = cm->cur_frame->order_hint; - #if DUMP_RECON_FRAMES == 1 // NOTE(zoeliu): For debug - Output the filtered reconstructed video. dump_filtered_recon_frames(cpi); #endif // DUMP_RECON_FRAMES - // Update the LAST_FRAME in the reference frame buffer. 
- // NOTE: - // (1) For BWDREF_FRAME as the show_existing_frame, the reference frame - // update has been done previously when handling the LAST_BIPRED_FRAME - // right before BWDREF_FRAME (in the display order); - // (2) For INTNL_OVERLAY as the show_existing_frame, the reference frame - // update will be done when the following is called, which will - // exchange - // the virtual indexes between LAST_FRAME and ALTREF2_FRAME, so that - // LAST3 will get retired, LAST2 becomes LAST3, LAST becomes LAST2, - // and - // ALTREF2_FRAME will serve as the new LAST_FRAME. - update_reference_frames(cpi); - - // Update frame flags - cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN; - cpi->frame_flags &= ~FRAMEFLAGS_BWDREF; - cpi->frame_flags &= ~FRAMEFLAGS_ALTREF; - - *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY; - - // Update the frame type - cm->last_frame_type = current_frame->frame_type; + // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., + // for the purpose to verify no mismatch between encoder and decoder. + if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; + + refresh_reference_frames(cpi); // Since we allocate a spot for the OVERLAY frame in the gf group, we need // to do post-encoding update accordingly. 
@@ -5159,6 +4877,26 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, return AOM_CODEC_OK; } + // Work out whether to force_integer_mv this frame + if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools && + !frame_is_intra_only(cm)) { + if (cpi->common.seq_params.force_integer_mv == 2) { + // Adaptive mode: see what previous frame encoded did + if (cpi->unscaled_last_source != NULL) { + cm->cur_frame_force_integer_mv = + is_integer_mv(cpi, cpi->source, cpi->unscaled_last_source, + cpi->previous_hash_table); + } else { + cpi->common.cur_frame_force_integer_mv = 0; + } + } else { + cpi->common.cur_frame_force_integer_mv = + cpi->common.seq_params.force_integer_mv; + } + } else { + cpi->common.cur_frame_force_integer_mv = 0; + } + // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; @@ -5190,6 +4928,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, current_frame->frame_type != KEY_FRAME) { if (av1_rc_drop_frame(cpi)) { av1_rc_postencode_update_drop_frame(cpi); + release_scaled_references(cpi); return AOM_CODEC_OK; } } @@ -5204,7 +4943,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, if (seq_params->frame_id_numbers_present_flag) { /* Non-normative definition of current_frame_id ("frame counter" with * wraparound) */ - const int frame_id_length = FRAME_ID_LENGTH; if (cm->current_frame_id == -1) { int lsb, msb; /* quasi-random initialization of current_frame_id for a key frame */ @@ -5215,7 +4953,8 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, lsb = cpi->source->y_buffer[0] & 0xff; msb = cpi->source->y_buffer[1] & 0xff; } - cm->current_frame_id = ((msb << 8) + lsb) % (1 << frame_id_length); + cm->current_frame_id = + ((msb << 8) + lsb) % (1 << seq_params->frame_id_length); // S_frame is meant for stitching different streams of different // resolutions together, so 
current_frame_id must be the @@ -5225,8 +4964,8 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37; } else { cm->current_frame_id = - (cm->current_frame_id + 1 + (1 << frame_id_length)) % - (1 << frame_id_length); + (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) % + (1 << seq_params->frame_id_length); } } @@ -5249,15 +4988,14 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, } cm->timing_info_present &= !seq_params->reduced_still_picture_hdr; - if (cpi->sf.recode_loop == DISALLOW_RECODE) { - if (encode_without_recode_loop(cpi) != AOM_CODEC_OK) return AOM_CODEC_ERROR; - } else { - if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK) - return AOM_CODEC_ERROR; - } - - cm->last_tile_cols = cm->tile_cols; - cm->last_tile_rows = cm->tile_rows; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_recode_loop_time); +#endif + if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK) + return AOM_CODEC_ERROR; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_recode_loop_time); +#endif #ifdef OUTPUT_YUV_SKINMAP if (cpi->common.current_frame.frame_number > 1) { @@ -5276,23 +5014,16 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, } } - // If the encoder forced a KEY_FRAME decision or if frame is an S_FRAME - if ((current_frame->frame_type == KEY_FRAME && cm->show_frame) || - frame_is_sframe(cm)) { - cpi->refresh_last_frame = 1; - } - - cm->frame_to_show = &cm->cur_frame->buf; - cm->frame_to_show->color_primaries = seq_params->color_primaries; - cm->frame_to_show->transfer_characteristics = + cm->cur_frame->buf.color_primaries = seq_params->color_primaries; + cm->cur_frame->buf.transfer_characteristics = seq_params->transfer_characteristics; - cm->frame_to_show->matrix_coefficients = seq_params->matrix_coefficients; - cm->frame_to_show->monochrome = 
seq_params->monochrome; - cm->frame_to_show->chroma_sample_position = + cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients; + cm->cur_frame->buf.monochrome = seq_params->monochrome; + cm->cur_frame->buf.chroma_sample_position = seq_params->chroma_sample_position; - cm->frame_to_show->color_range = seq_params->color_range; - cm->frame_to_show->render_width = cm->render_width; - cm->frame_to_show->render_height = cm->render_height; + cm->cur_frame->buf.color_range = seq_params->color_range; + cm->cur_frame->buf.render_width = cm->render_width; + cm->cur_frame->buf.render_height = cm->render_height; // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned // off. @@ -5313,26 +5044,31 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, } // TODO(debargha): Fix mv search range on encoder side - // aom_extend_frame_inner_borders(cm->frame_to_show, av1_num_planes(cm)); - aom_extend_frame_borders(cm->frame_to_show, av1_num_planes(cm)); + // aom_extend_frame_inner_borders(&cm->cur_frame->buf, av1_num_planes(cm)); + aom_extend_frame_borders(&cm->cur_frame->buf, av1_num_planes(cm)); #ifdef OUTPUT_YUV_REC - aom_write_one_yuv_frame(cm, cm->frame_to_show); + aom_write_one_yuv_frame(cm, &cm->cur_frame->buf); #endif + finalize_encoded_frame(cpi); // Build the bitstream - if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK) + int largest_tile_id = 0; // Output from pack_bitstream +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, av1_pack_bitstream_final_time); +#endif + if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK) return AOM_CODEC_ERROR; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, av1_pack_bitstream_final_time); +#endif cpi->seq_params_locked = 1; - if (skip_adapt) return AOM_CODEC_OK; - + // Update reference frame ids for reference frames this frame will overwrite if (seq_params->frame_id_numbers_present_flag) { - int i; - // Update reference frame id values 
based on the value of refresh_frame_mask - for (i = 0; i < REF_FRAMES; i++) { - if ((cpi->refresh_frame_mask >> i) & 1) { + for (int i = 0; i < REF_FRAMES; i++) { + if ((current_frame->refresh_frame_flags >> i) & 1) { cm->ref_frame_id[i] = cm->current_frame_id; } } @@ -5347,7 +5083,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, if (cm->seg.update_map) { update_reference_segmentation_map(cpi); } else if (cm->last_frame_seg_map) { - memcpy(cm->current_frame_seg_map, cm->last_frame_seg_map, + memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map, cm->mi_cols * cm->mi_rows * sizeof(uint8_t)); } } @@ -5356,41 +5092,60 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, release_scaled_references(cpi); } - update_reference_frames(cpi); + // NOTE: Save the new show frame buffer index for --test-code=warn, i.e., + // for the purpose to verify no mismatch between encoder and decoder. + if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame; + + refresh_reference_frames(cpi); #if CONFIG_ENTROPY_STATS av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts); #endif // CONFIG_ENTROPY_STATS if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) { - *cm->fc = cpi->tile_data[cm->largest_tile_id].tctx; + *cm->fc = cpi->tile_data[largest_tile_id].tctx; av1_reset_cdf_symbol_counters(cm->fc); } + if (!cm->large_scale_tile) { + cm->cur_frame->frame_context = *cm->fc; + } +#define EXT_TILE_DEBUG 0 +#if EXT_TILE_DEBUG + if (cm->large_scale_tile && oxcf->pass == 2) { + char fn[20] = "./fc"; + fn[4] = current_frame->frame_number / 100 + '0'; + fn[5] = (current_frame->frame_number % 100) / 10 + '0'; + fn[6] = (current_frame->frame_number % 10) + '0'; + fn[7] = '\0'; + av1_print_frame_contexts(cm->fc, fn); + } +#endif // EXT_TILE_DEBUG +#undef EXT_TILE_DEBUG - if (cpi->refresh_golden_frame == 1) - cpi->frame_flags |= FRAMEFLAGS_GOLDEN; - else - cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN; - - if 
(cpi->refresh_alt_ref_frame == 1) - cpi->frame_flags |= FRAMEFLAGS_ALTREF; - else - cpi->frame_flags &= ~FRAMEFLAGS_ALTREF; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_to_data_rate_time); - if (cpi->refresh_bwd_ref_frame == 1) - cpi->frame_flags |= FRAMEFLAGS_BWDREF; - else - cpi->frame_flags &= ~FRAMEFLAGS_BWDREF; + // Print out timing information. + int i; + fprintf(stderr, "\n Frame number: %d, Frame type: %s, Show Frame: %d\n", + cm->current_frame.frame_number, + get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame); + for (i = 0; i < kTimingComponents; i++) { + cpi->component_time[i] += cpi->frame_component_time[i]; + fprintf(stderr, " %s: %" PRId64 " us (total: %" PRId64 " us)\n", + get_component_name(i), cpi->frame_component_time[i], + cpi->component_time[i]); + cpi->frame_component_time[i] = 0; + } +#endif cm->last_frame_type = current_frame->frame_type; av1_rc_postencode_update(cpi, *size); - if (current_frame->frame_type == KEY_FRAME) { - // Tell the caller that the frame was coded as a key frame - *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY; - } else { - *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY; + // Store encoded frame's hash table for is_integer_mv() next time + if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) { + cpi->previous_hash_table = &cm->cur_frame->hash_table; } // Clear the one shot update flags for segmentation map and mode/ref loop @@ -5414,114 +5169,62 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest, ++current_frame->frame_number; } - // NOTE: Shall not refer to any frame not used as reference. - if (cm->is_reference_frame) { - // keep track of the last coded dimensions - cm->last_width = cm->width; - cm->last_height = cm->height; - } - return AOM_CODEC_OK; } -static INLINE void update_keyframe_counters(AV1_COMP *cpi) { - // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME - // differently here for rc->avg_frame_bandwidth. 
- if (cpi->common.show_frame || cpi->rc.is_bwd_ref_frame) { - if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref || - cpi->common.current_frame.frame_type == KEY_FRAME) { - // If this is a show_existing_frame with a source other than altref, - // or if it is not a displayed forward keyframe, the keyframe update - // counters were incremented when it was originally encoded. - cpi->rc.frames_since_key++; - cpi->rc.frames_to_key--; - } - } -} - -static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) { - // TODO(weitinglin): Updating this counter for is_frame_droppable - // is a work-around to handle the condition when a frame is drop. - // We should fix the cpi->common.show_frame flag - // instead of checking the other condition to update the counter properly. - if (cpi->common.show_frame || is_frame_droppable(cpi)) { - // Decrement count down till next gf - if (cpi->rc.frames_till_gf_update_due > 0) - cpi->rc.frames_till_gf_update_due--; - } -} - -static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) { - // Increment the gf group index ready for the next frame. If this is - // a show_existing_frame with a source other than altref, or if it is not - // a displayed forward keyframe, the index was incremented when it was - // originally encoded. 
- if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref || - cpi->common.current_frame.frame_type == KEY_FRAME) { - ++cpi->twopass.gf_group.index; - } -} +int av1_encode(AV1_COMP *const cpi, uint8_t *const dest, + const EncodeFrameInput *const frame_input, + const EncodeFrameParams *const frame_params, + EncodeFrameResults *const frame_results) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; -static void update_rc_counts(AV1_COMP *cpi) { - update_keyframe_counters(cpi); - update_frames_till_gf_update(cpi); - if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi); -} + cpi->unscaled_source = frame_input->source; + cpi->source = frame_input->source; + cpi->unscaled_last_source = frame_input->last_source; + + current_frame->refresh_frame_flags = frame_params->refresh_frame_flags; + cm->error_resilient_mode = frame_params->error_resilient_mode; + cm->primary_ref_frame = frame_params->primary_ref_frame; + cm->current_frame.frame_type = frame_params->frame_type; + cm->show_frame = frame_params->show_frame; + cpi->ref_frame_flags = frame_params->ref_frame_flags; + cpi->speed = frame_params->speed; + cm->show_existing_frame = frame_params->show_existing_frame; + cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show; + + memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx, + REF_FRAMES * sizeof(*cm->remapped_ref_idx)); + + cpi->refresh_last_frame = frame_params->refresh_last_frame; + cpi->refresh_golden_frame = frame_params->refresh_golden_frame; + cpi->refresh_bwd_ref_frame = frame_params->refresh_bwd_ref_frame; + cpi->refresh_alt2_ref_frame = frame_params->refresh_alt2_ref_frame; + cpi->refresh_alt_ref_frame = frame_params->refresh_alt_ref_frame; -static void set_additional_frame_flags(AV1_COMMON *const cm, - unsigned int *frame_flags) { - if (frame_is_intra_only(cm)) *frame_flags |= FRAMEFLAGS_INTRAONLY; - if (frame_is_sframe(cm)) *frame_flags |= FRAMEFLAGS_SWITCH; - if 
(cm->error_resilient_mode) *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT; -} + if (current_frame->frame_type == KEY_FRAME && cm->show_frame) + current_frame->frame_number = 0; -static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, - int skip_adapt, unsigned int *frame_flags) { - if (cpi->oxcf.rc_mode == AOM_CBR) { - av1_rc_get_one_pass_cbr_params(cpi); + if (cm->show_existing_frame) { + current_frame->order_hint = cm->cur_frame->order_hint; } else { - av1_rc_get_one_pass_vbr_params(cpi); - } - if (encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags) != - AOM_CODEC_OK) { - return AOM_CODEC_ERROR; + current_frame->order_hint = + current_frame->frame_number + frame_params->order_offset; + current_frame->order_hint %= + (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1)); } - set_additional_frame_flags(&cpi->common, frame_flags); - - update_rc_counts(cpi); - check_show_existing_frame(cpi); - return AOM_CODEC_OK; -} - -static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest, - unsigned int *frame_flags) { -#if CONFIG_MISMATCH_DEBUG - mismatch_move_frame_idx_w(); -#endif -#if TXCOEFF_COST_TIMER - AV1_COMMON *cm = &cpi->common; - cm->txcoeff_cost_timer = 0; - cm->txcoeff_cost_count = 0; -#endif - if (encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags) != - AOM_CODEC_OK) { + if (cpi->oxcf.pass == 1) { + av1_first_pass(cpi, frame_input->ts_duration); + } else if (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) { + if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) != + AOM_CODEC_OK) { + return AOM_CODEC_ERROR; + } + } else { return AOM_CODEC_ERROR; } - set_additional_frame_flags(&cpi->common, frame_flags); -#if TXCOEFF_COST_TIMER - cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer; - fprintf(stderr, - "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld " - "in us\n", - cm->txcoeff_cost_count, cm->txcoeff_cost_timer, - cm->cum_txcoeff_cost_timer); -#endif - - 
av1_twopass_postencode_update(cpi); - update_rc_counts(cpi); - check_show_existing_frame(cpi); return AOM_CODEC_OK; } @@ -5564,7 +5267,6 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, int64_t end_time) { AV1_COMMON *const cm = &cpi->common; const SequenceHeader *const seq_params = &cm->seq_params; - struct aom_usec_timer timer; int res = 0; const int subsampling_x = sd->subsampling_x; const int subsampling_y = sd->subsampling_y; @@ -5572,8 +5274,10 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y); +#if CONFIG_INTERNAL_STATS + struct aom_usec_timer timer; aom_usec_timer_start(&timer); - +#endif #if CONFIG_DENOISE if (cpi->oxcf.noise_level > 0) if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size, @@ -5584,9 +5288,10 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, use_highbitdepth, frame_flags)) res = -1; +#if CONFIG_INTERNAL_STATS aom_usec_timer_mark(&timer); cpi->time_receive_data += aom_usec_timer_elapsed(&timer); - +#endif if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome && (subsampling_x != 1 || subsampling_y != 1)) { aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM, @@ -5610,133 +5315,6 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, return res; } -static void adjust_frame_rate(AV1_COMP *cpi, - const struct lookahead_entry *source) { - int64_t this_duration; - int step = 0; - - if (source->ts_start == cpi->first_time_stamp_ever) { - this_duration = source->ts_end - source->ts_start; - step = 1; - } else { - int64_t last_duration = - cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen; - - this_duration = source->ts_end - cpi->last_end_time_stamp_seen; - - // do a step update if the duration changes by 10% - if (last_duration) - step = (int)((this_duration - last_duration) 
* 10 / last_duration); - } - - if (this_duration) { - if (step) { - av1_new_framerate(cpi, 10000000.0 / this_duration); - } else { - // Average this frame's rate into the last second's average - // frame rate. If we haven't seen 1 second yet, then average - // over the whole interval seen. - const double interval = AOMMIN( - (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0); - double avg_duration = 10000000.0 / cpi->framerate; - avg_duration *= (interval - avg_duration + this_duration); - avg_duration /= interval; - - av1_new_framerate(cpi, 10000000.0 / avg_duration); - } - } - cpi->last_time_stamp_seen = source->ts_start; - cpi->last_end_time_stamp_seen = source->ts_end; -} - -// Returns 0 if this is not an alt ref else the offset of the source frame -// used as the arf midpoint. -static int get_arf_src_index(AV1_COMP *cpi) { - RATE_CONTROL *const rc = &cpi->rc; - int arf_src_index = 0; - if (is_altref_enabled(cpi)) { - if (cpi->oxcf.pass == 2) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { - arf_src_index = gf_group->arf_src_offset[gf_group->index]; - } - } else if (rc->source_alt_ref_pending) { - arf_src_index = rc->frames_till_gf_update_due; - } - } - return arf_src_index; -} - -static int get_brf_src_index(AV1_COMP *cpi) { - int brf_src_index = 0; - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - - // TODO(zoeliu): We need to add the check on the -bwd_ref command line setup - // flag. - if (gf_group->bidir_pred_enabled[gf_group->index]) { - if (cpi->oxcf.pass == 2) { - if (gf_group->update_type[gf_group->index] == BRF_UPDATE) - brf_src_index = gf_group->brf_src_offset[gf_group->index]; - } else { - // TODO(zoeliu): To re-visit the setup for this scenario - brf_src_index = cpi->rc.bipred_group_interval - 1; - } - } - - return brf_src_index; -} - -// Returns 0 if this is not an alt ref else the offset of the source frame -// used as the arf midpoint. 
-static int get_arf2_src_index(AV1_COMP *cpi) { - int arf2_src_index = 0; - if (is_altref_enabled(cpi) && cpi->num_extra_arfs) { - if (cpi->oxcf.pass == 2) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) { - arf2_src_index = gf_group->arf_src_offset[gf_group->index]; - } - } - } - return arf2_src_index; -} - -static void check_src_altref(AV1_COMP *cpi, - const struct lookahead_entry *source) { - RATE_CONTROL *const rc = &cpi->rc; - - // If pass == 2, the parameters set here will be reset in - // av1_rc_get_second_pass_params() - - if (cpi->oxcf.pass == 2) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - rc->is_src_frame_alt_ref = - (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) || - (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE); - rc->is_src_frame_ext_arf = - gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE; - } else { - rc->is_src_frame_alt_ref = - cpi->alt_ref_source && (source == cpi->alt_ref_source); - } - - if (rc->is_src_frame_alt_ref) { - // Current frame is an ARF overlay frame. - cpi->alt_ref_source = NULL; - - if (rc->is_src_frame_ext_arf && !cpi->common.show_existing_frame) { - // For INTNL_OVERLAY, when show_existing_frame == 0, they do need to - // refresh the LAST_FRAME, i.e. LAST3 gets retired, LAST2 becomes LAST3, - // LAST becomes LAST2, and INTNL_OVERLAY becomes LAST. - cpi->refresh_last_frame = 1; - } else { - // Don't refresh the last buffer for an ARF overlay frame. It will - // become the GF so preserve last as an alternative prediction option. 
- cpi->refresh_last_frame = 0; - } - } -} - #if CONFIG_INTERNAL_STATS extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch, const unsigned char *img2, int img2_pitch, @@ -5768,7 +5346,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { } if (cm->show_frame) { const YV12_BUFFER_CONFIG *orig = cpi->source; - const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show; + const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; double y, u, v, frame_all; cpi->count++; @@ -5843,738 +5421,31 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) { } } #endif // CONFIG_INTERNAL_STATS - -static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture, - const YV12_BUFFER_CONFIG *last_picture, - hash_table *last_hash_table) { - aom_clear_system_state(); - // check use hash ME - int k; - uint32_t hash_value_1; - uint32_t hash_value_2; - - const int block_size = 8; - const double threshold_current = 0.8; - const double threshold_average = 0.95; - const int max_history_size = 32; - int T = 0; // total block - int C = 0; // match with collocated block - int S = 0; // smooth region but not match with collocated block - int M = 0; // match with other block - - const int pic_width = cur_picture->y_width; - const int pic_height = cur_picture->y_height; - for (int i = 0; i + block_size <= pic_height; i += block_size) { - for (int j = 0; j + block_size <= pic_width; j += block_size) { - const int x_pos = j; - const int y_pos = i; - int match = 1; - T++; - - // check whether collocated block match with current - uint8_t *p_cur = cur_picture->y_buffer; - uint8_t *p_ref = last_picture->y_buffer; - int stride_cur = cur_picture->y_stride; - int stride_ref = last_picture->y_stride; - p_cur += (y_pos * stride_cur + x_pos); - p_ref += (y_pos * stride_ref + x_pos); - - if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) { - uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur); - uint16_t *p16_ref = 
CONVERT_TO_SHORTPTR(p_ref); - for (int tmpY = 0; tmpY < block_size && match; tmpY++) { - for (int tmpX = 0; tmpX < block_size && match; tmpX++) { - if (p16_cur[tmpX] != p16_ref[tmpX]) { - match = 0; - } - } - p16_cur += stride_cur; - p16_ref += stride_ref; - } - } else { - for (int tmpY = 0; tmpY < block_size && match; tmpY++) { - for (int tmpX = 0; tmpX < block_size && match; tmpX++) { - if (p_cur[tmpX] != p_ref[tmpX]) { - match = 0; - } - } - p_cur += stride_cur; - p_ref += stride_ref; - } - } - - if (match) { - C++; - continue; - } - - if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos, - y_pos) || - av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) { - S++; - continue; - } - - av1_get_block_hash_value( - cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur, - block_size, &hash_value_1, &hash_value_2, - (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb); - // Hashing does not work for highbitdepth currently. - // TODO(Roger): Make it work for highbitdepth. 
- if (av1_use_hash_me(&cpi->common)) { - if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) { - M++; - } - } - } - } - - assert(T > 0); - double csm_rate = ((double)(C + S + M)) / ((double)(T)); - double m_rate = ((double)(M)) / ((double)(T)); - - cpi->csm_rate_array[cpi->rate_index] = csm_rate; - cpi->m_rate_array[cpi->rate_index] = m_rate; - - cpi->rate_index = (cpi->rate_index + 1) % max_history_size; - cpi->rate_size++; - cpi->rate_size = AOMMIN(cpi->rate_size, max_history_size); - - if (csm_rate < threshold_current) { - return 0; - } - - if (C == T) { - return 1; - } - - double csm_average = 0.0; - double m_average = 0.0; - - for (k = 0; k < cpi->rate_size; k++) { - csm_average += cpi->csm_rate_array[k]; - m_average += cpi->m_rate_array[k]; - } - csm_average /= cpi->rate_size; - m_average /= cpi->rate_size; - - if (csm_average < threshold_average) { - return 0; - } - - if (M > (T - C - S) / 3) { - return 1; - } - - if (csm_rate > 0.99 && m_rate > 0.01) { - return 1; - } - - if (csm_average + m_average > 1.01) { - return 1; - } - - return 0; -} - -// Code for temporal dependency model -typedef struct GF_PICTURE { - YV12_BUFFER_CONFIG *frame; - int ref_frame[7]; -} GF_PICTURE; - -static void init_gop_frames(AV1_COMP *cpi, GF_PICTURE *gf_picture, - const GF_GROUP *gf_group, int *tpl_group_frames) { - AV1_COMMON *cm = &cpi->common; - const SequenceHeader *const seq_params = &cm->seq_params; - int frame_idx = 0; - int i; - int gld_index = -1; - int alt_index = -1; - int lst_index = -1; - int extend_frame_count = 0; - int pframe_qindex = cpi->tpl_stats[2].base_qindex; - - RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; - int recon_frame_index[INTER_REFS_PER_FRAME + 1] = { -1, -1, -1, -1, - -1, -1, -1, -1 }; - - // TODO(jingning): To be used later for gf frame type parsing. 
- (void)gf_group; - - for (i = 0; i < FRAME_BUFFERS && frame_idx < INTER_REFS_PER_FRAME + 1; ++i) { - if (frame_bufs[i].ref_count == 0) { - alloc_frame_mvs(cm, i); - if (aom_realloc_frame_buffer( - &frame_bufs[i].buf, cm->width, cm->height, - seq_params->subsampling_x, seq_params->subsampling_y, - seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->byte_alignment, NULL, NULL, NULL)) - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate frame buffer"); - - recon_frame_index[frame_idx] = i; - ++frame_idx; - } - } - - for (i = 0; i < INTER_REFS_PER_FRAME + 1; ++i) { - assert(recon_frame_index[i] >= 0); - cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; - } - - *tpl_group_frames = 0; - - // Initialize Golden reference frame. - gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - for (i = 0; i < 7; ++i) gf_picture[0].ref_frame[i] = -1; - gld_index = 0; - ++*tpl_group_frames; - - // Initialize ARF frame - gf_picture[1].frame = cpi->source; - gf_picture[1].ref_frame[0] = gld_index; - gf_picture[1].ref_frame[1] = lst_index; - gf_picture[1].ref_frame[2] = alt_index; - // TODO(yuec) Need o figure out full AV1 reference model - for (i = 3; i < 7; ++i) gf_picture[1].ref_frame[i] = -1; - alt_index = 1; - ++*tpl_group_frames; - - // Initialize P frames - for (frame_idx = 2; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) { - struct lookahead_entry *buf = - av1_lookahead_peek(cpi->lookahead, frame_idx - 2); - - if (buf == NULL) break; - - gf_picture[frame_idx].frame = &buf->img; - gf_picture[frame_idx].ref_frame[0] = gld_index; - gf_picture[frame_idx].ref_frame[1] = lst_index; - gf_picture[frame_idx].ref_frame[2] = alt_index; - for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1; - - ++*tpl_group_frames; - lst_index = frame_idx; - - if (frame_idx == cpi->rc.baseline_gf_interval + 1) break; - } - - gld_index = frame_idx; - lst_index = AOMMAX(0, frame_idx - 1); - alt_index = -1; - ++frame_idx; - - // Extend two frames 
outside the current gf group. - for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { - struct lookahead_entry *buf = - av1_lookahead_peek(cpi->lookahead, frame_idx - 2); - - if (buf == NULL) break; - - cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; - - gf_picture[frame_idx].frame = &buf->img; - gf_picture[frame_idx].ref_frame[0] = gld_index; - gf_picture[frame_idx].ref_frame[1] = lst_index; - gf_picture[frame_idx].ref_frame[2] = alt_index; - for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1; - lst_index = frame_idx; - ++*tpl_group_frames; - ++extend_frame_count; - } -} - -static void init_tpl_stats(AV1_COMP *cpi) { - int frame_idx; - for (frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - memset(tpl_frame->tpl_stats_ptr, 0, - tpl_frame->height * tpl_frame->width * - sizeof(*tpl_frame->tpl_stats_ptr)); - tpl_frame->is_valid = 0; - } -} - -static uint32_t motion_compensated_prediction(AV1_COMP *cpi, ThreadData *td, - uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, - int stride, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - AV1_COMMON *cm = &cpi->common; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS search_method = NSTEP; - int step_param; - int sadpb = x->sadperbit16; - uint32_t bestsme = UINT_MAX; - int distortion; - uint32_t sse; - int cost_list[5]; - const MvLimits tmp_mv_limits = x->mv_limits; - - MV best_ref_mv1 = { 0, 0 }; - MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - - best_ref_mv1_full.col = best_ref_mv1.col >> 3; - best_ref_mv1_full.row = best_ref_mv1.row >> 3; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - step_param = mv_sf->reduce_first_step_size; - step_param = AOMMIN(step_param, 
MAX_MVSEARCH_STEPS - 2); - - av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - - av1_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, - search_method, 0, sadpb, cond_cost_list(cpi, cost_list), - &best_ref_mv1, INT_MAX, 0, (MI_SIZE * mi_col), - (MI_SIZE * mi_row), 0); - - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - - const int pw = block_size_wide[bsize]; - const int ph = block_size_high[bsize]; - bestsme = cpi->find_fractional_mv_step( - x, cm, mi_row, mi_col, &best_ref_mv1, cpi->common.allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[bsize], 0, mv_sf->subpel_iters_per_step, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL, - 0, 0, pw, ph, 1, 1); - - return bestsme; -} - -static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, - int ref_pos_col, int block, BLOCK_SIZE bsize) { - int width = 0, height = 0; - int bw = 4 << mi_size_wide_log2[bsize]; - int bh = 4 << mi_size_high_log2[bsize]; - - switch (block) { - case 0: - width = grid_pos_col + bw - ref_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 1: - width = ref_pos_col + bw - grid_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 2: - width = grid_pos_col + bw - ref_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - case 3: - width = ref_pos_col + bw - grid_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - default: assert(0); - } - - return width * height; -} - -static int round_floor(int ref_pos, int bsize_pix) { - int round; - if (ref_pos < 0) - round = -(1 + (-ref_pos - 1) / bsize_pix); - else - round = ref_pos / bsize_pix; - - return round; -} - -static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, - BLOCK_SIZE bsize, int stride, - const TplDepStats *src_stats) { - const int mi_height = mi_size_high[bsize]; - const int mi_width = mi_size_wide[bsize]; - int idx, idy; - - int64_t intra_cost = src_stats->intra_cost / (mi_height 
* mi_width); - int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width); - - TplDepStats *tpl_ptr; - - intra_cost = AOMMAX(1, intra_cost); - inter_cost = AOMMAX(1, inter_cost); - - for (idy = 0; idy < mi_height; ++idy) { - tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col]; - for (idx = 0; idx < mi_width; ++idx) { - tpl_ptr->intra_cost = intra_cost; - tpl_ptr->inter_cost = inter_cost; - tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; - tpl_ptr->ref_frame_index = src_stats->ref_frame_index; - tpl_ptr->mv.as_int = src_stats->mv.as_int; - ++tpl_ptr; - } - } -} - -static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, - int mi_row, int mi_col, const BLOCK_SIZE bsize) { - TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; - TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; - MV mv = tpl_stats->mv.as_mv; - int mv_row = mv.row >> 3; - int mv_col = mv.col >> 3; - - int ref_pos_row = mi_row * MI_SIZE + mv_row; - int ref_pos_col = mi_col * MI_SIZE + mv_col; - - const int bw = 4 << mi_size_wide_log2[bsize]; - const int bh = 4 << mi_size_high_log2[bsize]; - const int mi_height = mi_size_high[bsize]; - const int mi_width = mi_size_wide[bsize]; - const int pix_num = bw * bh; - - // top-left on grid block location in pixel - int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; - int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; - int block; - - for (block = 0; block < 4; ++block) { - int grid_pos_row = grid_pos_row_base + bh * (block >> 1); - int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); - - if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && - grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { - int overlap_area = get_overlap_area( - grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); - int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; - int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; - - 
int64_t mc_flow = tpl_stats->mc_dep_cost - - (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / - tpl_stats->intra_cost; - - int idx, idy; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *des_stats = - &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + - (ref_mi_col + idx)]; - - des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; - des_stats->mc_ref_cost += - ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / - pix_num; - assert(overlap_area >= 0); - } - } - } - } -} - -static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, - int mi_row, int mi_col, const BLOCK_SIZE bsize) { - int idx, idy; - const int mi_height = mi_size_high[bsize]; - const int mi_width = mi_size_wide[bsize]; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *tpl_ptr = - &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; - tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, - BLOCK_4X4); - } - } -} - -static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - TX_SIZE tx_size, int64_t *recon_error, - int64_t *sse) { - const struct macroblock_plane *const p = &x->plane[plane]; - const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size]; - uint16_t eob; - int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; - const int shift = tx_size == TX_32X32 ? 
0 : 2; - - av1_quantize_fp_32x32(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX, - p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, dqcoeff, - p->dequant_QTX, &eob, scan_order->scan, - scan_order->iscan); - - *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift; - *recon_error = AOMMAX(*recon_error, 1); - - *sse = (*sse) >> shift; - *sse = AOMMAX(*sse, 1); -} - -static void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size) { - switch (tx_size) { - case TX_8X8: aom_hadamard_8x8(src_diff, bw, coeff); break; - case TX_16X16: aom_hadamard_16x16(src_diff, bw, coeff); break; - case TX_32X32: aom_hadamard_32x32(src_diff, bw, coeff); break; - default: assert(0); - } -} - -static void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, - struct scale_factors *sf, GF_PICTURE *gf_picture, - int frame_idx, int16_t *src_diff, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, - int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, - YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, - int64_t *recon_error, int64_t *sse, - TplDepStats *tpl_stats) { - AV1_COMMON *cm = &cpi->common; - ThreadData *td = &cpi->td; - - const int bw = 4 << mi_size_wide_log2[bsize]; - const int bh = 4 << mi_size_high_log2[bsize]; - const int pix_num = bw * bh; - int best_rf_idx = -1; - int_mv best_mv; - int64_t best_inter_cost = INT64_MAX; - int64_t inter_cost; - int rf_idx; - const InterpFilters kernel = - av1_make_interp_filters(EIGHTTAP_REGULAR, EIGHTTAP_REGULAR); - - int64_t best_intra_cost = INT64_MAX; - int64_t intra_cost; - PREDICTION_MODE mode; - int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - MB_MODE_INFO mi_above, mi_left; - - memset(tpl_stats, 0, sizeof(*tpl_stats)); - - xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; - xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * 
MI_SIZE) * 8; - xd->above_mbmi = (mi_row > 0) ? &mi_above : NULL; - xd->left_mbmi = (mi_col > 0) ? &mi_left : NULL; - - // Intra prediction search - for (mode = DC_PRED; mode <= PAETH_PRED; ++mode) { - uint8_t *src, *dst; - int src_stride, dst_stride; - - src = xd->cur_buf->y_buffer + mb_y_offset; - src_stride = xd->cur_buf->y_stride; - - dst = &predictor[0]; - dst_stride = bw; - - xd->mi[0]->sb_type = bsize; - xd->mi[0]->ref_frame[0] = INTRA_FRAME; - - av1_predict_intra_block( - cm, xd, block_size_wide[bsize], block_size_high[bsize], tx_size, mode, - 0, 0, FILTER_INTRA_MODES, src, src_stride, dst, dst_stride, 0, 0, 0); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - aom_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, - dst_stride, xd->bd); - } else { - aom_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, - dst_stride); - } - - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - - intra_cost = aom_satd(coeff, pix_num); - - if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; - } - - // Motion compensated prediction - best_mv.as_int = 0; - - (void)mb_y_offset; - // Motion estimation column boundary - x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND)); - x->mv_limits.col_max = - ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND); - - for (rf_idx = 0; rf_idx < 7; ++rf_idx) { - if (ref_frame[rf_idx] == NULL) continue; - - motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, bsize, mi_row, mi_col); - - // TODO(jingning): Not yet support high bit-depth in the next three - // steps. 
- ConvolveParams conv_params = get_conv_params(0, 0, xd->bd); - WarpTypesAllowed warp_types; - memset(&warp_types, 0, sizeof(WarpTypesAllowed)); - - av1_build_inter_predictor( - ref_frame[rf_idx]->y_buffer + mb_y_offset, ref_frame[rf_idx]->y_stride, - &predictor[0], bw, &x->best_mv.as_mv, sf, bw, bh, &conv_params, kernel, - &warp_types, mi_col * MI_SIZE, mi_row * MI_SIZE, 0, 0, MV_PRECISION_Q3, - mi_col * MI_SIZE, mi_row * MI_SIZE, xd, 0); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - aom_highbd_subtract_block( - bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); - } else { - aom_subtract_block(bh, bw, src_diff, bw, - xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw); - } - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - - inter_cost = aom_satd(coeff, pix_num); - if (inter_cost < best_inter_cost) { - best_rf_idx = rf_idx; - best_inter_cost = inter_cost; - best_mv.as_int = x->best_mv.as_int; - get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, - sse); - } - } - best_intra_cost = AOMMAX(best_intra_cost, 1); - best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost); - tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2; - tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2; - tpl_stats->mc_dep_cost = tpl_stats->intra_cost + tpl_stats->mc_flow; - - tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; - tpl_stats->mv.as_int = best_mv.as_int; -} - -static void mc_flow_dispenser(AV1_COMP *cpi, GF_PICTURE *gf_picture, - int frame_idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; - YV12_BUFFER_CONFIG *ref_frame[7] = { - NULL, NULL, NULL, NULL, NULL, NULL, NULL - }; - - AV1_COMMON *cm = &cpi->common; - struct scale_factors sf; - int rdmult, idx; - ThreadData *td = &cpi->td; - MACROBLOCK *x = &td->mb; - MACROBLOCKD *xd = 
&x->e_mbd; - int mi_row, mi_col; - - DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); - DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); - uint8_t *predictor; - DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); - - const BLOCK_SIZE bsize = BLOCK_32X32; - const TX_SIZE tx_size = max_txsize_lookup[bsize]; - const int mi_height = mi_size_high[bsize]; - const int mi_width = mi_size_wide[bsize]; - int64_t recon_error, sse; - - // Setup scaling factor - av1_setup_scale_factors_for_frame( - &sf, this_frame->y_crop_width, this_frame->y_crop_height, - this_frame->y_crop_width, this_frame->y_crop_height); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - predictor = CONVERT_TO_BYTEPTR(predictor16); - else - predictor = predictor8; - - // Prepare reference frame pointers. If any reference frame slot is - // unavailable, the pointer will be set to Null. - for (idx = 0; idx < 7; ++idx) { - int rf_idx = gf_picture[frame_idx].ref_frame[idx]; - if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; - } - - xd->mi = cm->mi_grid_visible; - xd->mi[0] = cm->mi; - xd->cur_buf = this_frame; - - // Get rd multiplier set up. 
- rdmult = (int)av1_compute_rd_mult(cpi, tpl_frame->base_qindex); - if (rdmult < 1) rdmult = 1; - set_error_per_bit(&cpi->td.mb, rdmult); - av1_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); - - tpl_frame->is_valid = 1; - - cm->base_qindex = tpl_frame->base_qindex; - av1_frame_init_quantizer(cpi); - - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - // Motion estimation row boundary - x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND)); - x->mv_limits.row_max = - (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * AOM_INTERP_EXTEND); - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - TplDepStats tpl_stats; - mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, src_diff, coeff, - qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size, - ref_frame, predictor, &recon_error, &sse, &tpl_stats); - - // Motion flow dependency dispenser. - tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, - tpl_frame->stride, &tpl_stats); - - tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, - bsize); - } - } -} - -static void setup_tpl_stats(AV1_COMP *cpi) { - GF_PICTURE gf_picture[MAX_LAG_BUFFERS]; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - int tpl_group_frames = 0; - int frame_idx; - - init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); - - init_tpl_stats(cpi); - - // Backward propagation from tpl_group_frames to 1. 
- for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) - mc_flow_dispenser(cpi, gf_picture, frame_idx); -} - int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, int64_t *time_end, int flush, const aom_rational_t *timebase) { const AV1EncoderConfig *const oxcf = &cpi->oxcf; AV1_COMMON *const cm = &cpi->common; - CurrentFrame *const current_frame = &cm->current_frame; - const int num_planes = av1_num_planes(cm); - BufferPool *const pool = cm->buffer_pool; - RATE_CONTROL *const rc = &cpi->rc; - struct aom_usec_timer cmptimer; - YV12_BUFFER_CONFIG *force_src_buffer = NULL; - struct lookahead_entry *last_source = NULL; - struct lookahead_entry *source = NULL; - int arf_src_index; - int brf_src_index; - int i; #if CONFIG_BITSTREAM_DEBUG assert(cpi->oxcf.max_threads == 0 && "bitstream debug tool does not support multithreading"); bitstream_queue_record_write(); - bitstream_queue_set_frame_write(current_frame->frame_number * 2 + + bitstream_queue_set_frame_write(cm->current_frame.frame_number * 2 + cm->show_frame); #endif + // Indicates whether or not to use an adaptive quantize b rather than + // the traditional version + cm->use_quant_b_adapt = cpi->oxcf.quant_b_adapt; + cm->showable_frame = 0; + *size = 0; +#if CONFIG_INTERNAL_STATS + struct aom_usec_timer cmptimer; aom_usec_timer_start(&cmptimer); - +#endif set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0); // Normal defaults @@ -6584,387 +5455,42 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, if (oxcf->large_scale_tile) cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; - // default reference buffers update config - av1_configure_buffer_updates_firstpass(cpi, LF_UPDATE); - // Initialize fields related to forward keyframes cpi->no_show_kf = 0; - cm->reset_decoder_state = 0; - - // Don't allow a show_existing_frame to coincide with an error resilient or - // S-Frame. 
An exception can be made in the case of a keyframe, since it - // does not depend on any previous frames. We must make this exception here - // because of the use of show_existing_frame with forward coded keyframes. - struct lookahead_entry *lookahead_src = NULL; - if (current_frame->frame_number > 0) - lookahead_src = av1_lookahead_peek(cpi->lookahead, 0); - - int use_show_existing = 1; - if (lookahead_src != NULL) { - const int is_error_resilient = - cpi->oxcf.error_resilient_mode || - (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT); - const int is_s_frame = cpi->oxcf.s_frame_mode || - (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME); - const int is_key_frame = - (rc->frames_to_key == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY); - use_show_existing = !(is_error_resilient || is_s_frame) || is_key_frame; - } - - if (oxcf->pass == 2 && cm->show_existing_frame && use_show_existing) { - // Manage the source buffer and flush out the source frame that has been - // coded already; Also get prepared for PSNR calculation if needed. - if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) { - *size = 0; - return -1; - } - av1_apply_encoding_flags(cpi, source->flags); - cpi->source = &source->img; - // TODO(zoeliu): To track down to determine whether it's needed to adjust - // the frame rate. - *time_stamp = source->ts_start; - *time_end = source->ts_end; - - // We need to adjust frame rate for an overlay frame - if (cpi->rc.is_src_frame_alt_ref) adjust_frame_rate(cpi, source); - - // Find a free buffer for the new frame, releasing the reference - // previously held. - if (cm->new_fb_idx != INVALID_IDX) { - --pool->frame_bufs[cm->new_fb_idx].ref_count; - } - - cm->cur_frame = NULL; - cm->new_fb_idx = get_free_fb(cm); - if (cm->new_fb_idx == INVALID_IDX) return -1; - cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; - - // Clear down mmx registers - aom_clear_system_state(); - - // Start with a 0 size frame. 
- *size = 0; - - // We need to update the gf_group for show_existing overlay frame - if (cpi->rc.is_src_frame_alt_ref) av1_rc_get_second_pass_params(cpi); - - if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK) - return AOM_CODEC_ERROR; - - if (cpi->b_calculate_psnr) generate_psnr_packet(cpi); - -#if CONFIG_INTERNAL_STATS - compute_internal_stats(cpi, (int)(*size)); -#endif // CONFIG_INTERNAL_STATS - - // Clear down mmx registers - aom_clear_system_state(); - - cm->show_existing_frame = 0; - return 0; - } - - // Should we encode an arf frame. - arf_src_index = get_arf_src_index(cpi); - if (arf_src_index) { - for (i = 0; i <= arf_src_index; ++i) { - struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i); - // Avoid creating an alt-ref if there's a forced keyframe pending. - if (e == NULL) { - break; - } else if (e->flags == AOM_EFLAG_FORCE_KF) { - arf_src_index = 0; - flush = 1; - break; - } - } - } - - if (arf_src_index) { - assert(arf_src_index <= rc->frames_to_key); - - if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { - cm->showable_frame = 1; - cpi->alt_ref_source = source; - // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf - if (arf_src_index == rc->frames_to_key) { - // Skip temporal filtering and mark as intra_only if we have a fwd_kf - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - int which_arf = gf_group->arf_update_idx[gf_group->index]; - cpi->is_arf_filter_off[which_arf] = 1; - cpi->no_show_kf = 1; - } else { - if (oxcf->arnr_max_frames > 0) { - // Produce the filtered ARF frame. 
- av1_temporal_filter(cpi, arf_src_index); - aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes); - force_src_buffer = &cpi->alt_ref_buffer; - } - } - cm->show_frame = 0; - current_frame->intra_only = 0; - - if (oxcf->pass < 2) { - // In second pass, the buffer updates configure will be set - // in the function av1_rc_get_second_pass_params - av1_configure_buffer_updates_firstpass(cpi, ARF_UPDATE); - } - } - rc->source_alt_ref_pending = 0; - } - - // Should we encode an arf2 frame. - arf_src_index = get_arf2_src_index(cpi); - if (arf_src_index) { - for (i = 0; i <= arf_src_index; ++i) { - struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i); - // Avoid creating an alt-ref if there's a forced keyframe pending. - if (e == NULL) { - break; - } else if (e->flags == AOM_EFLAG_FORCE_KF) { - arf_src_index = 0; - flush = 1; - break; - } - } - } - - if (arf_src_index) { - assert(arf_src_index <= rc->frames_to_key); - - if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { - cm->showable_frame = 1; - cpi->alt_ref_source = source; - - if (oxcf->arnr_max_frames > 0) { - // Produce the filtered ARF frame. 
- av1_temporal_filter(cpi, arf_src_index); - aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes); - force_src_buffer = &cpi->alt_ref_buffer; - } - - cm->show_frame = 0; - current_frame->intra_only = 0; - - if (oxcf->pass < 2) { - // In second pass, the buffer updates configure will be set - // in the function av1_rc_get_second_pass_params - av1_configure_buffer_updates_firstpass(cpi, INTNL_ARF_UPDATE); - } - } - rc->source_alt_ref_pending = 0; - } - - rc->is_bwd_ref_frame = 0; - brf_src_index = get_brf_src_index(cpi); - if (brf_src_index) { - assert(brf_src_index <= rc->frames_to_key); - if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) { - cm->showable_frame = 1; - cm->show_frame = 0; - current_frame->intra_only = 0; - - if (oxcf->pass < 2) { - // In second pass, the buffer updates configure will be set - // in the function av1_rc_get_second_pass_params - av1_configure_buffer_updates_firstpass(cpi, BIPRED_UPDATE); - } - } - } - if (!source) { - // Get last frame source. - if (current_frame->frame_number > 0) { - if ((last_source = av1_lookahead_peek(cpi->lookahead, -1)) == NULL) - return -1; - } - if (current_frame->frame_number > 0) assert(last_source != NULL); - // Read in the source frame. - source = av1_lookahead_pop(cpi->lookahead, flush); - - if (source != NULL) { - cm->show_frame = 1; - current_frame->intra_only = 0; - - // Check to see if the frame should be encoded as an arf overlay. - check_src_altref(cpi, source); - } - } - if (source) { - cpi->unscaled_source = cpi->source = - force_src_buffer ? force_src_buffer : &source->img; - cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL; + if (assign_cur_frame_new_fb(cm) == NULL) return AOM_CODEC_ERROR; - *time_stamp = source->ts_start; - *time_end = source->ts_end; - av1_apply_encoding_flags(cpi, source->flags); - *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? 
FRAMEFLAGS_KEY : 0; - - } else { - *size = 0; - if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { - av1_end_first_pass(cpi); /* get last stats packet */ - cpi->twopass.first_pass_done = 1; - } + const int result = av1_encode_strategy(cpi, size, dest, frame_flags, + time_stamp, time_end, timebase, flush); + if (result != AOM_CODEC_OK && result != -1) { + return AOM_CODEC_ERROR; + } else if (result == -1) { + // Returning -1 indicates no frame encoded; more input is required return -1; } - - if (source->ts_start < cpi->first_time_stamp_ever) { - cpi->first_time_stamp_ever = source->ts_start; - cpi->last_end_time_stamp_seen = source->ts_start; - } - - // Clear down mmx registers - aom_clear_system_state(); - - // adjust frame rates based on timestamps given - if (cm->show_frame) adjust_frame_rate(cpi, source); - - // Find a free buffer for the new frame, releasing the reference previously - // held. - if (cm->new_fb_idx != INVALID_IDX) { - --pool->frame_bufs[cm->new_fb_idx].ref_count; - } - cm->new_fb_idx = get_free_fb(cm); - - if (cm->new_fb_idx == INVALID_IDX) return -1; - - cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; - // Retain the RF_LEVEL for the current newly coded frame. - cm->cur_frame->frame_rf_level = - cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; - - cm->cur_frame->buf.buf_8bit_valid = 0; - - if (cpi->film_grain_table) { - cm->seq_params.film_grain_params_present = aom_film_grain_table_lookup( - cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */, - &cm->film_grain_params); - } - cm->cur_frame->film_grain_params_present = - cm->seq_params.film_grain_params_present; - - // only one operating point supported now - const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp); - if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR; - cpi->common.frame_presentation_time = (uint32_t)pts64; - - // Start with a 0 size frame. 
- *size = 0; - - cpi->frame_flags = *frame_flags; - - if (oxcf->pass == 2) { - av1_rc_get_second_pass_params(cpi); - } else if (oxcf->pass == 1) { - setup_frame_size(cpi); - } - - if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) { - for (i = 0; i < INTER_REFS_PER_FRAME; ++i) - cpi->scaled_ref_idx[i] = INVALID_IDX; - } - - cm->using_qmatrix = cpi->oxcf.using_qm; - cm->min_qmlevel = cpi->oxcf.qm_minlevel; - cm->max_qmlevel = cpi->oxcf.qm_maxlevel; - - if (cm->seq_params.frame_id_numbers_present_flag && *time_stamp == 0) { - cpi->common.current_frame_id = -1; - } - - cpi->cur_poc++; - if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools && - !frame_is_intra_only(cm)) { - if (cpi->common.seq_params.force_integer_mv == 2) { - struct lookahead_entry *previous_entry = - av1_lookahead_peek(cpi->lookahead, cpi->previous_index); - if (!previous_entry) - cpi->common.cur_frame_force_integer_mv = 0; - else - cpi->common.cur_frame_force_integer_mv = is_integer_mv( - cpi, cpi->source, &previous_entry->img, cpi->previous_hash_table); - } else { - cpi->common.cur_frame_force_integer_mv = - cpi->common.seq_params.force_integer_mv; - } - } else { - cpi->common.cur_frame_force_integer_mv = 0; - } - - if (cpi->twopass.gf_group.index == 1 && cpi->oxcf.enable_tpl_model) { - set_frame_size(cpi, cm->width, cm->height); - setup_tpl_stats(cpi); - } - - if (oxcf->pass == 1) { - cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf); - av1_first_pass(cpi, source); - } else if (oxcf->pass == 2) { - if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK) - return AOM_CODEC_ERROR; - } else { - // One pass encode - if (Pass0Encode(cpi, size, dest, 0, frame_flags) != AOM_CODEC_OK) - return AOM_CODEC_ERROR; - } - if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) { - cpi->previous_hash_table = &cm->cur_frame->hash_table; - { - int l; - for (l = -MAX_PRE_FRAMES; l < cpi->lookahead->max_sz; l++) { - if ((cpi->lookahead->buf + l) == source) { - 
cpi->previous_index = l; - break; - } - } - - if (l == cpi->lookahead->max_sz) { - aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, - "Failed to find last frame original buffer"); - } - } - } - - if (!cm->large_scale_tile) { - cm->cur_frame->frame_context = *cm->fc; - } - -#define EXT_TILE_DEBUG 0 -#if EXT_TILE_DEBUG - if (cm->large_scale_tile && oxcf->pass == 2) { - char fn[20] = "./fc"; - fn[4] = current_frame->frame_number / 100 + '0'; - fn[5] = (current_frame->frame_number % 100) / 10 + '0'; - fn[6] = (current_frame->frame_number % 10) + '0'; - fn[7] = '\0'; - av1_print_frame_contexts(cm->fc, fn); - } -#endif // EXT_TILE_DEBUG -#undef EXT_TILE_DEBUG - - cm->showable_frame = !cm->show_frame && cm->showable_frame; - - // No frame encoded, or frame was dropped, release scaled references. - if ((*size == 0) && (frame_is_intra_only(cm) == 0)) { - release_scaled_references(cpi); - } - - if (*size > 0) { - cpi->droppable = is_frame_droppable(cpi); - } - +#if CONFIG_INTERNAL_STATS aom_usec_timer_mark(&cmptimer); cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer); - - if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame) - generate_psnr_packet(cpi); +#endif + if (cpi->b_calculate_psnr) { + if (cm->show_existing_frame || (oxcf->pass != 1 && cm->show_frame)) { + generate_psnr_packet(cpi); + } + } + if (cpi->keep_level_stats && oxcf->pass != 1) + av1_update_level_info(cpi, *size, *time_stamp, *time_end); #if CONFIG_INTERNAL_STATS if (oxcf->pass != 1) { compute_internal_stats(cpi, (int)(*size)); } #endif // CONFIG_INTERNAL_STATS +#if CONFIG_SPEED_STATS + if (cpi->oxcf.pass != 1 && !cm->show_existing_frame) { + cpi->tx_search_count += cpi->td.mb.tx_search_count; + cpi->td.mb.tx_search_count = 0; + } +#endif // CONFIG_SPEED_STATS aom_clear_system_state(); @@ -6977,8 +5503,8 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) { return -1; } else { int ret; - if (cm->frame_to_show) { - *dest = *cm->frame_to_show; + if (cm->cur_frame 
!= NULL) { + *dest = cm->cur_frame->buf; dest->y_width = cm->width; dest->y_height = cm->height; dest->uv_width = cm->width >> cm->seq_params.subsampling_x; @@ -6993,10 +5519,9 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) { } int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) { - if (cpi->last_show_frame_buf_idx == INVALID_IDX) return -1; + if (cpi->last_show_frame_buf == NULL) return -1; - *frame = - cpi->common.buffer_pool->frame_bufs[cpi->last_show_frame_buf_idx].buf; + *frame = cpi->last_show_frame_buf->buf; return 0; } @@ -7148,7 +5673,14 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { upd ^= AOM_ALT2_FLAG; } - av1_update_reference(cpi, upd); + cpi->ext_refresh_last_frame = (upd & AOM_LAST_FLAG) != 0; + cpi->ext_refresh_golden_frame = (upd & AOM_GOLD_FLAG) != 0; + cpi->ext_refresh_alt_ref_frame = (upd & AOM_ALT_FLAG) != 0; + cpi->ext_refresh_bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0; + cpi->ext_refresh_alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0; + cpi->ext_refresh_frame_flags_pending = 1; + } else { + cpi->ext_refresh_frame_flags_pending = 0; } cpi->ext_use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs & @@ -7164,15 +5696,6 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) { } } -int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n) { - return n * TICKS_PER_SEC * timebase->num / timebase->den; -} - -int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) { - const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1; - return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; -} - aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) { if (!cpi) return NULL; @@ -7189,7 +5712,7 @@ aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) { if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL; memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size); - if 
(write_obu_header(OBU_SEQUENCE_HEADER, 0, &header_buf[0]) != + if (av1_write_obu_header(cpi, OBU_SEQUENCE_HEADER, 0, &header_buf[0]) != obu_header_size) { return NULL; } diff --git a/libaom/av1/encoder/encoder.h b/libaom/av1/encoder/encoder.h index 1ff2ef7..bf02394 100644 --- a/libaom/av1/encoder/encoder.h +++ b/libaom/av1/encoder/encoder.h @@ -12,6 +12,7 @@ #ifndef AOM_AV1_ENCODER_ENCODER_H_ #define AOM_AV1_ENCODER_ENCODER_H_ +#include <stdbool.h> #include <stdio.h> #include "config/aom_config.h" @@ -24,11 +25,14 @@ #include "av1/common/onyxc_int.h" #include "av1/common/resize.h" #include "av1/common/timing.h" +#include "av1/common/blockd.h" +#include "av1/common/enums.h" #include "av1/encoder/aq_cyclicrefresh.h" #include "av1/encoder/av1_quantize.h" #include "av1/encoder/context_tree.h" #include "av1/encoder/encodemb.h" #include "av1/encoder/firstpass.h" +#include "av1/encoder/level.h" #include "av1/encoder/lookahead.h" #include "av1/encoder/mbgraph.h" #include "av1/encoder/mcomp.h" @@ -36,6 +40,7 @@ #include "av1/encoder/rd.h" #include "av1/encoder/speed_features.h" #include "av1/encoder/tokenize.h" +#include "av1/encoder/block.h" #if CONFIG_INTERNAL_STATS #include "aom_dsp/ssim.h" @@ -59,36 +64,33 @@ typedef struct { FRAME_CONTEXT fc; } CODING_CONTEXT; -typedef enum { - // regular inter frame - REGULAR_FRAME = 0, - // alternate reference frame - ARF_FRAME = 1, - // overlay frame - OVERLAY_FRAME = 2, - // golden frame - GLD_FRAME = 3, - // backward reference frame - BRF_FRAME = 4, - // extra alternate reference frame - EXT_ARF_FRAME = 5, +enum { + REGULAR_FRAME, // regular inter frame + ARF_FRAME, // alternate reference frame + OVERLAY_FRAME, // overlay frame + GLD_FRAME, // golden frame + BRF_FRAME, // backward reference frame + INTERNAL_ARF_FRAME, // internal alternate reference frame FRAME_CONTEXT_INDEXES -} FRAME_CONTEXT_INDEX; +} UENUM1BYTE(FRAME_CONTEXT_INDEX); -typedef enum { +enum { NORMAL = 0, FOURFIVE = 1, THREEFIVE = 2, ONETWO = 3 -} AOM_SCALING; +} 
UENUM1BYTE(AOM_SCALING); -typedef enum { +enum { // Good Quality Fast Encoding. The encoder balances quality with the amount of // time it takes to encode the output. Speed setting controls how fast. - GOOD -} MODE; + GOOD, + // Realtime Fast Encoding. Will force some restrictions on bitrate + // constraints. + REALTIME +} UENUM1BYTE(MODE); -typedef enum { +enum { FRAMEFLAGS_KEY = 1 << 0, FRAMEFLAGS_GOLDEN = 1 << 1, FRAMEFLAGS_BWDREF = 1 << 2, @@ -97,46 +99,62 @@ typedef enum { FRAMEFLAGS_INTRAONLY = 1 << 4, FRAMEFLAGS_SWITCH = 1 << 5, FRAMEFLAGS_ERROR_RESILIENT = 1 << 6, -} FRAMETYPE_FLAGS; +} UENUM1BYTE(FRAMETYPE_FLAGS); -typedef enum { +enum { NO_AQ = 0, VARIANCE_AQ = 1, COMPLEXITY_AQ = 2, CYCLIC_REFRESH_AQ = 3, AQ_MODE_COUNT // This should always be the last member of the enum -} AQ_MODE; -typedef enum { +} UENUM1BYTE(AQ_MODE); +enum { NO_DELTA_Q = 0, DELTA_Q_ONLY = 1, DELTA_Q_LF = 2, DELTAQ_MODE_COUNT // This should always be the last member of the enum -} DELTAQ_MODE; +} UENUM1BYTE(DELTAQ_MODE); -typedef enum { +enum { RESIZE_NONE = 0, // No frame resizing allowed. RESIZE_FIXED = 1, // All frames are coded at the specified scale. RESIZE_RANDOM = 2, // All frames are coded at a random scale. RESIZE_MODES -} RESIZE_MODE; +} UENUM1BYTE(RESIZE_MODE); + +enum { + SUPERRES_NONE, // No frame superres allowed. + SUPERRES_FIXED, // All frames are coded at the specified scale, + // and super-resolved. + SUPERRES_RANDOM, // All frames are coded at a random scale, + // and super-resolved. + SUPERRES_QTHRESH, // Superres scale for a frame is determined based on + // q_index. + SUPERRES_AUTO, // Automatically select superres for appropriate frames. + SUPERRES_MODES +} UENUM1BYTE(SUPERRES_MODE); typedef enum { - SUPERRES_NONE = 0, // No frame superres allowed - SUPERRES_FIXED = 1, // All frames are coded at the specified scale, - // and super-resolved. - SUPERRES_RANDOM = 2, // All frames are coded at a random scale, - // and super-resolved. 
- SUPERRES_QTHRESH = 3, // Superres scale for a frame is determined based on - // q_index - SUPERRES_MODES -} SUPERRES_MODE; + kInvalid = 0, + kLowSadLowSumdiff = 1, + kLowSadHighSumdiff = 2, + kHighSadLowSumdiff = 3, + kHighSadHighSumdiff = 4, + kLowVarHighSumdiff = 5, + kVeryHighSad = 6, +} CONTENT_STATE_SB; + +enum { + SS_CFG_SRC = 0, + SS_CFG_LOOKAHEAD = 1, + SS_CFG_TOTAL = 2 +} UENUM1BYTE(SS_CFG_OFFSET); typedef struct TplDepStats { int64_t intra_cost; int64_t inter_cost; int64_t mc_flow; int64_t mc_dep_cost; - int64_t mc_ref_cost; int ref_frame_index; int_mv mv; @@ -153,6 +171,12 @@ typedef struct TplDepFrame { int base_qindex; } TplDepFrame; +typedef enum { + COST_UPD_SB, + COST_UPD_SBROW, + COST_UPD_TILE, +} COST_UPDATE_TYPE; + #define TPL_DEP_COST_SCALE_LOG2 4 typedef struct AV1EncoderConfig { @@ -215,6 +239,7 @@ typedef struct AV1EncoderConfig { DELTAQ_MODE deltaq_mode; int enable_cdef; int enable_restoration; + int enable_obmc; int disable_trellis_quant; int using_qm; int qm_y; @@ -274,6 +299,7 @@ typedef struct AV1EncoderConfig { int min_gf_interval; int max_gf_interval; + int gf_max_pyr_height; int row_mt; int tile_columns; @@ -288,11 +314,6 @@ typedef struct AV1EncoderConfig { int max_threads; aom_fixed_buf_t two_pass_stats_in; - struct aom_codec_pkt_list *output_pkt_list; - -#if CONFIG_FP_MB_STATS - aom_fixed_buf_t firstpass_mb_stats_in; -#endif aom_tune_metric tuning; aom_tune_content content; @@ -304,15 +325,12 @@ typedef struct AV1EncoderConfig { int color_range; int render_width; int render_height; - aom_timing_info_type_t timing_info_type; int timing_info_present; aom_timing_info_t timing_info; int decoder_model_info_present_flag; int display_model_info_present_flag; int buffer_removal_time_present; aom_dec_model_info_t buffer_model; - aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1]; - aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1]; int film_grain_test_vector; const char *film_grain_table_filename; @@ 
-320,18 +338,44 @@ typedef struct AV1EncoderConfig { aom_superblock_size_t superblock_size; unsigned int large_scale_tile; unsigned int single_tile_decoding; - int monochrome; + uint8_t monochrome; unsigned int full_still_picture_hdr; int enable_dual_filter; unsigned int motion_vector_unit_test; const cfg_options_t *cfg; + int enable_rect_partitions; + int enable_ab_partitions; + int enable_1to4_partitions; + int min_partition_size; + int max_partition_size; + int enable_intra_edge_filter; + int enable_tx64; + int tx_size_search_method; + int enable_flip_idtx; int enable_order_hint; - int enable_jnt_comp; + int enable_dist_wtd_comp; int enable_ref_frame_mvs; + unsigned int max_reference_frames; + int enable_reduced_reference_set; unsigned int allow_ref_frame_mvs; + int enable_masked_comp; + int enable_onesided_comp; + int enable_interintra_comp; + int enable_smooth_interintra; + int enable_diff_wtd_comp; + int enable_interinter_wedge; + int enable_interintra_wedge; + int enable_global_motion; int enable_warped_motion; int allow_warped_motion; + int enable_filter_intra; + int enable_smooth_intra; + int enable_paeth_intra; + int enable_cfl_intra; int enable_superres; + int enable_palette; + int enable_intrabc; + int enable_angle_delta; unsigned int save_as_annexb; #if CONFIG_DENOISE @@ -341,6 +385,18 @@ typedef struct AV1EncoderConfig { unsigned int chroma_subsampling_x; unsigned int chroma_subsampling_y; + int reduced_tx_type_set; + int use_intra_dct_only; + int use_inter_dct_only; + int use_intra_default_tx_only; + int quant_b_adapt; + COST_UPDATE_TYPE coeff_cost_upd_freq; + COST_UPDATE_TYPE mode_cost_upd_freq; + int border_in_pixels; + AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; + // Bit mask to specify which tier each of the 32 possible operating points + // conforms to. 
+ unsigned int tier_mask; } AV1EncoderConfig; static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) { @@ -397,7 +453,7 @@ typedef struct FRAME_COUNTS { unsigned int interintra[BLOCK_SIZE_GROUPS][2]; unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES]; unsigned int wedge_interintra[BLOCK_SIZES_ALL][2]; - unsigned int compound_type[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1]; + unsigned int compound_type[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES]; unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES]; unsigned int obmc[BLOCK_SIZES_ALL][2]; unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; @@ -433,7 +489,6 @@ typedef struct FRAME_COUNTS { [SWITCHABLE_FILTERS]; } FRAME_COUNTS; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS #define INTER_MODE_RD_DATA_OVERALL_SIZE 6400 typedef struct { @@ -467,8 +522,12 @@ typedef struct inter_modes_info { int64_t sse_arr[MAX_INTER_MODES]; int64_t est_rd_arr[MAX_INTER_MODES]; RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES]; + bool true_rd_arr[MAX_INTER_MODES]; + uint8_t blk_skip_arr[MAX_INTER_MODES][MAX_MIB_SIZE * MAX_MIB_SIZE]; + RD_STATS rd_cost_arr[MAX_INTER_MODES]; + RD_STATS rd_cost_y_arr[MAX_INTER_MODES]; + RD_STATS rd_cost_uv_arr[MAX_INTER_MODES]; } InterModesInfo; -#endif // Encoder row synchronization typedef struct AV1RowMTSyncData { @@ -491,16 +550,13 @@ typedef struct AV1RowMTInfo { typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES]; - int mode_map[BLOCK_SIZES_ALL][MAX_MODES]; int m_search_count; int ex_search_count; CFL_CTX cfl; DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx); - DECLARE_ALIGNED(16, FRAME_CONTEXT, backup_tctx); + FRAME_CONTEXT *row_ctx; uint8_t allow_update_cdf; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL]; -#endif AV1RowMTSync row_mt_sync; AV1RowMTInfo row_mt_info; } TileDataEnc; @@ -535,9 +591,7 @@ typedef struct ThreadData { tran_low_t *tree_coeff_buf[MAX_MB_PLANE]; tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE]; 
tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE]; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS InterModesInfo *inter_modes_info; -#endif uint32_t *hash_value_buffer[2][2]; int32_t *wsrc_buf; int32_t *mask_buf; @@ -560,13 +614,13 @@ typedef struct ActiveMap { #if CONFIG_INTERNAL_STATS // types of stats -typedef enum { +enum { STAT_Y, STAT_U, STAT_V, STAT_ALL, NUM_STAT_TYPES // This should always be the last member of the enum -} StatType; +} UENUM1BYTE(StatType); typedef struct IMAGE_STAT { double stat[NUM_STAT_TYPES]; @@ -579,10 +633,83 @@ typedef struct { YV12_BUFFER_CONFIG buf; } EncRefCntBuffer; -typedef struct TileBufferEnc { - uint8_t *data; - size_t size; -} TileBufferEnc; +#if CONFIG_COLLECT_PARTITION_STATS == 2 +typedef struct PartitionStats { + int partition_decisions[6][EXT_PARTITION_TYPES]; + int partition_attempts[6][EXT_PARTITION_TYPES]; + int64_t partition_times[6][EXT_PARTITION_TYPES]; + + int partition_redo; +} PartitionStats; +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING +#include "aom_ports/aom_timer.h" +// Adjust the following to add new components. 
+enum { + encode_frame_to_data_rate_time, + encode_with_recode_loop_time, + loop_filter_time, + cdef_time, + loop_restoration_time, + av1_pack_bitstream_final_time, + av1_encode_frame_time, + av1_compute_global_motion_time, + av1_setup_motion_field_time, + encode_sb_time, + first_partition_search_pass_time, + rd_pick_partition_time, + rd_pick_sb_modes_time, + av1_rd_pick_intra_mode_sb_time, + av1_rd_pick_inter_mode_sb_time, + handle_intra_mode_time, + handle_inter_mode_time, + do_tx_search_time, + handle_newmv_time, + compound_type_rd_time, + interpolation_filter_search_time, + motion_mode_rd_time, + kTimingComponents, +} UENUM1BYTE(TIMING_COMPONENT); + +static INLINE char const *get_component_name(int index) { + switch (index) { + case encode_frame_to_data_rate_time: + return "encode_frame_to_data_rate_time"; + case encode_with_recode_loop_time: return "encode_with_recode_loop_time"; + case loop_filter_time: return "loop_filter_time"; + case cdef_time: return "cdef_time"; + case loop_restoration_time: return "loop_restoration_time"; + case av1_pack_bitstream_final_time: return "av1_pack_bitstream_final_time"; + case av1_encode_frame_time: return "av1_encode_frame_time"; + case av1_compute_global_motion_time: + return "av1_compute_global_motion_time"; + case av1_setup_motion_field_time: return "av1_setup_motion_field_time"; + case encode_sb_time: return "encode_sb_time"; + case first_partition_search_pass_time: + return "first_partition_search_pass_time"; + case rd_pick_partition_time: return "rd_pick_partition_time"; + case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time"; + case av1_rd_pick_intra_mode_sb_time: + return "av1_rd_pick_intra_mode_sb_time"; + case av1_rd_pick_inter_mode_sb_time: + return "av1_rd_pick_inter_mode_sb_time"; + case handle_intra_mode_time: return "handle_intra_mode_time"; + case handle_inter_mode_time: return "handle_inter_mode_time"; + case do_tx_search_time: return "do_tx_search_time"; + case handle_newmv_time: return 
"handle_newmv_time"; + case compound_type_rd_time: return "compound_type_rd_time"; + case interpolation_filter_search_time: + return "interpolation_filter_search_time"; + case motion_mode_rd_time: return "motion_mode_rd_time"; + default: assert(0); + } + return "error"; +} +#endif + +// The maximum number of internal ARFs except ALTREF_FRAME +#define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1) typedef struct AV1_COMP { QUANTS quants; @@ -597,7 +724,6 @@ typedef struct AV1_COMP { struct lookahead_entry *alt_ref_source; int no_show_kf; - int optimize_speed_feature; int optimize_seg_arr[MAX_SEGMENTS]; YV12_BUFFER_CONFIG *source; @@ -612,37 +738,20 @@ typedef struct AV1_COMP { // For a still frame, this flag is set to 1 to skip partition search. int partition_search_skippable_frame; + // The following item corresponds to two_pass_partition_search speed features. + int two_pass_partition_search; + double csm_rate_array[32]; double m_rate_array[32]; int rate_size; int rate_index; hash_table *previous_hash_table; int previous_index; - int cur_poc; // DebugInfo unsigned int row_mt; - int scaled_ref_idx[INTER_REFS_PER_FRAME]; - - // For encoder, we have a two-level mapping from reference frame type to the - // corresponding buffer in the buffer pool: - // * 'remapped_ref_idx[i - 1]' maps reference type ‘i’ (range: LAST_FRAME ... - // EXTREF_FRAME) to a remapped index ‘j’ (in range: 0 ... REF_FRAMES - 1) - // * Later, 'cm->ref_frame_map[j]' maps the remapped index ‘j’ to actual index - // of the buffer in the buffer pool ‘cm->buffer_pool.frame_bufs’. - // - // LAST_FRAME, ..., EXTREF_FRAME - // | | - // v v - // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1] - // | | - // v v - // ref_frame_map[], ..., ref_frame_map[] - // - // Note: INTRA_FRAME always refers to the current frame, so there's no need to - // have a remapped index for the same. 
- int remapped_ref_idx[REF_FRAMES]; + RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME]; - int last_show_frame_buf_idx; // last show frame buffer index + RefCntBuffer *last_show_frame_buf; // last show frame buffer // refresh_*_frame are boolean flags. If 'refresh_xyz_frame' is true, then // after the current frame is encoded, the XYZ reference frame gets refreshed @@ -661,14 +770,11 @@ typedef struct AV1_COMP { int refresh_alt2_ref_frame; int refresh_alt_ref_frame; -#if USE_SYMM_MULTI_LAYER - // When true, a new rule for backward (future) reference frames is in effect: - // - BWDREF_FRAME is always the closest future frame available - // - ALTREF2_FRAME is always the 2nd closest future frame available - // - 'refresh_bwd_ref_frame' flag is used for updating both the BWDREF_FRAME - // and ALTREF2_FRAME. ('refresh_alt2_ref_frame' flag is irrelevant). - int new_bwdref_update_rule; -#endif + // For each type of reference frame, this contains the index of a reference + // frame buffer for a reference frame of the same type. We use this to + // choose our primary reference frame (which is the most recent reference + // frame of the same type as the current frame). + int fb_of_context_type[REF_FRAMES]; int ext_refresh_frame_flags_pending; int ext_refresh_last_frame; @@ -707,12 +813,6 @@ typedef struct AV1_COMP { RATE_CONTROL rc; double framerate; - // Relevant for an inter frame. - // - Index '0' corresponds to the values for the currently coded frame. - // - Indices LAST_FRAME ... EXTREF_FRAMES are used to store values for all the - // possible inter reference frames. 
- int interp_filter_selected[REF_FRAMES + 1][SWITCHABLE]; - struct aom_codec_pkt_list *output_pkt_list; MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS]; @@ -721,12 +821,14 @@ typedef struct AV1_COMP { int ref_frame_flags; int ext_ref_frame_flags; + // speed is passed as a per-frame parameter into the encoder + int speed; + // sf contains fine-grained config set internally based on speed SPEED_FEATURES sf; unsigned int max_mv_magnitude; int mv_step_param; - int allow_comp_inter_inter; int all_one_sided_refs; uint8_t *segmentation_map; @@ -737,13 +839,10 @@ typedef struct AV1_COMP { fractional_mv_step_fp *find_fractional_mv_step; av1_diamond_search_fn_t diamond_search_sad; aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL]; + +#if CONFIG_INTERNAL_STATS uint64_t time_receive_data; uint64_t time_compress_data; - uint64_t time_pick_lpf; - uint64_t time_encode_sb_row; - -#if CONFIG_FP_MB_STATS - int use_fp_mb_stats; #endif TWO_PASS twopass; @@ -779,6 +878,9 @@ typedef struct AV1_COMP { Metrics metrics; #endif int b_calculate_psnr; +#if CONFIG_SPEED_STATS + unsigned int tx_search_count; +#endif // CONFIG_SPEED_STATS int droppable; @@ -796,23 +898,21 @@ typedef struct AV1_COMP { int resize_pending_width; int resize_pending_height; - int frame_flags; - - search_site_config ss_cfg; + // ss_cfg[SS_CFG_LOOKAHEAD] : used in following cases + // -> temporal filtering + // -> intrabc + // ss_cfg[SS_CFG_SRC] : used everywhere except above mentioned cases + search_site_config ss_cfg[SS_CFG_TOTAL]; TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS]; - unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS]; TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS]; - TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS]; - int resize_state; int resize_avg_qp; int resize_buffer_underflow; - int resize_count; // Sequence parameters have been transmitted already and locked // or not. 
Once locked av1_change_config cannot change the seq @@ -822,19 +922,24 @@ typedef struct AV1_COMP { // VARIANCE_AQ segment map refresh int vaq_refresh; + // VAR_BASED_PARTITION thresholds + // 0 - threshold_128x128; 1 - threshold_64x64; + // 2 - threshold_32x32; 3 - threshold_16x16; + // 4 - vbp_threshold_8x8; + int64_t vbp_thresholds[5]; + int64_t vbp_threshold_minmax; + int64_t vbp_threshold_sad; + int64_t vbp_threshold_copy; + BLOCK_SIZE vbp_bsize_min; + // Multi-threading int num_workers; AVxWorker *workers; struct EncWorkerData *tile_thr_data; - int refresh_frame_mask; int existing_fb_idx_to_show; - int is_arf_filter_off[MAX_EXT_ARFS + 1]; - int num_extra_arfs; - int arf_pos_in_gf[MAX_EXT_ARFS + 1]; - int arf_pos_for_ovrly[MAX_EXT_ARFS + 1]; + int is_arf_filter_off[MAX_INTERNAL_ARFS + 1]; int global_motion_search_done; - tran_low_t *tcoeff_buf[MAX_MB_PLANE]; - int extra_arf_allowed; + int internal_altref_allowed; // A flag to indicate if intrabc is ever used in current frame. int intrabc_used; int dv_cost[2][MV_VALS]; @@ -842,10 +947,16 @@ typedef struct AV1_COMP { int dv_joint_cost[MV_JOINTS]; int has_lossless_segment; - // For frame refs short signaling: - // A mapping of each reference frame from its encoder side value to the - // decoder side value obtained following the short signaling procedure. - int ref_conv[REF_FRAMES]; + // Factors to control gating of compound type selection based on best + // approximate rd so far + int max_comp_type_rd_threshold_mul; + int max_comp_type_rd_threshold_div; + + unsigned int tx_domain_dist_threshold; + + // Factor to control R-D optimization of coeffs based on block + // mse. 
+ unsigned int coeff_opt_dist_threshold; AV1LfSync lf_row_sync; AV1LrSync lr_row_sync; @@ -865,8 +976,72 @@ typedef struct AV1_COMP { #if CONFIG_MULTITHREAD pthread_mutex_t *row_mt_mutex_; #endif + // Set if screen content is set or relevant tools are enabled + int is_screen_content_type; +#if CONFIG_COLLECT_PARTITION_STATS == 2 + PartitionStats partition_stats; +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING + // component_time[] are initialized to zero while encoder starts. + uint64_t component_time[kTimingComponents]; + struct aom_usec_timer component_timer[kTimingComponents]; + // frame_component_time[] are initialized to zero at beginning of each frame. + uint64_t frame_component_time[kTimingComponents]; +#endif + + // The following data are for AV1 bitstream levels. + AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS]; + int keep_level_stats; + AV1LevelInfo level_info[MAX_NUM_OPERATING_POINTS]; + // Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation. + int frame_header_count; + FrameWindowBuffer frame_window_buffer; } AV1_COMP; +typedef struct { + YV12_BUFFER_CONFIG *source; + YV12_BUFFER_CONFIG *last_source; + int64_t ts_duration; +} EncodeFrameInput; + +// EncodeFrameParams contains per-frame encoding parameters decided upon by +// av1_encode_strategy() and passed down to av1_encode() +struct EncodeFrameParams { + int error_resilient_mode; + FRAME_TYPE frame_type; + int primary_ref_frame; + int order_offset; + int show_frame; + int refresh_frame_flags; + + int show_existing_frame; + int existing_fb_idx_to_show; + + // Bitmask of which reference buffers may be referenced by this frame + int ref_frame_flags; + + // Reference buffer assignment for this frame. 
+ int remapped_ref_idx[REF_FRAMES]; + + // Flags which determine which reference buffers are refreshed by this frame + int refresh_last_frame; + int refresh_golden_frame; + int refresh_bwd_ref_frame; + int refresh_alt2_ref_frame; + int refresh_alt_ref_frame; + + // Speed level to use for this frame: Bigger number means faster. + int speed; +}; +typedef struct EncodeFrameParams EncodeFrameParams; + +// EncodeFrameResults contains information about the result of encoding a +// single frame +typedef struct { + size_t size; // Size of resulting bitstream +} EncodeFrameResults; + // Must not be called more than once. void av1_initialize_enc(void); @@ -887,6 +1062,11 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags, int64_t *time_end, int flush, const aom_rational_t *timebase); +int av1_encode(AV1_COMP *const cpi, uint8_t *const dest, + const EncodeFrameInput *const frame_input, + const EncodeFrameParams *const frame_params, + EncodeFrameResults *const frame_results); + int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest); int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame); @@ -897,12 +1077,12 @@ aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm, int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags); -void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags); - int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd); int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd); +void av1_set_frame_size(AV1_COMP *cpi, int width, int height); + int av1_update_entropy(AV1_COMP *cpi, int update); int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols); @@ -916,26 +1096,23 @@ int av1_get_quantizer(struct AV1_COMP *cpi); int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size); -int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n); -int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n); +// 
av1 uses 10,000,000 ticks/second as time stamp +#define TICKS_PER_SEC 10000000LL -static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { - return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame || - (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); +static INLINE int64_t timebase_units_to_ticks(const aom_rational_t *timebase, + int64_t n) { + return n * TICKS_PER_SEC * timebase->num / timebase->den; } -static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi, - MV_REFERENCE_FRAME ref_frame) { - return (ref_frame >= LAST_FRAME) - ? cpi->remapped_ref_idx[ref_frame - LAST_FRAME] - : INVALID_IDX; +static INLINE int64_t ticks_to_timebase_units(const aom_rational_t *timebase, + int64_t n) { + const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1; + return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; } -static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi, - MV_REFERENCE_FRAME ref_frame) { - const AV1_COMMON *const cm = &cpi->common; - const int map_idx = get_ref_frame_map_idx(cpi, ref_frame); - return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX; +static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) { + return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref); } // TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD. @@ -944,33 +1121,37 @@ static INLINE int av1_use_hash_me(const AV1_COMMON *const cm) { } static INLINE hash_table *av1_get_ref_frame_hash_map( - const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { - const AV1_COMMON *const cm = &cpi->common; - const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); - return buf_idx != INVALID_IDX - ? &cm->buffer_pool->frame_bufs[buf_idx].hash_table - : NULL; + const AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame) { + const int map_idx = get_ref_frame_map_idx(cm, ref_frame); + RefCntBuffer *buf = + (map_idx != INVALID_IDX) ? 
cm->ref_frame_map[map_idx] : NULL; + return buf ? &buf->hash_table : NULL; } -static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( - const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { - const AV1_COMMON *const cm = &cpi->common; - const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); - return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf - : NULL; +static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf( + const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) { + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + return buf != NULL ? &buf->buf : NULL; } -static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) { - AV1_COMMON *const cm = &cpi->common; +static INLINE int enc_is_ref_frame_buf(const AV1_COMMON *const cm, + const RefCntBuffer *const frame_buf) { MV_REFERENCE_FRAME ref_frame; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); - if (buf_idx == INVALID_IDX) continue; - if (frame_buf == &cm->buffer_pool->frame_bufs[buf_idx]) break; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); + if (buf == NULL) continue; + if (frame_buf == buf) break; } return (ref_frame <= ALTREF_FRAME); } +static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) { + assert(buf != NULL); + ensure_mv_buffer(buf, cm); + buf->width = cm->width; + buf->height = cm->height; +} + // Token buffer is only used for palette tokens. static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols, int sb_size_log2, @@ -1026,10 +1207,10 @@ static INLINE int is_altref_enabled(const AV1_COMP *const cpi) { static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd, MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) { - xd->block_refs[0] = - &cm->current_frame.frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0]; - xd->block_refs[1] = - &cm->current_frame.frame_refs[ref1 >= LAST_FRAME ? 
ref1 - LAST_FRAME : 0]; + xd->block_ref_scale_factors[0] = + get_ref_scale_factors_const(cm, ref0 >= LAST_FRAME ? ref0 : 1); + xd->block_ref_scale_factors[1] = + get_ref_scale_factors_const(cm, ref1 >= LAST_FRAME ? ref1 : 1); } static INLINE int get_chessboard_index(int frame_index) { @@ -1042,6 +1223,8 @@ static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) { void av1_new_framerate(AV1_COMP *cpi, double framerate); +void av1_setup_frame_size(AV1_COMP *cpi); + #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) // Returns 1 if a frame is scaled and 0 otherwise. @@ -1062,6 +1245,48 @@ static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) { cm->current_frame.frame_type == KEY_FRAME); } +// Lighter version of set_offsets that only sets the mode info +// pointers. +static INLINE void set_mode_info_offsets(const AV1_COMP *const cpi, + MACROBLOCK *const x, + MACROBLOCKD *const xd, int mi_row, + int mi_col) { + const AV1_COMMON *const cm = &cpi->common; + const int idx_str = xd->mi_stride * mi_row + mi_col; + xd->mi = cm->mi_grid_visible + idx_str; + xd->mi[0] = cm->mi + idx_str; + x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); +} + +// Check to see if the given partition size is allowed for a specified number +// of mi block rows and columns remaining in the image. 
+// If not then return the largest allowed partition size +static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left, + int cols_left, int *bh, int *bw) { + int int_size = (int)bsize; + if (rows_left <= 0 || cols_left <= 0) { + return AOMMIN(bsize, BLOCK_8X8); + } else { + for (; int_size > 0; int_size -= 3) { + *bh = mi_size_high[int_size]; + *bw = mi_size_wide[int_size]; + if ((*bh <= rows_left) && (*bw <= cols_left)) { + break; + } + } + } + return (BLOCK_SIZE)int_size; +} + +static const uint8_t av1_ref_frame_flag_list[REF_FRAMES] = { 0, + AOM_LAST_FLAG, + AOM_LAST2_FLAG, + AOM_LAST3_FLAG, + AOM_GOLD_FLAG, + AOM_BWD_FLAG, + AOM_ALT2_FLAG, + AOM_ALT_FLAG }; + // Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon // failure. When a non-NULL aom_fixed_buf_t pointer is returned by this // function, the memory must be freed by the caller. Both the buf member of the @@ -1073,6 +1298,80 @@ static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) { // field. 
aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi); +#if CONFIG_COLLECT_PARTITION_STATS == 2 +static INLINE void av1_print_partition_stats(PartitionStats *part_stats) { + FILE *f = fopen("partition_stats.csv", "w"); + if (!f) { + return; + } + + fprintf(f, "bsize,redo,"); + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "decision_%d,", part); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "attempt_%d,", part); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "time_%d,", part); + } + fprintf(f, "\n"); + + const int bsizes[6] = { 128, 64, 32, 16, 8, 4 }; + + for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) { + fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo); + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]); + } + for (int part = 0; part < EXT_PARTITION_TYPES; part++) { + fprintf(f, "%ld,", part_stats->partition_times[bsize_idx][part]); + } + fprintf(f, "\n"); + } + fclose(f); +} + +static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) { + assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 || + bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 || + bsize == BLOCK_4X4); + switch (bsize) { + case BLOCK_128X128: return 0; + case BLOCK_64X64: return 1; + case BLOCK_32X32: return 2; + case BLOCK_16X16: return 3; + case BLOCK_8X8: return 4; + case BLOCK_4X4: return 5; + default: assert(0 && "Invalid bsize for partition_stats."); return -1; + } +} +#endif + +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(AV1_COMP *cpi, int component) { + aom_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(AV1_COMP *cpi, int component) { + 
aom_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + aom_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; + case 2: return "INTRA_ONLY_FRAME"; + case 3: return "S_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/libaom/av1/encoder/encodetxb.c b/libaom/av1/encoder/encodetxb.c index a0c6ec1..37f4bb9 100644 --- a/libaom/av1/encoder/encodetxb.c +++ b/libaom/av1/encoder/encodetxb.c @@ -76,21 +76,12 @@ void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); } void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x, int mi_row, int mi_col) { const AV1_COMMON *const cm = &cpi->common; - const int num_planes = av1_num_planes(cm); int mib_size_log2 = cm->seq_params.mib_size_log2; int stride = (cm->mi_cols >> mib_size_log2) + 1; int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2); - CB_COEFF_BUFFER *coeff_buf = &cpi->coeff_buffer_base[offset]; - const int txb_offset = x->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + x->mbmi_ext->cb_coef_buff = &cpi->coeff_buffer_base[offset]; + x->mbmi_ext->cb_offset = x->cb_offset; assert(x->cb_offset < (1 << num_pels_log2_lookup[cm->seq_params.sb_size])); - for (int plane = 0; plane < num_planes; ++plane) { - x->mbmi_ext->tcoeff[plane] = coeff_buf->tcoeff[plane] + x->cb_offset; - x->mbmi_ext->eobs[plane] = coeff_buf->eobs[plane] + txb_offset; - x->mbmi_ext->txb_skip_ctx[plane] = - coeff_buf->txb_skip_ctx[plane] + txb_offset; - x->mbmi_ext->dc_sign_ctx[plane] = - coeff_buf->dc_sign_ctx[plane] + txb_offset; - } } static void write_golomb(aom_writer *w, int level) { @@ -284,20 +275,16 @@ static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx, return av1_cost_literal(1); } -static INLINE int get_br_cost(tran_low_t 
abs_qc, int ctx, - const int *coeff_lps) { - const tran_low_t min_level = 1 + NUM_BASE_LEVELS; - const tran_low_t max_level = 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE; - (void)ctx; - if (abs_qc >= min_level) { - if (abs_qc >= max_level) { - return coeff_lps[COEFF_BASE_RANGE]; // COEFF_BASE_RANGE * cost0; - } else { - return coeff_lps[(abs_qc - min_level)]; // * cost0 + cost1; - } - } - return 0; -} +static const int golomb_bits_cost[32] = { + 0, 512, 512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5, + 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, + 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, + 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9 +}; +static const int golomb_cost_diff[32] = { + 0, 512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0, + 512 * 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; static INLINE int get_golomb_cost(int abs_qc) { if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) { @@ -308,6 +295,32 @@ static INLINE int get_golomb_cost(int abs_qc) { return 0; } +static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps, + int *diff) { + const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + int golomb_bits = 0; + if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) + *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1]; + + if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) { + int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS; + if (r < 32) { + golomb_bits = golomb_bits_cost[r]; + *diff += golomb_cost_diff[r]; + } else { + golomb_bits = get_golomb_cost(level); + *diff += (r & (r - 1)) == 0 ? 
1024 : 0; + } + } + + return coeff_lps[base_range] + golomb_bits; +} + +static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) { + const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); + return coeff_lps[base_range] + get_golomb_cost(level); +} + static int get_coeff_cost(const tran_low_t qc, const int scan_idx, const int is_eob, const TxbInfo *const txb_info, const LV_MAP_COEFF_COST *const txb_costs, @@ -331,8 +344,7 @@ static int get_coeff_cost(const tran_low_t qc, const int scan_idx, if (abs_qc > NUM_BASE_LEVELS) { const int ctx = get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class); - cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]); - cost += get_golomb_cost(abs_qc); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[ctx]); } } return cost; @@ -464,8 +476,6 @@ void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width, const int stride = width + TX_PAD_HOR; uint8_t *ls = levels; - memset(levels - TX_PAD_TOP * stride, 0, - sizeof(*levels) * TX_PAD_TOP * stride); memset(levels + stride * height, 0, sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); @@ -554,14 +564,15 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, break; } - if (k_eob_offset_bits[eob_pt] > 0) { + const int eob_offset_bits = k_eob_offset_bits[eob_pt]; + if (eob_offset_bits > 0) { const int eob_ctx = eob_pt - 3; - int eob_shift = k_eob_offset_bits[eob_pt] - 1; + int eob_shift = eob_offset_bits - 1; int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; aom_write_symbol(w, bit, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2); - for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) { - eob_shift = k_eob_offset_bits[eob_pt] - 1 - i; + for (int i = 1; i < eob_offset_bits; i++) { + eob_shift = eob_offset_bits - 1 - i; bit = (eob_extra & (1 << eob_shift)) ? 1 : 0; aom_write_bit(w, bit); } @@ -588,12 +599,11 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd, // level is above 1. 
const int base_range = level - 1 - NUM_BASE_LEVELS; const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class); + aom_cdf_prob *cdf = + ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx]; for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) { const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1); - aom_write_symbol( - w, k, - ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx], - BR_CDF_SIZE); + aom_write_symbol(w, k, cdf, BR_CDF_SIZE); if (k < BR_CDF_SIZE - 1) break; } } @@ -628,10 +638,18 @@ static void write_coeffs_txb_wrap(const AV1_COMMON *cm, MACROBLOCK *x, aom_writer *w, int plane, int block, int blk_row, int blk_col, TX_SIZE tx_size) { MACROBLOCKD *xd = &x->e_mbd; - tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); - uint16_t eob = x->mbmi_ext->eobs[plane][block]; - TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block], - x->mbmi_ext->dc_sign_ctx[plane][block] }; + const int txb_offset = + x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + tran_low_t *tcoeff_txb = + x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset; + uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset; + uint8_t *txb_skip_ctx_txb = + x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset; + int *dc_sign_ctx_txb = + x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset; + tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block); + uint16_t eob = eob_txb[block]; + TXB_CTX txb_ctx = { txb_skip_ctx_txb[block], dc_sign_ctx_txb[block] }; av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, eob, &txb_ctx); } @@ -745,7 +763,8 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb( av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts); - const int(*lps_cost)[COEFF_BASE_RANGE + 1] = coeff_costs->lps_cost; + const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] = + coeff_costs->lps_cost; int c = eob - 1; { const int pos = 
scan[c]; @@ -758,11 +777,8 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb( if (v) { // sign bit cost if (level > NUM_BASE_LEVELS) { - const int ctx = get_br_ctx(levels, pos, bwl, tx_class); - const int base_range = - AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); - cost += lps_cost[ctx][base_range]; - cost += get_golomb_cost(level); + const int ctx = get_br_ctx_eob(pos, bwl, tx_class); + cost += get_br_cost(level, lps_cost[ctx]); } if (c) { cost += av1_cost_literal(1); @@ -774,7 +790,7 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb( } } } - const int(*base_cost)[4] = coeff_costs->base_cost; + const int(*base_cost)[8] = coeff_costs->base_cost; for (c = eob - 2; c >= 1; --c) { const int pos = scan[c]; const int coeff_ctx = coeff_contexts[pos]; @@ -786,10 +802,7 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb( cost += av1_cost_literal(1); if (level > NUM_BASE_LEVELS) { const int ctx = get_br_ctx(levels, pos, bwl, tx_class); - const int base_range = - AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); - cost += lps_cost[ctx][base_range]; - cost += get_golomb_cost(level); + cost += get_br_cost(level, lps_cost[ctx]); } } cost += cost0; @@ -809,10 +822,7 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb( cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01]; if (level > NUM_BASE_LEVELS) { const int ctx = get_br_ctx(levels, pos, bwl, tx_class); - const int base_range = - AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE); - cost += lps_cost[ctx][base_range]; - cost += get_golomb_cost(level); + cost += get_br_cost(level, lps_cost[ctx]); } } } @@ -1284,20 +1294,47 @@ static int hbt_create_hashes(TxbInfo *txb_info, txb_eob_costs, p, block, fast_mode, rate_cost); } -static AOM_FORCE_INLINE int get_coeff_cost_simple( +static AOM_FORCE_INLINE int get_two_coeff_cost_simple( int ci, tran_low_t abs_qc, int coeff_ctx, const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class, - const uint8_t *levels) { + const uint8_t *levels, int 
*cost_low) { // this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0) // and not the last (scan_idx != eob - 1) assert(ci > 0); int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)]; + int diff = 0; + if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4]; if (abs_qc) { cost += av1_cost_literal(1); if (abs_qc > NUM_BASE_LEVELS) { const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class); - cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]); - cost += get_golomb_cost(abs_qc); + int brcost_diff = 0; + cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx], + &brcost_diff); + diff += brcost_diff; + } + } + *cost_low = cost - diff; + + return cost; +} + +static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign, + int coeff_ctx, int dc_sign_ctx, + const LV_MAP_COEFF_COST *txb_costs, + int bwl, TX_CLASS tx_class) { + int cost = 0; + cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1]; + if (abs_qc != 0) { + if (ci == 0) { + cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign]; + } else { + cost += av1_cost_literal(1); + } + if (abs_qc > NUM_BASE_LEVELS) { + int br_ctx; + br_ctx = get_br_ctx_eob(ci, bwl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); } } return cost; @@ -1322,9 +1359,12 @@ static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc, cost += av1_cost_literal(1); } if (abs_qc > NUM_BASE_LEVELS) { - const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class); - cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]); - cost += get_golomb_cost(abs_qc); + int br_ctx; + if (is_last) + br_ctx = get_br_ctx_eob(ci, bwl, tx_class); + else + br_ctx = get_br_ctx(levels, ci, bwl, tx_class); + cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]); } } return cost; @@ -1368,13 +1408,23 @@ static INLINE void update_coeff_general( const int64_t rd = RDCOST(rdmult, rate, dist); tran_low_t qc_low, dqc_low; - 
get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); - const tran_low_t abs_qc_low = abs_qc - 1; - const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift); - const int rate_low = - get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx, - dc_sign_ctx, txb_costs, bwl, tx_class, levels); - const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low); + tran_low_t abs_qc_low; + int64_t dist_low, rd_low; + int rate_low; + if (abs_qc == 1) { + abs_qc_low = qc_low = dqc_low = 0; + dist_low = dist0; + rate_low = txb_costs->base_cost[coeff_ctx][0]; + } else { + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + abs_qc_low = abs_qc - 1; + dist_low = get_coeff_dist(tqc, dqc_low, shift); + rate_low = + get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + } + + rd_low = RDCOST(rdmult, rate_low, dist_low); if (rd_low < rd) { qcoeff[ci] = qc_low; dqcoeff[ci] = dqc_low; @@ -1408,28 +1458,28 @@ static AOM_FORCE_INLINE void update_coeff_simple( *accu_rate += txb_costs->base_cost[coeff_ctx][0]; } else { const tran_low_t abs_qc = abs(qc); - const tran_low_t tqc = tcoeff[ci]; - const tran_low_t dqc = dqcoeff[ci]; - const int rate = get_coeff_cost_simple(ci, abs_qc, coeff_ctx, txb_costs, - bwl, tx_class, levels); - if (abs(dqc) < abs(tqc)) { + const tran_low_t abs_tqc = abs(tcoeff[ci]); + const tran_low_t abs_dqc = abs(dqcoeff[ci]); + int rate_low = 0; + const int rate = get_two_coeff_cost_simple( + ci, abs_qc, coeff_ctx, txb_costs, bwl, tx_class, levels, &rate_low); + if (abs_dqc < abs_tqc) { *accu_rate += rate; return; } - const int64_t dist = get_coeff_dist(tqc, dqc, shift); + + const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift); const int64_t rd = RDCOST(rdmult, rate, dist); - const int sign = (qc < 0) ? 
1 : 0; - tran_low_t qc_low, dqc_low; - get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); const tran_low_t abs_qc_low = abs_qc - 1; - const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift); - const int rate_low = get_coeff_cost_simple( - ci, abs_qc_low, coeff_ctx, txb_costs, bwl, tx_class, levels); + const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift; + const int64_t dist_low = get_coeff_dist(abs_tqc, abs_dqc_low, shift); const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low); + if (rd_low < rd) { - qcoeff[ci] = qc_low; - dqcoeff[ci] = dqc_low; + const int sign = (qc < 0) ? 1 : 0; + qcoeff[ci] = (-sign ^ abs_qc_low) + sign; + dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign; levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX); *accu_rate += rate_low; } else { @@ -1438,6 +1488,36 @@ static AOM_FORCE_INLINE void update_coeff_simple( } } +static INLINE void update_coeff_eob_fast(int *eob, int shift, + const int16_t *dequant_ptr, + const int16_t *scan, + const tran_low_t *coeff_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr) { + // TODO(sarahparker) make this work for aomqm + int eob_out = *eob; + int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7), + dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) }; + + for (int i = *eob - 1; i >= 0; i--) { + const int rc = scan[i]; + const int qcoeff = qcoeff_ptr[rc]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) { + eob_out--; + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } else { + break; + } + } + + *eob = eob_out; +} + static AOM_FORCE_INLINE void update_coeff_eob( int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci, int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height, @@ -1467,40 +1547,42 @@ static AOM_FORCE_INLINE void update_coeff_eob( int64_t rd = RDCOST(rdmult, 
*accu_rate + rate, *accu_dist + dist); tran_low_t qc_low, dqc_low; - get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); - const tran_low_t abs_qc_low = abs_qc - 1; - const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0; - const int rate_low = - get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx, - txb_costs, bwl, tx_class, levels); - const int64_t rd_low = - RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low); + tran_low_t abs_qc_low; + int64_t dist_low, rd_low; + int rate_low; + if (abs_qc == 1) { + abs_qc_low = 0; + dqc_low = qc_low = 0; + dist_low = 0; + rate_low = txb_costs->base_cost[coeff_ctx][0]; + rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist); + } else { + get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low); + abs_qc_low = abs_qc - 1; + dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0; + rate_low = + get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, + dc_sign_ctx, txb_costs, bwl, tx_class, levels); + rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low); + } int lower_level_new_eob = 0; const int new_eob = si + 1; - uint8_t tmp_levels[3]; - for (int ni = 0; ni < *nz_num; ++ni) { - const int last_ci = nz_ci[ni]; - tmp_levels[ni] = levels[get_padded_idx(last_ci, bwl)]; - levels[get_padded_idx(last_ci, bwl)] = 0; - } - - const int coeff_ctx_new_eob = get_lower_levels_ctx_general( - 1, si, bwl, height, levels, ci, tx_size, tx_class); + const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bwl, height, si); const int new_eob_cost = get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class); int rate_coeff_eob = - new_eob_cost + get_coeff_cost_general(1, ci, abs_qc, sign, - coeff_ctx_new_eob, dc_sign_ctx, - txb_costs, bwl, tx_class, levels); + new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob, + dc_sign_ctx, txb_costs, bwl, + tx_class); int64_t dist_new_eob = dist; int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob); if 
(abs_qc_low > 0) { const int rate_coeff_eob_low = - new_eob_cost + - get_coeff_cost_general(1, ci, abs_qc_low, sign, coeff_ctx_new_eob, - dc_sign_ctx, txb_costs, bwl, tx_class, levels); + new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign, + coeff_ctx_new_eob, dc_sign_ctx, + txb_costs, bwl, tx_class); const int64_t dist_new_eob_low = dist_low; const int64_t rd_new_eob_low = RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low); @@ -1522,7 +1604,7 @@ static AOM_FORCE_INLINE void update_coeff_eob( if (sharpness == 0 && rd_new_eob < rd) { for (int ni = 0; ni < *nz_num; ++ni) { int last_ci = nz_ci[ni]; - // levels[get_padded_idx(last_ci, bwl)] = 0; + levels[get_padded_idx(last_ci, bwl)] = 0; qcoeff[last_ci] = 0; dqcoeff[last_ci] = 0; } @@ -1532,10 +1614,6 @@ static AOM_FORCE_INLINE void update_coeff_eob( *accu_dist = dist_new_eob; lower_level = lower_level_new_eob; } else { - for (int ni = 0; ni < *nz_num; ++ni) { - const int last_ci = nz_ci[ni]; - levels[get_padded_idx(last_ci, bwl)] = tmp_levels[ni]; - } *accu_rate += rate; *accu_dist += dist; } @@ -1575,35 +1653,44 @@ static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob, int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, const TXB_CTX *const txb_ctx, int *rate_cost, - int sharpness) { - const AV1_COMMON *cm = &cpi->common; + int sharpness, int fast_mode) { MACROBLOCKD *xd = &x->e_mbd; - const PLANE_TYPE plane_type = get_plane_type(plane); - const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); - const TX_CLASS tx_class = tx_type_to_class[tx_type]; - const MB_MODE_INFO *mbmi = xd->mi[0]; - const struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; + const struct macroblock_plane *p = &x->plane[plane]; + const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); + const int16_t *scan = scan_order->scan; + const int shift = av1_get_tx_scale(tx_size); + int eob = 
p->eobs[block]; + const int16_t *dequant = p->dequant_QTX; tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block); - const int16_t *dequant = p->dequant_QTX; + + if (fast_mode) { + update_coeff_eob_fast(&eob, shift, dequant, scan, tcoeff, qcoeff, dqcoeff); + p->eobs[block] = eob; + if (eob == 0) { + *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size); + return eob; + } + } + + const AV1_COMMON *cm = &cpi->common; + const PLANE_TYPE plane_type = get_plane_type(plane); + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + const TX_CLASS tx_class = tx_type_to_class[tx_type]; + const MB_MODE_INFO *mbmi = xd->mi[0]; const int bwl = get_txb_bwl(tx_size); const int width = get_txb_wide(tx_size); const int height = get_txb_high(tx_size); assert(width == (1 << bwl)); const int is_inter = is_inter_block(mbmi); - const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type); - const int16_t *scan = scan_order->scan; const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type]; const int eob_multi_size = txsize_log2_minus4[tx_size]; const LV_MAP_EOB_COST *txb_eob_costs = &x->eob_costs[eob_multi_size][plane_type]; - const int shift = av1_get_tx_scale(tx_size); - const int64_t rdmult = - ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) + - 2) >> + const int rshift = (sharpness + (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4 ? 7 - mbmi->segment_id @@ -1612,17 +1699,21 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, cpi->oxcf.deltaq_mode > NO_DELTA_Q && x->sb_energy_level < 0 ? 
(3 - x->sb_energy_level) : 0)); + const int64_t rdmult = + (((int64_t)x->rdmult * + (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) + + 2) >> + rshift; uint8_t levels_buf[TX_PAD_2D]; uint8_t *const levels = set_levels(levels_buf, width); - av1_txb_init_levels(qcoeff, width, height, levels); + if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels); // TODO(angirbird): check iqmatrix const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0]; const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1]; - int eob = p->eobs[block]; const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class); int accu_rate = eob_cost; int64_t accu_dist = 0; @@ -1642,11 +1733,10 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, --si; } else { assert(abs_qc == 1); - const int coeff_ctx = get_lower_levels_ctx_general( - 1, si, bwl, height, levels, ci, tx_size, tx_class); - accu_rate += get_coeff_cost_general(1, ci, abs_qc, sign, coeff_ctx, - txb_ctx->dc_sign_ctx, txb_costs, bwl, - tx_class, levels); + const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, si); + accu_rate += + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx, + txb_costs, bwl, tx_class); const tran_low_t tqc = tcoeff[ci]; const tran_low_t dqc = dqcoeff[ci]; const int64_t dist = get_coeff_dist(tqc, dqc, shift); @@ -1657,7 +1747,7 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, #define UPDATE_COEFF_EOB_CASE(tx_class_literal) \ case tx_class_literal: \ - for (; si >= 0 && nz_num <= max_nz_num; --si) { \ + for (; si >= 0 && nz_num <= max_nz_num && !fast_mode; --si) { \ update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \ tx_size, tx_class_literal, bwl, height, \ txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \ @@ -1750,7 +1840,8 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, const int shift = av1_get_tx_scale(tx_size); 
const int64_t rdmult = - ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) + + (((int64_t)x->rdmult * plane_rd_mult[is_inter][plane_type] + << (2 * (xd->bd - 8))) + 2) >> 2; uint8_t levels_buf[TX_PAD_2D]; @@ -1763,10 +1854,9 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, assert(width == (1 << bwl)); const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type); TxbInfo txb_info = { - qcoeff, levels, dqcoeff, tcoeff, dequant, shift, - tx_size, txs_ctx, tx_type, bwl, width, height, - eob, seg_eob, scan_order, txb_ctx, rdmult, &cm->coeff_ctx_table, - iqmatrix, tx_type_cost, + qcoeff, levels, dqcoeff, tcoeff, dequant, shift, tx_size, + txs_ctx, tx_type, bwl, width, height, eob, seg_eob, + scan_order, txb_ctx, rdmult, iqmatrix, tx_type_cost, }; // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls @@ -1918,15 +2008,22 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row, 2); } - x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx; - x->mbmi_ext->eobs[plane][block] = eob; + const int txb_offset = + x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN); + uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset; + uint8_t *txb_skip_ctx_txb = + x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset; + txb_skip_ctx_txb[block] = txb_ctx.txb_skip_ctx; + eob_txb[block] = eob; if (eob == 0) { av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row); return; } - tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block); + tran_low_t *tcoeff_txb = + x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset; + tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block); const int segment_id = mbmi->segment_id; const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size); const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); @@ -2019,7 +2116,9 @@ void 
av1_update_and_record_txb_context(int plane, int block, int blk_row, #endif // CONFIG_ENTROPY_STATS if (allow_update_cdf) update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2); - x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx; + int *dc_sign_ctx_txb = + x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset; + dc_sign_ctx_txb[block] = dc_sign_ctx; } const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob); diff --git a/libaom/av1/encoder/encodetxb.h b/libaom/av1/encoder/encodetxb.h index 4ee41ce..0682590 100644 --- a/libaom/av1/encoder/encodetxb.h +++ b/libaom/av1/encoder/encodetxb.h @@ -42,7 +42,6 @@ typedef struct TxbInfo { const SCAN_ORDER *scan_order; TXB_CTX *txb_ctx; int64_t rdmult; - const LV_MAP_CTX_TABLE *coeff_ctx_table; const qm_val_t *iqmatrix; int tx_type_cost; } TxbInfo; @@ -79,7 +78,7 @@ void hbt_destroy(); int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, TX_TYPE tx_type, const TXB_CTX *const txb_ctx, int *rate_cost, - int sharpness); + int sharpness, int fast_mode); // These numbers are empirically obtained. static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { diff --git a/libaom/av1/encoder/ethread.c b/libaom/av1/encoder/ethread.c index a3fb93e..c8c2107 100644 --- a/libaom/av1/encoder/ethread.c +++ b/libaom/av1/encoder/ethread.c @@ -164,10 +164,7 @@ void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, AV1_COMMON *cm, aom_malloc(sizeof(*row_mt_sync->cur_col) * rows)); // Set up nsync. 
- if (cm->seq_params.mib_size_log2 == 4) - row_mt_sync->sync_range = 2; - else - row_mt_sync->sync_range = 1; + row_mt_sync->sync_range = 1; } // Deallocate row based multi-threading synchronization related mutex and data @@ -239,26 +236,34 @@ static void switch_tile_and_get_next_job(AV1_COMP *const cpi, int *cur_tile_id, int tile_index = tile_row * tile_cols + tile_col; TileDataEnc *this_tile = &cpi->tile_data[tile_index]; AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info; - int num_mis_to_encode = - this_tile->tile_info.mi_row_end - row_mt_info->current_mi_row; - - // Tile to be processed by this thread is selected on the basis of - // availability of jobs: - // 1) If jobs are available, tile to be processed is chosen on the - // basis of minimum number of threads working for that tile. If two or - // more tiles have same number of threads working for them, then the tile - // with maximum number of jobs available will be chosen. - // 2) If no jobs are available, then end_of_frame is reached. 
- if (num_mis_to_encode > 0) { - int num_threads_working = row_mt_info->num_threads_working; - if (num_threads_working < min_num_threads_working) { - min_num_threads_working = num_threads_working; - max_mis_to_encode = 0; - } - if (num_threads_working == min_num_threads_working && - num_mis_to_encode > max_mis_to_encode) { - tile_id = tile_index; - max_mis_to_encode = num_mis_to_encode; + int num_sb_rows_in_tile = + av1_get_sb_rows_in_tile(cm, this_tile->tile_info); + int num_sb_cols_in_tile = + av1_get_sb_cols_in_tile(cm, this_tile->tile_info); + int theoretical_limit_on_threads = + AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile); + int num_threads_working = row_mt_info->num_threads_working; + if (num_threads_working < theoretical_limit_on_threads) { + int num_mis_to_encode = + this_tile->tile_info.mi_row_end - row_mt_info->current_mi_row; + + // Tile to be processed by this thread is selected on the basis of + // availability of jobs: + // 1) If jobs are available, tile to be processed is chosen on the + // basis of minimum number of threads working for that tile. If two or + // more tiles have same number of threads working for them, then the + // tile with maximum number of jobs available will be chosen. + // 2) If no jobs are available, then end_of_frame is reached. 
+ if (num_mis_to_encode > 0) { + if (num_threads_working < min_num_threads_working) { + min_num_threads_working = num_threads_working; + max_mis_to_encode = 0; + } + if (num_threads_working == min_num_threads_working && + num_mis_to_encode > max_mis_to_encode) { + tile_id = tile_index; + max_mis_to_encode = num_mis_to_encode; + } } } } @@ -313,9 +318,14 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) { td->mb.e_mbd.tile_ctx = td->tctx; td->mb.tile_pb_ctx = &this_tile->tctx; - td->mb.backup_tile_ctx = &this_tile->backup_tctx; - if (current_mi_row == this_tile->tile_info.mi_row_start) + if (this_tile->allow_update_cdf) { + td->mb.row_ctx = this_tile->row_ctx; + if (current_mi_row == this_tile->tile_info.mi_row_start) + memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT)); + } else { memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT)); + } + av1_init_above_context(cm, &td->mb.e_mbd, tile_row); // Disable exhaustive search speed features for row based multi-threading of @@ -356,10 +366,8 @@ static int enc_worker_hook(void *arg1, void *unused) { TileDataEnc *const this_tile = &cpi->tile_data[tile_row * cm->tile_cols + tile_col]; - thread_data->td->tctx = &this_tile->tctx; - thread_data->td->mb.e_mbd.tile_ctx = thread_data->td->tctx; - thread_data->td->mb.tile_pb_ctx = thread_data->td->tctx; - thread_data->td->mb.backup_tile_ctx = &this_tile->backup_tctx; + thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx; + thread_data->td->mb.tile_pb_ctx = &this_tile->tctx; av1_encode_tile(cpi, thread_data->td, tile_row, tile_col); } @@ -386,7 +394,7 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) { } #endif - for (int i = 0; i < num_workers; i++) { + for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &cpi->workers[i]; EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; @@ -397,7 +405,7 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) { thread_data->cpi = cpi; 
thread_data->thread_id = i; - if (i < num_workers - 1) { + if (i > 0) { // Allocate thread data. CHECK_MEM_ERROR(cm, thread_data->td, aom_memalign(32, sizeof(*thread_data->td))); @@ -421,11 +429,9 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) { (int32_t *)aom_memalign( 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf))); -#if CONFIG_COLLECT_INTER_MODE_RD_STATS CHECK_MEM_ERROR(cm, thread_data->td->inter_modes_info, (InterModesInfo *)aom_malloc( sizeof(*thread_data->td->inter_modes_info))); -#endif for (int x = 0; x < 2; x++) for (int y = 0; y < 2; y++) @@ -478,14 +484,14 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) { static void launch_enc_workers(AV1_COMP *cpi, int num_workers) { const AVxWorkerInterface *const winterface = aom_get_worker_interface(); // Encode a frame - for (int i = 0; i < num_workers; i++) { + for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &cpi->workers[i]; EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; // Set the starting tile for each thread. thread_data->start = i; - if (i == cpi->num_workers - 1) + if (i == 0) winterface->execute(worker); else winterface->launch(worker); @@ -497,7 +503,7 @@ static void sync_enc_workers(AV1_COMP *cpi, int num_workers) { int had_error = 0; // Encoding ends. - for (int i = 0; i < num_workers; i++) { + for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &cpi->workers[i]; had_error |= !winterface->sync(worker); } @@ -508,22 +514,25 @@ static void sync_enc_workers(AV1_COMP *cpi, int num_workers) { } static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) { - for (int i = 0; i < num_workers; i++) { + for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &cpi->workers[i]; EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; cpi->intrabc_used |= thread_data->td->intrabc_used; // Accumulate counters. 
- if (i < cpi->num_workers - 1) { + if (i > 0) { av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts); accumulate_rd_opt(&cpi->td, thread_data->td); cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count; +#if CONFIG_SPEED_STATS + cpi->td.mb.tx_search_count += thread_data->td->mb.tx_search_count; +#endif // CONFIG_SPEED_STATS } } } static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, int num_workers) { - for (int i = 0; i < num_workers; i++) { + for (int i = num_workers - 1; i >= 0; i--) { AVxWorker *const worker = &cpi->workers[i]; EncWorkerData *const thread_data = &cpi->tile_thr_data[i]; @@ -541,9 +550,7 @@ static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf; thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS thread_data->td->mb.inter_modes_info = thread_data->td->inter_modes_info; -#endif for (int x = 0; x < 2; x++) { for (int y = 0; y < 2; y++) { memcpy(thread_data->td->hash_value_buffer[x][y], @@ -560,7 +567,7 @@ static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook, memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts)); } - if (i < num_workers - 1) { + if (i > 0) { thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer; thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst; for (int j = 0; j < 2; ++j) { @@ -617,7 +624,7 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) { const int tile_rows = cm->tile_rows; MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; int num_workers = 0; - int total_num_sb_rows = 0; + int total_num_threads_row_mt = 0; int max_sb_rows = 0; if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { @@ -632,11 +639,19 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) { TileDataEnc *tile_data = &cpi->tile_data[row * cm->tile_cols + col]; int num_sb_rows_in_tile = av1_get_sb_rows_in_tile(cm, 
tile_data->tile_info); - total_num_sb_rows += num_sb_rows_in_tile; + int num_sb_cols_in_tile = + av1_get_sb_cols_in_tile(cm, tile_data->tile_info); + total_num_threads_row_mt += + AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile); max_sb_rows = AOMMAX(max_sb_rows, num_sb_rows_in_tile); } } - num_workers = AOMMIN(cpi->oxcf.max_threads, total_num_sb_rows); + // TODO(ravi.chaudhary@ittiam.com): Currently the percentage of + // post-processing stages in encoder is quiet low, so limiting the number of + // threads to the theoretical limit in row-mt does not have much impact on + // post-processing multi-threading stage. Need to revisit this when + // post-processing time starts shooting up. + num_workers = AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt); if (multi_thread_ctxt->allocated_tile_cols != tile_cols || multi_thread_ctxt->allocated_tile_rows != tile_rows || @@ -659,9 +674,7 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) { this_tile->row_mt_info.current_mi_row = this_tile->tile_info.mi_row_start; this_tile->row_mt_info.num_threads_working = 0; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS av1_inter_mode_data_init(this_tile); -#endif av1_zero_above_context(cm, &cpi->td.mb.e_mbd, this_tile->tile_info.mi_col_start, this_tile->tile_info.mi_col_end, tile_row); diff --git a/libaom/av1/encoder/firstpass.c b/libaom/av1/encoder/firstpass.c index 5117c67..f6a0fb2 100644 --- a/libaom/av1/encoder/firstpass.c +++ b/libaom/av1/encoder/firstpass.c @@ -36,6 +36,7 @@ #include "av1/encoder/encodemb.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/encoder.h" +#include "av1/encoder/encode_strategy.h" #include "av1/encoder/extend.h" #include "av1/encoder/firstpass.h" #include "av1/encoder/mcomp.h" @@ -43,63 +44,14 @@ #include "av1/encoder/reconinter_enc.h" #define OUTPUT_FPF 0 -#define ARF_STATS_OUTPUT 0 -#define GROUP_ADAPTIVE_MAXQ 1 - -#define BOOST_BREAKOUT 12.5 -#define BOOST_FACTOR 12.5 -#define FACTOR_PT_LOW 0.70 -#define FACTOR_PT_HIGH 0.90 #define 
FIRST_PASS_Q 10.0 -#define GF_MAX_BOOST 90.0 #define INTRA_MODE_PENALTY 1024 -#define KF_MIN_FRAME_BOOST 80.0 -#define KF_MAX_FRAME_BOOST 128.0 -#define MIN_ARF_GF_BOOST 240 -#define MIN_DECAY_FACTOR 0.01 -#define MIN_KF_BOOST 300 // Minimum boost for non-static KF interval -#define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval #define NEW_MV_MODE_PENALTY 32 #define DARK_THRESH 64 -#define DEFAULT_GRP_WEIGHT 1.0 -#define RC_FACTOR_MIN 0.75 -#define RC_FACTOR_MAX 1.75 -#define MIN_FWD_KF_INTERVAL 8 #define NCOUNT_INTRA_THRESH 8192 #define NCOUNT_INTRA_FACTOR 3 -#define NCOUNT_FRAME_II_THRESH 5.0 - -#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) - -#if ARF_STATS_OUTPUT -unsigned int arf_count = 0; -#endif - -// Resets the first pass file to the given position using a relative seek from -// the current position. -static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) { - p->stats_in = position; -} - -// Read frame stats at an offset from the current position. 
-static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) { - if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) || - (offset < 0 && p->stats_in + offset < p->stats_in_start)) { - return NULL; - } - - return &p->stats_in[offset]; -} - -static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) { - if (p->stats_in >= p->stats_in_end) return EOF; - - *fps = *p->stats_in; - ++p->stats_in; - return 1; -} static void output_stats(FIRSTPASS_STATS *stats, struct aom_codec_pkt_list *pktlist) { @@ -131,18 +83,7 @@ static void output_stats(FIRSTPASS_STATS *stats, #endif } -#if CONFIG_FP_MB_STATS -static void output_fpmb_stats(uint8_t *this_frame_mb_stats, int stats_size, - struct aom_codec_pkt_list *pktlist) { - struct aom_codec_cx_pkt pkt; - pkt.kind = AOM_CODEC_FPMB_STATS_PKT; - pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats; - pkt.data.firstpass_mb_stats.sz = stats_size * sizeof(*this_frame_mb_stats); - aom_codec_pkt_list_add(pktlist, &pkt); -} -#endif - -static void zero_stats(FIRSTPASS_STATS *section) { +void av1_twopass_zero_stats(FIRSTPASS_STATS *section) { section->frame = 0.0; section->weight = 0.0; section->intra_error = 0.0; @@ -195,98 +136,8 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->duration += frame->duration; } -static void subtract_stats(FIRSTPASS_STATS *section, - const FIRSTPASS_STATS *frame) { - section->frame -= frame->frame; - section->weight -= frame->weight; - section->intra_error -= frame->intra_error; - section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy; - section->coded_error -= frame->coded_error; - section->sr_coded_error -= frame->sr_coded_error; - section->pcnt_inter -= frame->pcnt_inter; - section->pcnt_motion -= frame->pcnt_motion; - section->pcnt_second_ref -= frame->pcnt_second_ref; - section->pcnt_neutral -= frame->pcnt_neutral; - section->intra_skip_pct -= frame->intra_skip_pct; - section->inactive_zone_rows -= frame->inactive_zone_rows; - 
section->inactive_zone_cols -= frame->inactive_zone_cols; - section->MVr -= frame->MVr; - section->mvr_abs -= frame->mvr_abs; - section->MVc -= frame->MVc; - section->mvc_abs -= frame->mvc_abs; - section->MVrv -= frame->MVrv; - section->MVcv -= frame->MVcv; - section->mv_in_out_count -= frame->mv_in_out_count; - section->new_mv_count -= frame->new_mv_count; - section->count -= frame->count; - section->duration -= frame->duration; -} - -// Calculate the linear size relative to a baseline of 1080P -#define BASE_SIZE 2073600.0 // 1920x1080 -static double get_linear_size_factor(const AV1_COMP *cpi) { - const double this_area = cpi->initial_width * cpi->initial_height; - return pow(this_area / BASE_SIZE, 0.5); -} - -// Calculate an active area of the image that discounts formatting -// bars and partially discounts other 0 energy areas. -#define MIN_ACTIVE_AREA 0.5 -#define MAX_ACTIVE_AREA 1.0 -static double calculate_active_area(const AV1_COMP *cpi, - const FIRSTPASS_STATS *this_frame) { - double active_pct; - - active_pct = - 1.0 - - ((this_frame->intra_skip_pct / 2) + - ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows)); - return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); -} - -// Calculate a modified Error used in distributing bits between easier and -// harder frames. -#define ACT_AREA_CORRECTION 0.5 -static double calculate_modified_err(const AV1_COMP *cpi, - const TWO_PASS *twopass, - const AV1EncoderConfig *oxcf, - const FIRSTPASS_STATS *this_frame) { - const FIRSTPASS_STATS *const stats = &twopass->total_stats; - const double av_weight = stats->weight / stats->count; - const double av_err = (stats->coded_error * av_weight) / stats->count; - double modified_error = - av_err * pow(this_frame->coded_error * this_frame->weight / - DOUBLE_DIVIDE_CHECK(av_err), - oxcf->two_pass_vbrbias / 100.0); - - // Correction for active area. 
Frames with a reduced active area - // (eg due to formatting bars) have a higher error per mb for the - // remaining active MBs. The correction here assumes that coding - // 0.5N blocks of complexity 2X is a little easier than coding N - // blocks of complexity X. - modified_error *= - pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION); - - return fclamp(modified_error, twopass->modified_error_min, - twopass->modified_error_max); -} - -// This function returns the maximum target rate per frame. -static int frame_max_bits(const RATE_CONTROL *rc, - const AV1EncoderConfig *oxcf) { - int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth * - (int64_t)oxcf->two_pass_vbrmax_section) / - 100; - if (max_bits < 0) - max_bits = 0; - else if (max_bits > rc->max_frame_bandwidth) - max_bits = rc->max_frame_bandwidth; - - return (int)max_bits; -} - void av1_init_first_pass(AV1_COMP *cpi) { - zero_stats(&cpi->twopass.total_stats); + av1_twopass_zero_stats(&cpi->twopass.total_stats); } void av1_end_first_pass(AV1_COMP *cpi) { @@ -380,13 +231,13 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, // Override the default variance function to use MSE. v_fn_ptr.vf = get_block_variance_fn(bsize); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd); } // Center the initial step/diamond search on best mv. 
- tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param, x->sadperbit16, &num00, + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg[SS_CFG_SRC], &ref_mv_full, + &tmp_mv, step_param, x->sadperbit16, &num00, &v_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); @@ -407,9 +258,9 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x, if (num00) { --num00; } else { - tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param + n, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + tmp_err = cpi->diamond_search_sad( + x, &cpi->ss_cfg[SS_CFG_SRC], &ref_mv_full, &tmp_mv, step_param + n, + x->sadperbit16, &num00, &v_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) @@ -439,26 +290,7 @@ static BLOCK_SIZE get_bsize(const AV1_COMMON *cm, int mb_row, int mb_col) { } static int find_fp_qindex(aom_bit_depth_t bit_depth) { - int i; - - for (i = 0; i < QINDEX_RANGE; ++i) - if (av1_convert_qindex_to_q(i, bit_depth) >= FIRST_PASS_Q) break; - - if (i == QINDEX_RANGE) i--; - - return i; -} - -static void set_first_pass_params(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - if (!cpi->refresh_alt_ref_frame && (cm->current_frame.frame_number == 0 || - (cpi->frame_flags & FRAMEFLAGS_KEY))) { - cm->current_frame.frame_type = KEY_FRAME; - } else { - cm->current_frame.frame_type = INTER_FRAME; - } - // Do not use periodic key frames. 
- cpi->rc.frames_to_key = INT_MAX; + return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1); } static double raw_motion_error_stdev(int *raw_motion_err_list, @@ -486,7 +318,7 @@ static double raw_motion_error_stdev(int *raw_motion_err_list, #define UL_INTRA_THRESH 50 #define INVALID_ROW -1 -void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { +void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) { int mb_row, mb_col; MACROBLOCK *const x = &cpi->td.mb; AV1_COMMON *const cm = &cpi->common; @@ -501,7 +333,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none; int i; - int recon_yoffset, recon_uvoffset; + int recon_yoffset, src_yoffset, recon_uvoffset; int64_t intra_error = 0; int64_t frame_avg_wavelet_energy = 0; int64_t coded_error = 0; @@ -521,15 +353,15 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { int sum_in_vectors = 0; MV lastmv = kZeroMv; TWO_PASS *twopass = &cpi->twopass; - int recon_y_stride, recon_uv_stride, uv_mb_height; + int recon_y_stride, src_y_stride, recon_uv_stride, uv_mb_height; - YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); - YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + const YV12_BUFFER_CONFIG *const lst_yv12 = + get_ref_frame_yv12_buf(cm, LAST_FRAME); + const YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); YV12_BUFFER_CONFIG *const new_yv12 = &cm->cur_frame->buf; const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; double intra_factor; double brightness_factor; - BufferPool *const pool = cm->buffer_pool; const int qindex = find_fp_qindex(seq_params->bit_depth); const int mb_scale = mi_size_wide[BLOCK_16X16]; @@ -542,12 +374,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { assert(new_yv12 != NULL); assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); -#if 
CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - av1_zero_array(cpi->twopass.frame_mb_stats_buf, cpi->initial_mbs); - } -#endif - + av1_setup_frame_size(cpi); aom_clear_system_state(); xd->mi = cm->mi_grid_visible; @@ -558,7 +385,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { brightness_factor = 0.0; neutral_count = 0.0; - set_first_pass_params(cpi); + // Do not use periodic key frames. + cpi->rc.frames_to_key = INT_MAX; + av1_set_quantizer(cm, qindex); av1_setup_block_planes(&x->e_mbd, seq_params->subsampling_x, @@ -589,12 +418,11 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { } av1_init_mv_probs(cm); - av1_init_lv_map(cm); av1_initialize_rd_consts(cpi); // Tiling is ignored in the first pass. av1_tile_init(&tile, cm, 0, 0); - + src_y_stride = cpi->source->y_stride; recon_y_stride = new_yv12->y_stride; recon_uv_stride = new_yv12->uv_stride; uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); @@ -605,6 +433,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { // Reset above block coeffs. xd->up_available = (mb_row != 0); recon_yoffset = (mb_row * recon_y_stride * 16); + src_yoffset = (mb_row * src_y_stride * 16); recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height); // Set up limit values for motion vectors to prevent them extending @@ -620,10 +449,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { double log_intra; int level_sample; -#if CONFIG_FP_MB_STATS - const int mb_index = mb_row * cm->mb_cols + mb_col; -#endif - aom_clear_system_state(); const int idx_str = xd->mi_stride * mb_row * mb_scale + mb_col * mb_scale; @@ -650,11 +475,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { av1_encode_intra_block_plane(cpi, x, bsize, 0, 0, mb_row * 2, mb_col * 2); this_error = aom_get_mb_ss(x->plane[0].src_diff); - // Keep a record of blocks that have almost no intra error residual - // (i.e. 
are in effect completely flat and untextured in the intra - // domain). In natural videos this is uncommon, but it is much more - // common in animations, graphics and screen content, so may be used - // as a signal to detect these types of content. if (this_error < UL_INTRA_THRESH) { ++intra_skip_count; } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) { @@ -702,21 +522,15 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { // Accumulate the intra error. intra_error += (int64_t)this_error; - int stride = x->plane[0].src.stride; + const int hbd = is_cur_buf_hbd(xd); + const int stride = x->plane[0].src.stride; uint8_t *buf = x->plane[0].src.buf; - for (int r8 = 0; r8 < 2; ++r8) + for (int r8 = 0; r8 < 2; ++r8) { for (int c8 = 0; c8 < 2; ++c8) { - int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input( buf + c8 * 8 + r8 * 8 * stride, stride, hbd); } - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // initialization - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; } -#endif // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. @@ -731,7 +545,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { struct buf_2d unscaled_last_source_buf_2d; xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); } else { @@ -743,10 +557,10 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { // frame as the reference. Skip the further motion search on // reconstructed frame if this error is small. 
unscaled_last_source_buf_2d.buf = - cpi->unscaled_last_source->y_buffer + recon_yoffset; + cpi->unscaled_last_source->y_buffer + src_yoffset; unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { raw_motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); } else { @@ -778,7 +592,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { int gf_motion_error; xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { gf_motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); } else { @@ -816,20 +630,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { best_ref_mv.row = 0; best_ref_mv.col = 0; -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // intra predication statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; - } - } -#endif - if (motion_error <= this_error) { aom_clear_system_state(); @@ -855,8 +655,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { xd->mi[0]->tx_size = TX_4X4; xd->mi[0]->ref_frame[0] = LAST_FRAME; xd->mi[0]->ref_frame[1] = NONE_FRAME; - av1_build_inter_predictors_sby(cm, xd, mb_row * mb_scale, - mb_col * mb_scale, NULL, bsize); + av1_enc_build_inter_predictor(cm, xd, mb_row * mb_scale, + mb_col * mb_scale, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); av1_encode_sby_pass1(cm, x, bsize); sum_mvr += mv.row; sum_mvr_abs += 
abs(mv.row); @@ -868,50 +669,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { best_ref_mv = mv; -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // inter predication statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_ERROR_SMALL_MASK; - } - } -#endif - if (!is_zero_mv(&mv)) { ++mvcount; -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - cpi->twopass.frame_mb_stats_buf[mb_index] &= - ~FPMB_MOTION_ZERO_MASK; - // check estimated motion direction - if (mv.col > 0 && mv.col >= abs(mv.row)) { - // right direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_RIGHT_MASK; - } else if (mv.row < 0 && abs(mv.row) >= abs(mv.col)) { - // up direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_UP_MASK; - } else if (mv.col < 0 && abs(mv.col) >= abs(mv.row)) { - // left direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_LEFT_MASK; - } else { - // down direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_DOWN_MASK; - } - } -#endif - // Non-zero vector, was it different from the last non zero vector? if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count; lastmv = mv; @@ -955,6 +715,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { x->plane[2].src.buf += uv_mb_height; recon_yoffset += 16; + src_yoffset += 16; recon_uvoffset += uv_mb_height; } // Adjust to the next row of MBs. 
@@ -1039,19 +800,12 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { // TODO(paulwilkins): Handle the case when duration is set to 0, or // something less than the full time between subsequent values of // cpi->source_time_stamp. - fps.duration = (double)(source->ts_end - source->ts_start); + fps.duration = (double)ts_duration; // Don't want to do output stats with a stack variable! twopass->this_frame_stats = fps; output_stats(&twopass->this_frame_stats, cpi->output_pkt_list); accumulate_stats(&twopass->total_stats, &fps); - -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - output_fpmb_stats(twopass->frame_mb_stats_buf, cpi->initial_mbs, - cpi->output_pkt_list); - } -#endif } // Copy the previous Last Frame back into gf and and arf buffers if @@ -1062,10 +816,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { ((twopass->this_frame_stats.intra_error / DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) { if (gld_yv12 != NULL) { - assign_frame_buffer( - pool->frame_bufs, - &cm->ref_frame_map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)], - cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST_FRAME)]); + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], + cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); } twopass->sr_update_lag = 1; } else { @@ -1075,19 +828,16 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { aom_extend_frame_borders(new_yv12, num_planes); // The frame we just compressed now becomes the last frame. - assign_frame_buffer( - pool->frame_bufs, - &cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST_FRAME)], - cm->new_fb_idx); + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)], cm->cur_frame); // Special case for the first frame. Copy into the GF buffer as a second // reference. 
if (current_frame->frame_number == 0 && - get_ref_frame_map_idx(cpi, GOLDEN_FRAME) != INVALID_IDX) { - assign_frame_buffer( - pool->frame_bufs, - &cm->ref_frame_map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)], - cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST_FRAME)]); + get_ref_frame_map_idx(cm, GOLDEN_FRAME) != INVALID_IDX) { + assign_frame_buffer_p( + &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)], + cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]); } // Use this to see what the first pass reconstruction looks like. @@ -1108,2333 +858,3 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) { ++current_frame->frame_number; } - -static double calc_correction_factor(double err_per_mb, double err_divisor, - double pt_low, double pt_high, int q, - aom_bit_depth_t bit_depth) { - const double error_term = err_per_mb / err_divisor; - - // Adjustment based on actual quantizer to power term. - const double power_term = - AOMMIN(av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high); - - // Calculate correction factor. - if (power_term < 1.0) assert(error_term >= 0.0); - - return fclamp(pow(error_term, power_term), 0.05, 5.0); -} - -#define ERR_DIVISOR 100.0 -static int get_twopass_worst_quality(const AV1_COMP *cpi, - const double section_err, - double inactive_zone, - int section_target_bandwidth, - double group_weight_factor) { - const RATE_CONTROL *const rc = &cpi->rc; - const AV1EncoderConfig *const oxcf = &cpi->oxcf; - - inactive_zone = fclamp(inactive_zone, 0.0, 1.0); - - if (section_target_bandwidth <= 0) { - return rc->worst_quality; // Highest value allowed - } else { - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) - ? 
cpi->initial_mbs - : cpi->common.MBs; - const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); - const double av_err_per_mb = section_err / active_mbs; - const double speed_term = 1.0; - double ediv_size_correction; - const int target_norm_bits_per_mb = - (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) / - active_mbs; - int q; - - // Larger image formats are expected to be a little harder to code - // relatively given the same prediction error score. This in part at - // least relates to the increased size and hence coding overheads of - // motion vectors. Some account of this is made through adjustment of - // the error divisor. - ediv_size_correction = - AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi))); - if (ediv_size_correction < 1.0) - ediv_size_correction = -(1.0 / ediv_size_correction); - ediv_size_correction *= 4.0; - - // Try and pick a max Q that will be high enough to encode the - // content at the given rate. - for (q = rc->best_quality; q < rc->worst_quality; ++q) { - const double factor = calc_correction_factor( - av_err_per_mb, ERR_DIVISOR - ediv_size_correction, FACTOR_PT_LOW, - FACTOR_PT_HIGH, q, cpi->common.seq_params.bit_depth); - const int bits_per_mb = av1_rc_bits_per_mb( - INTER_FRAME, q, factor * speed_term * group_weight_factor, - cpi->common.seq_params.bit_depth); - if (bits_per_mb <= target_norm_bits_per_mb) break; - } - - // Restriction on active max q for constrained quality mode. 
- if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level); - return q; - } -} - -static void setup_rf_level_maxq(AV1_COMP *cpi) { - int i; - RATE_CONTROL *const rc = &cpi->rc; - for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) { - int qdelta = av1_frame_type_qdelta(cpi, i, rc->worst_quality); - rc->rf_level_maxq[i] = AOMMAX(rc->worst_quality + qdelta, rc->best_quality); - } -} - -void av1_init_second_pass(AV1_COMP *cpi) { - const AV1EncoderConfig *const oxcf = &cpi->oxcf; - TWO_PASS *const twopass = &cpi->twopass; - double frame_rate; - FIRSTPASS_STATS *stats; - - zero_stats(&twopass->total_stats); - zero_stats(&twopass->total_left_stats); - - if (!twopass->stats_in_end) return; - - stats = &twopass->total_stats; - - *stats = *twopass->stats_in_end; - twopass->total_left_stats = *stats; - - frame_rate = 10000000.0 * stats->count / stats->duration; - // Each frame can have a different duration, as the frame rate in the source - // isn't guaranteed to be constant. The frame rate prior to the first frame - // encoded in the second pass is a guess. However, the sum duration is not. - // It is calculated based on the actual durations of all frames from the - // first pass. - av1_new_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); - - // This variable monitors how far behind the second ref update is lagging. - twopass->sr_update_lag = 1; - - // Scan the first pass file and calculate a modified total error based upon - // the bias/power function used to allocate bits. 
- { - const double avg_error = - stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); - const FIRSTPASS_STATS *s = twopass->stats_in; - double modified_error_total = 0.0; - twopass->modified_error_min = - (avg_error * oxcf->two_pass_vbrmin_section) / 100; - twopass->modified_error_max = - (avg_error * oxcf->two_pass_vbrmax_section) / 100; - while (s < twopass->stats_in_end) { - modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s); - ++s; - } - twopass->modified_error_left = modified_error_total; - } - - // Reset the vbr bits off target counters - cpi->rc.vbr_bits_off_target = 0; - cpi->rc.vbr_bits_off_target_fast = 0; - - cpi->rc.rate_error_estimate = 0; - - // Static sequence monitor variables. - twopass->kf_zeromotion_pct = 100; - twopass->last_kfgroup_zeromotion_pct = 100; - - if (oxcf->resize_mode != RESIZE_NONE) { - setup_rf_level_maxq(cpi); - } -} - -#define SR_DIFF_PART 0.0015 -#define MOTION_AMP_PART 0.003 -#define INTRA_PART 0.005 -#define DEFAULT_DECAY_LIMIT 0.75 -#define LOW_SR_DIFF_TRHESH 0.1 -#define SR_DIFF_MAX 128.0 - -static double get_sr_decay_rate(const AV1_COMP *cpi, - const FIRSTPASS_STATS *frame) { - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? 
cpi->initial_mbs - : cpi->common.MBs; - double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs; - double sr_decay = 1.0; - double modified_pct_inter; - double modified_pcnt_intra; - const double motion_amplitude_factor = - frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2); - - modified_pct_inter = frame->pcnt_inter; - if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < - (double)NCOUNT_FRAME_II_THRESH) { - modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; - } - modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); - - if ((sr_diff > LOW_SR_DIFF_TRHESH)) { - sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX); - sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - - (MOTION_AMP_PART * motion_amplitude_factor) - - (INTRA_PART * modified_pcnt_intra); - } - return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); -} - -// This function gives an estimate of how badly we believe the prediction -// quality is decaying from frame to frame. -static double get_zero_motion_factor(const AV1_COMP *cpi, - const FIRSTPASS_STATS *frame) { - const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion; - double sr_decay = get_sr_decay_rate(cpi, frame); - return AOMMIN(sr_decay, zero_motion_pct); -} - -#define ZM_POWER_FACTOR 0.75 - -static double get_prediction_decay_rate(const AV1_COMP *cpi, - const FIRSTPASS_STATS *next_frame) { - const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame); - const double zero_motion_factor = - (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion), - ZM_POWER_FACTOR)); - - return AOMMAX(zero_motion_factor, - (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); -} - -// Function to test for a condition where a complex transition is followed -// by a static section. For example in slide shows where there is a fade -// between slides. This is to help with more optimal kf and gf positioning. 
-static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval, - int still_interval, - double loop_decay_rate, - double last_decay_rate) { - TWO_PASS *const twopass = &cpi->twopass; - RATE_CONTROL *const rc = &cpi->rc; - - // Break clause to detect very still sections after motion - // For example a static image after a fade or other transition - // instead of a clean scene cut. - if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 && - last_decay_rate < 0.9) { - int j; - - // Look ahead a few frames to see if static condition persists... - for (j = 0; j < still_interval; ++j) { - const FIRSTPASS_STATS *stats = &twopass->stats_in[j]; - if (stats >= twopass->stats_in_end) break; - - if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; - } - - // Only if it does do we signal a transition to still. - return j == still_interval; - } - - return 0; -} - -// This function detects a flash through the high relative pcnt_second_ref -// score in the frame following a flash frame. The offset passed in should -// reflect this. -static int detect_flash(const TWO_PASS *twopass, int offset) { - const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset); - - // What we are looking for here is a situation where there is a - // brief break in prediction (such as a flash) but subsequent frames - // are reasonably well predicted by an earlier (pre flash) frame. - // The recovery after a flash is indicated by a high pcnt_second_ref - // compared to pcnt_inter. - return next_frame != NULL && - next_frame->pcnt_second_ref > next_frame->pcnt_inter && - next_frame->pcnt_second_ref >= 0.5; -} - -// Update the motion related elements to the GF arf boost calculation. 
-static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, - double *mv_in_out, - double *mv_in_out_accumulator, - double *abs_mv_in_out_accumulator, - double *mv_ratio_accumulator) { - const double pct = stats->pcnt_motion; - - // Accumulate Motion In/Out of frame stats. - *mv_in_out = stats->mv_in_out_count * pct; - *mv_in_out_accumulator += *mv_in_out; - *abs_mv_in_out_accumulator += fabs(*mv_in_out); - - // Accumulate a measure of how uniform (or conversely how random) the motion - // field is (a ratio of abs(mv) / mv). - if (pct > 0.05) { - const double mvr_ratio = - fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr)); - const double mvc_ratio = - fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc)); - - *mv_ratio_accumulator += - pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs); - *mv_ratio_accumulator += - pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs); - } -} - -#define BASELINE_ERR_PER_MB 1000.0 -static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame, - double this_frame_mv_in_out, double max_boost) { - double frame_boost; - const double lq = av1_convert_qindex_to_q( - cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth); - const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5); - int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs - : cpi->common.MBs; - - // Correct for any inactive region in the image - num_mbs = (int)AOMMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); - - // Underlying boost factor is based on inter error ratio. - frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error); - frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction; - - // Increase boost for frames where new data coming into frame (e.g. zoom out). - // Slightly reduce boost if there is a net balance of motion out of the frame - // (zoom in). 
The range for this_frame_mv_in_out is -1.0 to +1.0. - if (this_frame_mv_in_out > 0.0) - frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // In the extreme case the boost is halved. - else - frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); - - return AOMMIN(frame_boost, max_boost * boost_q_correction); -} - -static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames, - int *f_boost, int *b_boost) { - TWO_PASS *const twopass = &cpi->twopass; - int i; - double boost_score = 0.0; - double mv_ratio_accumulator = 0.0; - double decay_accumulator = 1.0; - double this_frame_mv_in_out = 0.0; - double mv_in_out_accumulator = 0.0; - double abs_mv_in_out_accumulator = 0.0; - int arf_boost; - int flash_detected = 0; - - // Search forward from the proposed arf/next gf position. - for (i = 0; i < f_frames; ++i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); - if (this_frame == NULL) break; - - // Update the motion related elements to the boost calculation. - accumulate_frame_motion_stats( - this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, - &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - - // We want to discount the flash frame itself and the recovery - // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); - - // Accumulate the effect of prediction quality decay. - if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(cpi, this_frame); - decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR - ? MIN_DECAY_FACTOR - : decay_accumulator; - } - - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); - } - - *f_boost = (int)boost_score; - - // Reset for backward looking loop. 
- boost_score = 0.0; - mv_ratio_accumulator = 0.0; - decay_accumulator = 1.0; - this_frame_mv_in_out = 0.0; - mv_in_out_accumulator = 0.0; - abs_mv_in_out_accumulator = 0.0; - - // Search backward towards last gf position. - for (i = -1; i >= -b_frames; --i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); - if (this_frame == NULL) break; - - // Update the motion related elements to the boost calculation. - accumulate_frame_motion_stats( - this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, - &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - - // We want to discount the the flash frame itself and the recovery - // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); - - // Cumulative effect of prediction quality decay. - if (!flash_detected) { - decay_accumulator *= get_prediction_decay_rate(cpi, this_frame); - decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR - ? MIN_DECAY_FACTOR - : decay_accumulator; - } - - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); - } - *b_boost = (int)boost_score; - - arf_boost = (*f_boost + *b_boost); - if (arf_boost < ((b_frames + f_frames) * 20)) - arf_boost = ((b_frames + f_frames) * 20); - arf_boost = AOMMAX(arf_boost, MIN_ARF_GF_BOOST); - - return arf_boost; -} - -// Calculate a section intra ratio used in setting max loop filter. -static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, - const FIRSTPASS_STATS *end, - int section_length) { - const FIRSTPASS_STATS *s = begin; - double intra_error = 0.0; - double coded_error = 0.0; - int i = 0; - - while (s < end && i < section_length) { - intra_error += s->intra_error; - coded_error += s->coded_error; - ++s; - ++i; - } - - return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error)); -} - -// Calculate the total bits to allocate in this GF/ARF group. 
-static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, - double gf_group_err) { - const RATE_CONTROL *const rc = &cpi->rc; - const TWO_PASS *const twopass = &cpi->twopass; - const int max_bits = frame_max_bits(rc, &cpi->oxcf); - int64_t total_group_bits; - - // Calculate the bits to be allocated to the group as a whole. - if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { - total_group_bits = (int64_t)(twopass->kf_group_bits * - (gf_group_err / twopass->kf_group_error_left)); - } else { - total_group_bits = 0; - } - - // Clamp odd edge cases. - total_group_bits = (total_group_bits < 0) - ? 0 - : (total_group_bits > twopass->kf_group_bits) - ? twopass->kf_group_bits - : total_group_bits; - - // Clip based on user supplied data rate variability limit. - if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) - total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; - - return total_group_bits; -} - -// Calculate the number bits extra to assign to boosted frames in a group. -static int calculate_boost_bits(int frame_count, int boost, - int64_t total_group_bits) { - int allocation_chunks; - - // return 0 for invalid inputs (could arise e.g. through rounding errors) - if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0; - - allocation_chunks = (frame_count * 100) + boost; - - // Prevent overflow. - if (boost > 1023) { - int divisor = boost >> 10; - boost /= divisor; - allocation_chunks /= divisor; - } - - // Calculate the number of extra bits for use in the boosted frame or frames. 
- return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), - 0); -} - -#if USE_SYMM_MULTI_LAYER -// #define CHCEK_GF_PARAMETER -#ifdef CHCEK_GF_PARAMETER -void check_frame_params(GF_GROUP *const gf_group, int gf_interval, - int frame_nums) { - static const char *update_type_strings[] = { - "KF_UPDATE", "LF_UPDATE", "GF_UPDATE", - "ARF_UPDATE", "OVERLAY_UPDATE", "BRF_UPDATE", - "LAST_BIPRED_UPDATE", "BIPRED_UPDATE", "INTNL_OVERLAY_UPDATE", - "INTNL_ARF_UPDATE" - }; - FILE *fid = fopen("GF_PARAMS.txt", "a"); - - fprintf(fid, "\n{%d}\n", gf_interval); - for (int i = 0; i <= frame_nums; ++i) { - fprintf(fid, "%s %d %d %d %d\n", - update_type_strings[gf_group->update_type[i]], - gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i], - gf_group->arf_update_idx[i], gf_group->pyramid_level[i]); - } - - fprintf(fid, "number of nodes in each level: \n"); - for (int i = 0; i < MAX_PYRAMID_LVL; ++i) { - fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]); - } - fprintf(fid, "\n"); - fclose(fid); -} -#endif // CHCEK_GF_PARAMETER -static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) { - // Derive rf_level from update_type - switch (update_type) { - case LF_UPDATE: return INTER_NORMAL; - case ARF_UPDATE: return GF_ARF_STD; - case OVERLAY_UPDATE: return INTER_NORMAL; - case BRF_UPDATE: return GF_ARF_LOW; - case LAST_BIPRED_UPDATE: return INTER_NORMAL; - case BIPRED_UPDATE: return INTER_NORMAL; - case INTNL_ARF_UPDATE: return GF_ARF_LOW; - case INTNL_OVERLAY_UPDATE: return INTER_NORMAL; - default: return INTER_NORMAL; - } -} - -static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r, - int *frame_ind, int arf_ind, int level) { - if (r - l < 4) { - while (++l < r) { - // leaf nodes, not a look-ahead frame - gf_group->update_type[*frame_ind] = LF_UPDATE; - gf_group->arf_src_offset[*frame_ind] = 0; - gf_group->arf_pos_in_gf[*frame_ind] = 0; - gf_group->arf_update_idx[*frame_ind] = arf_ind; - 
gf_group->pyramid_level[*frame_ind] = 0; - ++gf_group->pyramid_lvl_nodes[0]; - ++(*frame_ind); - } - } else { - int m = (l + r) / 2; - int arf_pos_in_gf = *frame_ind; - - gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE; - gf_group->arf_src_offset[*frame_ind] = m - l - 1; - gf_group->arf_pos_in_gf[*frame_ind] = 0; - gf_group->arf_update_idx[*frame_ind] = 1; // mark all internal ARF 1 - gf_group->pyramid_level[*frame_ind] = level; - ++gf_group->pyramid_lvl_nodes[level]; - ++(*frame_ind); - - // set parameters for frames displayed before this frame - set_multi_layer_params(gf_group, l, m, frame_ind, 1, level - 1); - - // for overlay frames, we need to record the position of its corresponding - // arf frames for bit allocation - gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; - gf_group->arf_src_offset[*frame_ind] = 0; - gf_group->arf_pos_in_gf[*frame_ind] = arf_pos_in_gf; - gf_group->arf_update_idx[*frame_ind] = 1; - gf_group->pyramid_level[*frame_ind] = 0; - ++(*frame_ind); - - // set parameters for frames displayed after this frame - set_multi_layer_params(gf_group, m, r, frame_ind, arf_ind, level - 1); - } -} - -static INLINE unsigned char get_pyramid_height(int pyramid_width) { - assert(pyramid_width <= 16 && pyramid_width >= 4 && - "invalid gf interval for pyramid structure"); - - return pyramid_width > 12 ? 4 : (pyramid_width > 6 ? 
3 : 2); -} - -static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group, - const int gf_interval) { - int frame_index = 0; - gf_group->pyramid_height = get_pyramid_height(gf_interval); - - assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL); - - av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL); - - // At the beginning of each GF group it will be a key or overlay frame, - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - gf_group->arf_src_offset[frame_index] = 0; - gf_group->arf_pos_in_gf[frame_index] = 0; - gf_group->arf_update_idx[frame_index] = 0; - gf_group->pyramid_level[frame_index] = 0; - ++frame_index; - - // ALT0 - gf_group->update_type[frame_index] = ARF_UPDATE; - gf_group->arf_src_offset[frame_index] = gf_interval - 1; - gf_group->arf_pos_in_gf[frame_index] = 0; - gf_group->arf_update_idx[frame_index] = 0; - gf_group->pyramid_level[frame_index] = gf_group->pyramid_height; - ++frame_index; - - // set parameters for the rest of the frames - set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0, - gf_group->pyramid_height - 1); - return frame_index; -} - -static void define_customized_gf_group_structure(AV1_COMP *cpi) { - RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME; - - assert(rc->baseline_gf_interval >= 4 && - rc->baseline_gf_interval <= MAX_PYRAMID_SIZE); - - const int gf_update_frames = - construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval); - int frame_index; - - cpi->num_extra_arfs = 0; - - for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) { - // Set unused variables to default values - gf_group->bidir_pred_enabled[frame_index] = 0; - gf_group->brf_src_offset[frame_index] = 0; - - // Special handle for the first frame for assigning update_type - if (frame_index == 0) { - // For key frames the frame target rate is already 
set and it - // is also the golden frame. - if (key_frame) { - gf_group->update_type[frame_index] = KF_UPDATE; - continue; - } - - if (rc->source_alt_ref_active) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - } - } else { - if (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) - ++cpi->num_extra_arfs; - } - - // Assign rf level based on update type - gf_group->rf_level[frame_index] = - update_type_2_rf_level(gf_group->update_type[frame_index]); - } - - // NOTE: We need to configure the frame at the end of the sequence + 1 that - // will be the start frame for the next group. Otherwise prior to the - // call to av1_rc_get_second_pass_params() the data will be undefined. - if (rc->source_alt_ref_pending) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - } - - gf_group->bidir_pred_enabled[frame_index] = 0; - gf_group->brf_src_offset[frame_index] = 0; - gf_group->arf_update_idx[frame_index] = 0; - // This value is only used for INTNL_OVERLAY_UPDATE - gf_group->arf_pos_in_gf[frame_index] = 0; - - // This parameter is useless? - gf_group->arf_ref_idx[frame_index] = 0; -#ifdef CHCEK_GF_PARAMETER - check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames); -#endif -} - -// It is an example of how to define a GF stucture manually. The function will -// result in exactly the same GF group structure as -// define_customized_gf_group_structure() when rc->baseline_gf_interval == 4 -#if USE_MANUAL_GF4_STRUCT -#define GF_INTERVAL_4 4 -static const unsigned char gf4_multi_layer_params[][GF_FRAME_PARAMS] = { - { - // gf_group->index == 0 (Frame 0) - // It can also be KEY frame. 
Will assign the proper value - // in define_gf_group_structure - OVERLAY_UPDATE, // update_type (default value) - 0, // arf_src_offset - 0, // arf_pos_in_gf - 0 // arf_update_idx - }, - { - // gf_group->index == 1 (Frame 4) - ARF_UPDATE, // update_type - GF_INTERVAL_4 - 1, // arf_src_offset - 0, // arf_pos_in_gf - 0 // arf_update_idx - }, - { - // gf_group->index == 2 (Frame 2) - INTNL_ARF_UPDATE, // update_type - (GF_INTERVAL_4 >> 1) - 1, // arf_src_offset - 0, // arf_pos_in_gf - 0 // arf_update_idx - }, - { - // gf_group->index == 3 (Frame 1) - LAST_BIPRED_UPDATE, // update_type - 0, // arf_src_offset - 0, // arf_pos_in_gf - 0 // arf_update_idx - }, - - { - // gf_group->index == 4 (Frame 2 - OVERLAY) - INTNL_OVERLAY_UPDATE, // update_type - 0, // arf_src_offset - 2, // arf_pos_in_gf - 0 // arf_update_idx - }, - { - // gf_group->index == 5 (Frame 3) - LF_UPDATE, // update_type - 0, // arf_src_offset - 0, // arf_pos_in_gf - 1 // arf_update_idx - } -}; - -static int define_gf_group_structure_4(AV1_COMP *cpi) { - RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME; - - assert(rc->baseline_gf_interval == GF_INTERVAL_4); - - const int gf_update_frames = rc->baseline_gf_interval + 2; - int frame_index; - - for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) { - int param_idx = 0; - - gf_group->bidir_pred_enabled[frame_index] = 0; - - if (frame_index == 0) { - // gf_group->arf_src_offset[frame_index] = 0; - gf_group->brf_src_offset[frame_index] = 0; - gf_group->bidir_pred_enabled[frame_index] = 0; - - // For key frames the frame target rate is already set and it - // is also the golden frame. 
- if (key_frame) continue; - - gf_group->update_type[frame_index] = - gf4_multi_layer_params[frame_index][param_idx++]; - - if (rc->source_alt_ref_active) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - } - param_idx++; - } else { - gf_group->update_type[frame_index] = - gf4_multi_layer_params[frame_index][param_idx++]; - } - - // setup other parameters - gf_group->rf_level[frame_index] = - update_type_2_rf_level(gf_group->update_type[frame_index]); - - // == arf_src_offset == - gf_group->arf_src_offset[frame_index] = - gf4_multi_layer_params[frame_index][param_idx++]; - - // == arf_pos_in_gf == - gf_group->arf_pos_in_gf[frame_index] = - gf4_multi_layer_params[frame_index][param_idx++]; - - // == arf_update_idx == - gf_group->brf_src_offset[frame_index] = - gf4_multi_layer_params[frame_index][param_idx]; - } - - // NOTE: We need to configure the frame at the end of the sequence + 1 that - // will be the start frame for the next group. Otherwise prior to the - // call to av1_rc_get_second_pass_params() the data will be undefined. 
- gf_group->arf_update_idx[frame_index] = 0; - gf_group->arf_ref_idx[frame_index] = 0; - - if (rc->source_alt_ref_pending) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - } - - gf_group->bidir_pred_enabled[frame_index] = 0; - gf_group->brf_src_offset[frame_index] = 0; - - // This value is only used for INTNL_OVERLAY_UPDATE - gf_group->arf_pos_in_gf[frame_index] = 0; - - return gf_update_frames; -} -#endif // USE_MANUAL_GF4_STRUCT -#endif // USE_SYMM_MULTI_LAYER - -static void define_gf_group_structure(AV1_COMP *cpi) { - RATE_CONTROL *const rc = &cpi->rc; - -#if USE_SYMM_MULTI_LAYER - const int valid_customized_gf_length = - rc->baseline_gf_interval >= 4 && - rc->baseline_gf_interval <= MAX_PYRAMID_SIZE; - // used the new structure only if extra_arf is allowed - if (valid_customized_gf_length && rc->source_alt_ref_pending && - cpi->extra_arf_allowed > 0) { -#if USE_MANUAL_GF4_STRUCT - if (rc->baseline_gf_interval == 4) - define_gf_group_structure_4(cpi); - else -#endif - define_customized_gf_group_structure(cpi); - cpi->new_bwdref_update_rule = 1; - return; - } else { - cpi->new_bwdref_update_rule = 0; - } -#endif - - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - int i; - int frame_index = 0; - const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME; - - // The use of bi-predictive frames are only enabled when following 3 - // conditions are met: - // (1) ALTREF is enabled; - // (2) The bi-predictive group interval is at least 2; and - // (3) The bi-predictive group interval is strictly smaller than the - // golden group interval. 
- const int is_bipred_enabled = - cpi->extra_arf_allowed && rc->source_alt_ref_pending && - rc->bipred_group_interval && - rc->bipred_group_interval <= - (rc->baseline_gf_interval - rc->source_alt_ref_pending); - int bipred_group_end = 0; - int bipred_frame_index = 0; - - const unsigned char ext_arf_interval = - (unsigned char)(rc->baseline_gf_interval / (cpi->num_extra_arfs + 1) - 1); - int which_arf = cpi->num_extra_arfs; - int subgroup_interval[MAX_EXT_ARFS + 1]; - int is_sg_bipred_enabled = is_bipred_enabled; - int accumulative_subgroup_interval = 0; - - // For key frames the frame target rate is already set and it - // is also the golden frame. - // === [frame_index == 0] === - if (!key_frame) { - if (rc->source_alt_ref_active) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - } - gf_group->arf_update_idx[frame_index] = 0; - gf_group->arf_ref_idx[frame_index] = 0; - } - - gf_group->bidir_pred_enabled[frame_index] = 0; - gf_group->brf_src_offset[frame_index] = 0; - - frame_index++; - - bipred_frame_index++; - - // === [frame_index == 1] === - if (rc->source_alt_ref_pending) { - gf_group->update_type[frame_index] = ARF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - gf_group->arf_src_offset[frame_index] = - (unsigned char)(rc->baseline_gf_interval - 1); - - gf_group->arf_update_idx[frame_index] = 0; - gf_group->arf_ref_idx[frame_index] = 0; - - gf_group->bidir_pred_enabled[frame_index] = 0; - gf_group->brf_src_offset[frame_index] = 0; - // NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames. - - // Work out the ARFs' positions in this gf group - // NOTE(weitinglin): ALT_REFs' are indexed inversely, but coded in display - // order (except for the original ARF). 
In the example of three ALT_REF's, - // We index ALTREF's as: KEY ----- ALT2 ----- ALT1 ----- ALT0 - // but code them in the following order: - // KEY-ALT0-ALT2 ----- OVERLAY2-ALT1 ----- OVERLAY1 ----- OVERLAY0 - // - // arf_pos_for_ovrly[]: Position for OVERLAY - // arf_pos_in_gf[]: Position for ALTREF - cpi->arf_pos_for_ovrly[0] = frame_index + cpi->num_extra_arfs + - gf_group->arf_src_offset[frame_index] + 1; - for (i = 0; i < cpi->num_extra_arfs; ++i) { - cpi->arf_pos_for_ovrly[i + 1] = - frame_index + (cpi->num_extra_arfs - i) * (ext_arf_interval + 2); - subgroup_interval[i] = cpi->arf_pos_for_ovrly[i] - - cpi->arf_pos_for_ovrly[i + 1] - (i == 0 ? 1 : 2); - } - subgroup_interval[cpi->num_extra_arfs] = - cpi->arf_pos_for_ovrly[cpi->num_extra_arfs] - frame_index - - (cpi->num_extra_arfs == 0 ? 1 : 2); - - ++frame_index; - - // Insert an extra ARF - // === [frame_index == 2] === - if (cpi->num_extra_arfs) { - gf_group->update_type[frame_index] = INTNL_ARF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_LOW; - gf_group->arf_src_offset[frame_index] = ext_arf_interval; - - gf_group->arf_update_idx[frame_index] = which_arf; - gf_group->arf_ref_idx[frame_index] = 0; - ++frame_index; - } - accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs]; - } - - const int normal_frames = - rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); - - for (i = 0; i < normal_frames; ++i) { - gf_group->arf_update_idx[frame_index] = which_arf; - gf_group->arf_ref_idx[frame_index] = which_arf; - - // If we are going to have ARFs, check whether we can have BWDREF in this - // subgroup, and further, whether we can have ARF subgroup which contains - // the BWDREF subgroup but contained within the GF group: - // - // GF group --> ARF subgroup --> BWDREF subgroup - if (rc->source_alt_ref_pending) { - is_sg_bipred_enabled = - is_bipred_enabled && - (subgroup_interval[which_arf] > rc->bipred_group_interval); - } - - // NOTE: BIDIR_PRED is only enabled when 
the length of the bi-predictive - // frame group interval is strictly smaller than that of the GOLDEN - // FRAME group interval. - // TODO(zoeliu): Currently BIDIR_PRED is only enabled when alt-ref is on. - if (is_sg_bipred_enabled && !bipred_group_end) { - const int cur_brf_src_offset = rc->bipred_group_interval - 1; - - if (bipred_frame_index == 1) { - // --- BRF_UPDATE --- - gf_group->update_type[frame_index] = BRF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_LOW; - gf_group->brf_src_offset[frame_index] = cur_brf_src_offset; - } else if (bipred_frame_index == rc->bipred_group_interval) { - // --- LAST_BIPRED_UPDATE --- - gf_group->update_type[frame_index] = LAST_BIPRED_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - gf_group->brf_src_offset[frame_index] = 0; - - // Reset the bi-predictive frame index. - bipred_frame_index = 0; - } else { - // --- BIPRED_UPDATE --- - gf_group->update_type[frame_index] = BIPRED_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - gf_group->brf_src_offset[frame_index] = 0; - } - gf_group->bidir_pred_enabled[frame_index] = 1; - - bipred_frame_index++; - // Check whether the next bi-predictive frame group would entirely be - // included within the current golden frame group. - // In addition, we need to avoid coding a BRF right before an ARF. - if (bipred_frame_index == 1 && - (i + 2 + cur_brf_src_offset) >= accumulative_subgroup_interval) { - bipred_group_end = 1; - } - } else { - gf_group->update_type[frame_index] = LF_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - gf_group->bidir_pred_enabled[frame_index] = 0; - gf_group->brf_src_offset[frame_index] = 0; - } - - ++frame_index; - - // Check if we need to update the ARF. - if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 && - frame_index > cpi->arf_pos_for_ovrly[which_arf]) { - --which_arf; - accumulative_subgroup_interval += subgroup_interval[which_arf] + 1; - - // Meet the new subgroup; Reset the bipred_group_end flag. 
- bipred_group_end = 0; - // Insert another extra ARF after the overlay frame - if (which_arf) { - gf_group->update_type[frame_index] = INTNL_ARF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_LOW; - gf_group->arf_src_offset[frame_index] = ext_arf_interval; - - gf_group->arf_update_idx[frame_index] = which_arf; - gf_group->arf_ref_idx[frame_index] = 0; - ++frame_index; - } - } - } - - // NOTE: We need to configure the frame at the end of the sequence + 1 that - // will be the start frame for the next group. Otherwise prior to the - // call to av1_rc_get_second_pass_params() the data will be undefined. - gf_group->arf_update_idx[frame_index] = 0; - gf_group->arf_ref_idx[frame_index] = 0; - - if (rc->source_alt_ref_pending) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - - cpi->arf_pos_in_gf[0] = 1; - if (cpi->num_extra_arfs) { - // Overwrite the update_type for extra-ARF's corresponding internal - // OVERLAY's: Change from LF_UPDATE to INTNL_OVERLAY_UPDATE. - for (i = cpi->num_extra_arfs; i > 0; --i) { - cpi->arf_pos_in_gf[i] = - (i == cpi->num_extra_arfs ? 
2 : cpi->arf_pos_for_ovrly[i + 1] + 1); - - gf_group->update_type[cpi->arf_pos_for_ovrly[i]] = INTNL_OVERLAY_UPDATE; - gf_group->rf_level[cpi->arf_pos_for_ovrly[i]] = INTER_NORMAL; - } - } - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - } - - gf_group->bidir_pred_enabled[frame_index] = 0; - gf_group->brf_src_offset[frame_index] = 0; -} - -#if USE_SYMM_MULTI_LAYER -#define NEW_MULTI_LVL_BOOST_VBR_ALLOC 1 - -#if NEW_MULTI_LVL_BOOST_VBR_ALLOC -#define LEAF_REDUCTION_FACTOR 0.75 -static double lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = { - { 1.0, 0.0, 0.0 }, { 0.6, 0.4, 0 }, { 0.45, 0.35, 0.20 } -}; -#endif // NEW_MULTI_LVL_BOOST_VBR_ALLOC -#endif // USE_SYMM_MULTI_LAYER -static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits, - double group_error, int gf_arf_bits) { - RATE_CONTROL *const rc = &cpi->rc; - const AV1EncoderConfig *const oxcf = &cpi->oxcf; - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - int i; - int frame_index = 0; - int key_frame; - const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); - int64_t total_group_bits = gf_group_bits; - int ext_arf_boost[MAX_EXT_ARFS]; - - define_gf_group_structure(cpi); - - av1_zero_array(ext_arf_boost, MAX_EXT_ARFS); - - key_frame = cpi->common.current_frame.frame_type == KEY_FRAME; - - // For key frames the frame target rate is already set and it - // is also the golden frame. - // === [frame_index == 0] === - if (!key_frame) { - if (rc->source_alt_ref_active) - gf_group->bit_allocation[frame_index] = 0; - else - gf_group->bit_allocation[frame_index] = gf_arf_bits; - - // Step over the golden frame / overlay frame - FIRSTPASS_STATS frame_stats; - if (EOF == input_stats(twopass, &frame_stats)) return; - } - - // Deduct the boost bits for arf (or gf if it is not a key frame) - // from the group total. 
- if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits; - - frame_index++; - - // Store the bits to spend on the ARF if there is one. - // === [frame_index == 1] === - if (rc->source_alt_ref_pending) { - gf_group->bit_allocation[frame_index] = gf_arf_bits; - - ++frame_index; - - // Skip all the extra-ARF's right after ARF at the starting segment of - // the current GF group. - if (cpi->num_extra_arfs) { - while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) - ++frame_index; - } - } - -#if USE_SYMM_MULTI_LAYER -#if NEW_MULTI_LVL_BOOST_VBR_ALLOC - // Save. - const int tmp_frame_index = frame_index; - int budget_reduced_from_leaf_level = 0; -#endif // NEW_MULTI_LVL_BOOST_VBR_ALLOC -#endif // USE_SYMM_MULTI_LAYER - - // Allocate bits to the other frames in the group. - const int normal_frames = - rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); - - for (i = 0; i < normal_frames; ++i) { - FIRSTPASS_STATS frame_stats; - if (EOF == input_stats(twopass, &frame_stats)) break; - - const double modified_err = - calculate_modified_err(cpi, twopass, oxcf, &frame_stats); - const double err_fraction = - (group_error > 0) ? modified_err / DOUBLE_DIVIDE_CHECK(group_error) - : 0.0; - const int target_frame_size = - clamp((int)((double)total_group_bits * err_fraction), 0, - AOMMIN(max_bits, (int)total_group_bits)); - - if (gf_group->update_type[frame_index] == BRF_UPDATE) { - // Boost up the allocated bits on BWDREF_FRAME - gf_group->bit_allocation[frame_index] = - target_frame_size + (target_frame_size >> 2); - } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) { - // Press down the allocated bits on LAST_BIPRED_UPDATE frames - gf_group->bit_allocation[frame_index] = - target_frame_size - (target_frame_size >> 1); - } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) { - // TODO(zoeliu): To investigate whether the allocated bits on - // BIPRED_UPDATE frames need to be further adjusted. 
- gf_group->bit_allocation[frame_index] = target_frame_size; -#if USE_SYMM_MULTI_LAYER - } else if (cpi->new_bwdref_update_rule && - gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) { - assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL && - "non-valid height for a pyramid structure"); - - const int arf_pos = gf_group->arf_pos_in_gf[frame_index]; - gf_group->bit_allocation[frame_index] = 0; - - gf_group->bit_allocation[arf_pos] = target_frame_size; - // Note: Boost, if needed, is added in the next loop. -#endif // USE_SYMM_MULTI_LAYER - } else { - assert(gf_group->update_type[frame_index] == LF_UPDATE || - gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE); - gf_group->bit_allocation[frame_index] = target_frame_size; -#if MULTI_LVL_BOOST_VBR_CQ - if (cpi->new_bwdref_update_rule) { -#if NEW_MULTI_LVL_BOOST_VBR_ALLOC - const int this_budget_reduction = - (int)(target_frame_size * LEAF_REDUCTION_FACTOR); - gf_group->bit_allocation[frame_index] -= this_budget_reduction; - budget_reduced_from_leaf_level += this_budget_reduction; -#else - gf_group->bit_allocation[frame_index] -= (target_frame_size >> 1); -#endif // NEW_MULTI_LVL_BOOST_VBR_ALLOC - } -#endif // MULTI_LVL_BOOST_VBR_CQ - } - - ++frame_index; - - // Skip all the extra-ARF's. - if (cpi->num_extra_arfs) { - while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) - ++frame_index; - } - } - -#if USE_SYMM_MULTI_LAYER -#if MULTI_LVL_BOOST_VBR_CQ - if (budget_reduced_from_leaf_level > 0) { - // Restore. - frame_index = tmp_frame_index; - - // Re-distribute this extra budget to overlay frames in the group. 
- for (i = 0; i < normal_frames; ++i) { - if (cpi->new_bwdref_update_rule && - gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) { - assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL && - "non-valid height for a pyramid structure"); - const int arf_pos = gf_group->arf_pos_in_gf[frame_index]; - const int this_lvl = gf_group->pyramid_level[arf_pos]; - const int dist2top = gf_group->pyramid_height - 1 - this_lvl; -#if NEW_MULTI_LVL_BOOST_VBR_ALLOC - const double lvl_boost_factor = - lvl_budget_factor[gf_group->pyramid_height - 2][dist2top]; - const int extra_size = - (int)(budget_reduced_from_leaf_level * lvl_boost_factor / - gf_group->pyramid_lvl_nodes[this_lvl]); -#else - const int target_frame_size = gf_group->bit_allocation[arf_pos]; - const int extra_size = target_frame_size >> dist2top; -#endif // NEW_MULTI_LVL_BOOST_VBR_ALLOC - gf_group->bit_allocation[arf_pos] += extra_size; - } - ++frame_index; - - // Skip all the extra-ARF's. - if (cpi->num_extra_arfs) { - while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) - ++frame_index; - } - } - } -#endif // MULTI_LVL_BOOST_VBR_CQ -#endif // USE_SYMM_MULTI_LAYER - -#if USE_SYMM_MULTI_LAYER - if (cpi->new_bwdref_update_rule == 0 && rc->source_alt_ref_pending) { -#else - if (rc->source_alt_ref_pending) { -#endif - if (cpi->num_extra_arfs) { - // NOTE: For bit allocation, move the allocated bits associated with - // INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE. - // i > 0 for extra-ARF's and i == 0 for ARF: - // arf_pos_for_ovrly[i]: Position for INTNL_OVERLAY_UPDATE - // arf_pos_in_gf[i]: Position for INTNL_ARF_UPDATE - for (i = cpi->num_extra_arfs; i > 0; --i) { - assert(gf_group->update_type[cpi->arf_pos_for_ovrly[i]] == - INTNL_OVERLAY_UPDATE); - - // Encoder's choice: - // Set show_existing_frame == 1 for all extra-ARF's, and hence - // allocate zero bit for both all internal OVERLAY frames. 
- gf_group->bit_allocation[cpi->arf_pos_in_gf[i]] = - gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]]; - gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]] = 0; - } - } - } -} - -// Returns true if KF group and GF group both are almost completely static. -static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) { - return (gf_zero_motion >= 0.995) && - (kf_zero_motion >= STATIC_KF_GROUP_THRESH); -} - -// Analyse and define a gf/arf group. -static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { - AV1_COMMON *const cm = &cpi->common; - RATE_CONTROL *const rc = &cpi->rc; - AV1EncoderConfig *const oxcf = &cpi->oxcf; - TWO_PASS *const twopass = &cpi->twopass; - FIRSTPASS_STATS next_frame; - const FIRSTPASS_STATS *const start_pos = twopass->stats_in; - int i; - - double boost_score = 0.0; -#if !CONFIG_FIX_GF_LENGTH - double old_boost_score = 0.0; - double mv_ratio_accumulator_thresh; - int active_max_gf_interval; - int active_min_gf_interval; -#endif - double gf_group_err = 0.0; -#if GROUP_ADAPTIVE_MAXQ - double gf_group_raw_error = 0.0; -#endif - double gf_group_skip_pct = 0.0; - double gf_group_inactive_zone_rows = 0.0; - double gf_first_frame_err = 0.0; - double mod_frame_err = 0.0; - - double mv_ratio_accumulator = 0.0; - double decay_accumulator = 1.0; - double zero_motion_accumulator = 1.0; - - double loop_decay_rate = 1.00; - double last_loop_decay_rate = 1.00; - - double this_frame_mv_in_out = 0.0; - double mv_in_out_accumulator = 0.0; - double abs_mv_in_out_accumulator = 0.0; - - unsigned int allow_alt_ref = is_altref_enabled(cpi); - - int f_boost = 0; - int b_boost = 0; - int flash_detected; - int64_t gf_group_bits; - double gf_group_error_left; - int gf_arf_bits; - const int is_key_frame = frame_is_intra_only(cm); - const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; - - cpi->extra_arf_allowed = 1; - - // Reset the GF group data structures unless this is a key - // frame in which case it will 
already have been done. - if (is_key_frame == 0) { - av1_zero(twopass->gf_group); - } - - aom_clear_system_state(); - av1_zero(next_frame); - - // Load stats for the current frame. - mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); - - // Note the error of the frame at the start of the group. This will be - // the GF frame error if we code a normal gf. - gf_first_frame_err = mod_frame_err; - - // If this is a key frame or the overlay from a previous arf then - // the error score / cost of this frame has already been accounted for. - if (arf_active_or_kf) { - gf_group_err -= gf_first_frame_err; -#if GROUP_ADAPTIVE_MAXQ - gf_group_raw_error -= this_frame->coded_error; -#endif - gf_group_skip_pct -= this_frame->intra_skip_pct; - gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows; - } -#if !CONFIG_FIX_GF_LENGTH - // Motion breakout threshold for loop below depends on image size. - mv_ratio_accumulator_thresh = - (cpi->initial_height + cpi->initial_width) / 4.0; - // Set a maximum and minimum interval for the GF group. - // If the image appears almost completely static we can extend beyond this. - { - int int_max_q = (int)(av1_convert_qindex_to_q( - twopass->active_worst_quality, cpi->common.seq_params.bit_depth)); - int int_lbq = (int)(av1_convert_qindex_to_q( - rc->last_boosted_qindex, cpi->common.seq_params.bit_depth)); - - active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200); - if (active_min_gf_interval > rc->max_gf_interval) - active_min_gf_interval = rc->max_gf_interval; - - // The value chosen depends on the active Q range. At low Q we have - // bits to spare and are better with a smaller interval and smaller boost. - // At high Q when there are few bits to spare we are better with a longer - // interval to spread the cost of the GF. 
- active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6)); - - // We have: active_min_gf_interval <= rc->max_gf_interval - if (active_max_gf_interval < active_min_gf_interval) - active_max_gf_interval = active_min_gf_interval; - else if (active_max_gf_interval > rc->max_gf_interval) - active_max_gf_interval = rc->max_gf_interval; - } -#endif // !CONFIG_FIX_GF_LENGTH - double avg_sr_coded_error = 0; - double avg_raw_err_stdev = 0; - int non_zero_stdev_count = 0; - - i = 0; - while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) { - ++i; - - // Accumulate error score of frames in this gf group. - mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); - gf_group_err += mod_frame_err; -#if GROUP_ADAPTIVE_MAXQ - gf_group_raw_error += this_frame->coded_error; -#endif - gf_group_skip_pct += this_frame->intra_skip_pct; - gf_group_inactive_zone_rows += this_frame->inactive_zone_rows; - - if (EOF == input_stats(twopass, &next_frame)) break; - - // Test for the case where there is a brief flash but the prediction - // quality back to an earlier frame is then restored. - flash_detected = detect_flash(twopass, 0); - - // Update the motion related elements to the boost calculation. - accumulate_frame_motion_stats( - &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, - &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - // sum up the metric values of current gf group - avg_sr_coded_error += next_frame.sr_coded_error; - if (fabs(next_frame.raw_error_stdev) > 0.000001) { - non_zero_stdev_count++; - avg_raw_err_stdev += next_frame.raw_error_stdev; - } - - // Accumulate the effect of prediction quality decay. - if (!flash_detected) { - last_loop_decay_rate = loop_decay_rate; - loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); - - decay_accumulator = decay_accumulator * loop_decay_rate; - - // Monitor for static sections. 
- if ((rc->frames_since_key + i - 1) > 1) { - zero_motion_accumulator = AOMMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); - } - - // Break clause to detect very still sections after motion. For example, - // a static image after a fade or other transition. - if (detect_transition_to_still(cpi, i, 5, loop_decay_rate, - last_loop_decay_rate)) { - allow_alt_ref = 0; - break; - } - } - - // Calculate a boost number for this frame. - boost_score += - decay_accumulator * - calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST); -#if CONFIG_FIX_GF_LENGTH - // If almost totally static, we will not use the FIXED_GF_LENGTH later, so - // we can continue for more frames. - if (i >= (FIXED_GF_LENGTH + 1) && - !is_almost_static(zero_motion_accumulator, - twopass->kf_zeromotion_pct)) { - break; - } -#else - // Break out conditions. - // Break at maximum of active_max_gf_interval unless almost totally static. - // - // Note that the addition of a test of rc->source_alt_ref_active is - // deliberate. The effect of this is that after a normal altref group even - // if the material is static there will be one normal length GF group - // before allowing longer GF groups. The reason for this is that in cases - // such as slide shows where slides are separated by a complex transition - // such as a fade, the arf group spanning the transition may not be coded - // at a very high quality and hence this frame (with its overlay) is a - // poor golden frame to use for an extended group. - if ((i >= (active_max_gf_interval + arf_active_or_kf) && - ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) || - ( - // Don't break out with a very short interval. 
- (i >= active_min_gf_interval + arf_active_or_kf) && - (!flash_detected) && - ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || - (abs_mv_in_out_accumulator > 3.0) || - (mv_in_out_accumulator < -2.0) || - ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) { - // If GF group interval is < 12, we force it to be 8. Otherwise, - // if it is >= 12, we keep it as is. - // NOTE: 'i' is 1 more than the GF group interval candidate that is being - // checked. - if (i == (8 + 1) || i >= (12 + 1)) { - boost_score = old_boost_score; - break; - } - } - old_boost_score = boost_score; -#endif // CONFIG_FIX_GF_LENGTH - *this_frame = next_frame; - } - twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0); - - // Was the group length constrained by the requirement for a new KF? - rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; - - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs - : cpi->common.MBs; - assert(num_mbs > 0); - if (i) avg_sr_coded_error /= i; - - if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count; - - // Disable extra altrefs and backward refs for "still" gf group: - // zero_motion_accumulator: minimum percentage of (0,0) motion; - // avg_sr_coded_error: average of the SSE per pixel of each frame; - // avg_raw_err_stdev: average of the standard deviation of (0,0) - // motion error per block of each frame. 
- const int disable_bwd_extarf = - (zero_motion_accumulator > MIN_ZERO_MOTION && - avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && - avg_raw_err_stdev < MAX_RAW_ERR_VAR); - - if (disable_bwd_extarf) cpi->extra_arf_allowed = 0; - - const int use_alt_ref = - !is_almost_static(zero_motion_accumulator, twopass->kf_zeromotion_pct) && - allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && - (i >= rc->min_gf_interval); - -#define REDUCE_GF_LENGTH_THRESH 4 -#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9 -#define REDUCE_GF_LENGTH_BY 1 - int alt_offset = 0; -#if REDUCE_LAST_GF_LENGTH - // The length reduction strategy is tweaked using AOM_Q mode, and doesn't work - // for VBR mode. - // Also, we don't have do adjustment for lossless mode. - const int allow_gf_length_reduction = - (cpi->oxcf.rc_mode == AOM_Q || cpi->extra_arf_allowed == 0) && - !is_lossless_requested(&cpi->oxcf); - - if (allow_gf_length_reduction && use_alt_ref) { - // adjust length of this gf group if one of the following condition met - // 1: only one overlay frame left and this gf is too long - // 2: next gf group is too short to have arf compared to the current gf - - // maximum length of next gf group - const int next_gf_len = rc->frames_to_key - i; - const int single_overlay_left = - next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH; - // the next gf is probably going to have a ARF but it will be shorter than - // this gf - const int unbalanced_gf = - i > REDUCE_GF_LENGTH_TO_KEY_THRESH && - next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH && - next_gf_len + 1 >= rc->min_gf_interval; - - if (single_overlay_left || unbalanced_gf) { - // Note: Tried roll_back = DIVIDE_AND_ROUND(i, 8), but is does not work - // better in the current setting - const int roll_back = REDUCE_GF_LENGTH_BY; - alt_offset = -roll_back; - i -= roll_back; - } - } -#endif // REDUCE_LAST_GF_LENGTH - - // Should we use the alternate reference frame. - if (use_alt_ref) { - // Calculate the boost for alt ref. 
- rc->gfu_boost = - calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost); - rc->source_alt_ref_pending = 1; - - // do not replace ARFs with overlay frames, and keep it as GOLDEN_REF - cpi->preserve_arf_as_gld = 1; - } else { - rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST); - rc->source_alt_ref_pending = 0; - cpi->preserve_arf_as_gld = 0; - } - - // Set the interval until the next gf. - // If forward keyframes are enabled, ensure the final gf group obeys the - // MIN_FWD_KF_INTERVAL. - if (cpi->oxcf.fwd_kf_enabled && - ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) { - if (i == rc->frames_to_key) { - rc->baseline_gf_interval = i; - // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL - } else if ((rc->frames_to_key - i < - AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) && - (rc->frames_to_key != i)) { - // if possible, merge the last two gf groups - if (rc->frames_to_key <= MAX_PYRAMID_SIZE) { - rc->baseline_gf_interval = rc->frames_to_key; - // if merging the last two gf groups creates a group that is too long, - // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL - } else { - rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL; - } - } else { - rc->baseline_gf_interval = i - rc->source_alt_ref_pending; - } - } else { - rc->baseline_gf_interval = i - rc->source_alt_ref_pending; - } - -#if REDUCE_LAST_ALT_BOOST -#define LAST_ALR_BOOST_FACTOR 0.2f - rc->arf_boost_factor = 1.0; - if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) { - // Reduce the boost of altref in the last gf group - if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY || - rc->frames_to_key - i == 0) { - rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR; - } - } -#endif - - if (!cpi->extra_arf_allowed) { - cpi->num_extra_arfs = 0; - } else { -#if USE_SYMM_MULTI_LAYER - if (rc->baseline_gf_interval == 4 && rc->source_alt_ref_pending) - cpi->num_extra_arfs = 1; - else - 
cpi->num_extra_arfs = get_number_of_extra_arfs( - rc->baseline_gf_interval, rc->source_alt_ref_pending); -#else - // Compute how many extra alt_refs we can have - cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval, - rc->source_alt_ref_pending); -#endif // USE_SYMM_MULTI_LAYER - } - -#if !USE_SYMM_MULTI_LAYER - // Currently at maximum two extra ARFs' are allowed - assert(cpi->num_extra_arfs <= MAX_EXT_ARFS); -#endif - - rc->frames_till_gf_update_due = rc->baseline_gf_interval; - - rc->bipred_group_interval = BFG_INTERVAL; - // The minimum bi-predictive frame group interval is 2. - if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0; - - // Reset the file position. - reset_fpf_position(twopass, start_pos); - - // Calculate the bits to be allocated to the gf/arf group as a whole - gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); - -#if GROUP_ADAPTIVE_MAXQ - // Calculate an estimate of the maxq needed for the group. - // We are more agressive about correcting for sections - // where there could be significant overshoot than for easier - // sections where we do not wish to risk creating an overshoot - // of the allocated bit budget. - if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) { - const int vbr_group_bits_per_frame = - (int)(gf_group_bits / rc->baseline_gf_interval); - const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval; - const double group_av_skip_pct = - gf_group_skip_pct / rc->baseline_gf_interval; - const double group_av_inactive_zone = - ((gf_group_inactive_zone_rows * 2) / - (rc->baseline_gf_interval * (double)cm->mb_rows)); - - int tmp_q; - // rc factor is a weight factor that corrects for local rate control drift. 
- double rc_factor = 1.0; - if (rc->rate_error_estimate > 0) { - rc_factor = AOMMAX(RC_FACTOR_MIN, - (double)(100 - rc->rate_error_estimate) / 100.0); - } else { - rc_factor = AOMMIN(RC_FACTOR_MAX, - (double)(100 - rc->rate_error_estimate) / 100.0); - } - tmp_q = get_twopass_worst_quality( - cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), - vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor); - twopass->active_worst_quality = - AOMMAX(tmp_q, twopass->active_worst_quality >> 1); - } -#endif - - // Calculate the extra bits to be used for boosted frame(s) - gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost, - gf_group_bits); - - // Adjust KF group bits and error remaining. - twopass->kf_group_error_left -= (int64_t)gf_group_err; - - // If this is an arf update we want to remove the score for the overlay - // frame at the end which will usually be very cheap to code. - // The overlay frame has already, in effect, been coded so we want to spread - // the remaining bits among the other frames. - // For normal GFs remove the score for the GF itself unless this is - // also a key frame in which case it has already been accounted for. - if (rc->source_alt_ref_pending) { - gf_group_error_left = gf_group_err - mod_frame_err; - } else if (is_key_frame == 0) { - gf_group_error_left = gf_group_err - gf_first_frame_err; - } else { - gf_group_error_left = gf_group_err; - } - - // Allocate bits to each of the frames in the GF group. - allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits); - - // Reset the file position. - reset_fpf_position(twopass, start_pos); - - // Calculate a section intra ratio used in setting max loop filter. - if (cpi->common.current_frame.frame_type != KEY_FRAME) { - twopass->section_intra_rating = calculate_section_intra_ratio( - start_pos, twopass->stats_in_end, rc->baseline_gf_interval); - } -} - -// Threshold for use of the lagging second reference frame. 
High second ref -// usage may point to a transient event like a flash or occlusion rather than -// a real scene cut. -#define SECOND_REF_USEAGE_THRESH 0.1 -// Minimum % intra coding observed in first pass (1.0 = 100%) -#define MIN_INTRA_LEVEL 0.25 -// Minimum ratio between the % of intra coding and inter coding in the first -// pass after discounting neutral blocks (discounting neutral blocks in this -// way helps catch scene cuts in clips with very flat areas or letter box -// format clips with image padding. -#define INTRA_VS_INTER_THRESH 2.0 -// Hard threshold where the first pass chooses intra for almost all blocks. -// In such a case even if the frame is not a scene cut coding a key frame -// may be a good option. -#define VERY_LOW_INTER_THRESH 0.05 -// Maximum threshold for the relative ratio of intra error score vs best -// inter error score. -#define KF_II_ERR_THRESHOLD 2.5 -// In real scene cuts there is almost always a sharp change in the intra -// or inter error score. -#define ERR_CHANGE_THRESHOLD 0.4 -// For real scene cuts we expect an improvment in the intra inter error -// ratio in the next frame. -#define II_IMPROVEMENT_THRESHOLD 3.5 -#define KF_II_MAX 128.0 - -static int test_candidate_kf(TWO_PASS *twopass, - const FIRSTPASS_STATS *last_frame, - const FIRSTPASS_STATS *this_frame, - const FIRSTPASS_STATS *next_frame) { - int is_viable_kf = 0; - double pcnt_intra = 1.0 - this_frame->pcnt_inter; - double modified_pcnt_inter = - this_frame->pcnt_inter - this_frame->pcnt_neutral; - - // Does the frame satisfy the primary criteria of a key frame? - // See above for an explanation of the test criteria. - // If so, then examine how well it predicts subsequent frames. 
- if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && - (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && - ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || - ((pcnt_intra > MIN_INTRA_LEVEL) && - (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && - ((this_frame->intra_error / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < - KF_II_ERR_THRESHOLD) && - ((fabs(last_frame->coded_error - this_frame->coded_error) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > - ERR_CHANGE_THRESHOLD) || - (fabs(last_frame->intra_error - this_frame->intra_error) / - DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > - ERR_CHANGE_THRESHOLD) || - ((next_frame->intra_error / - DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > - II_IMPROVEMENT_THRESHOLD))))) { - int i; - const FIRSTPASS_STATS *start_pos = twopass->stats_in; - FIRSTPASS_STATS local_next_frame = *next_frame; - double boost_score = 0.0; - double old_boost_score = 0.0; - double decay_accumulator = 1.0; - - // Examine how well the key frame predicts subsequent frames. - for (i = 0; i < 16; ++i) { - double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error / - DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); - - if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX; - - // Cumulative effect of decay in prediction quality. - if (local_next_frame.pcnt_inter > 0.85) - decay_accumulator *= local_next_frame.pcnt_inter; - else - decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0; - - // Keep a running total. - boost_score += (decay_accumulator * next_iiratio); - - // Test various breakout clauses. 
- if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) || - (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) < - 0.20) && - (next_iiratio < 3.0)) || - ((boost_score - old_boost_score) < 3.0) || - (local_next_frame.intra_error < 200)) { - break; - } - - old_boost_score = boost_score; - - // Get the next frame details - if (EOF == input_stats(twopass, &local_next_frame)) break; - } - - // If there is tolerable prediction for at least the next 3 frames then - // break out else discard this potential key frame and move on - if (boost_score > 30.0 && (i > 3)) { - is_viable_kf = 1; - } else { - // Reset the file position - reset_fpf_position(twopass, start_pos); - - is_viable_kf = 0; - } - } - - return is_viable_kf; -} - -#define FRAMES_TO_CHECK_DECAY 8 - -static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { - int i, j; - RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - const AV1EncoderConfig *const oxcf = &cpi->oxcf; - const FIRSTPASS_STATS first_frame = *this_frame; - const FIRSTPASS_STATS *const start_position = twopass->stats_in; - FIRSTPASS_STATS next_frame; - FIRSTPASS_STATS last_frame; - int kf_bits = 0; - int loop_decay_counter = 0; - double decay_accumulator = 1.0; - double av_decay_accumulator = 0.0; - double zero_motion_accumulator = 1.0; - double boost_score = 0.0; - double kf_mod_err = 0.0; - double kf_group_err = 0.0; - double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; - - av1_zero(next_frame); - - cpi->common.current_frame.frame_type = KEY_FRAME; - rc->frames_since_key = 0; - - // Reset the GF group data structures. - av1_zero(*gf_group); - - // Is this a forced key frame by interval. - rc->this_key_frame_forced = rc->next_key_frame_forced; - - // Clear the alt ref active flag and last group multi arf flags as they - // can never be set for a key frame. 
- rc->source_alt_ref_active = 0; - - // KF is always a GF so clear frames till next gf counter. - rc->frames_till_gf_update_due = 0; - - rc->frames_to_key = 1; - - twopass->kf_group_bits = 0; // Total bits available to kf group - twopass->kf_group_error_left = 0; // Group modified error score. - - kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); - - // Initialize the decay rates for the recent frames to check - for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; - - // Find the next keyframe. - i = 0; - while (twopass->stats_in < twopass->stats_in_end && - rc->frames_to_key < cpi->oxcf.key_freq) { - // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); - - // Load the next frame's stats. - last_frame = *this_frame; - input_stats(twopass, this_frame); - - // Provided that we are not at the end of the file... - if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) { - double loop_decay_rate; - - // Check for a scene cut. - if (test_candidate_kf(twopass, &last_frame, this_frame, - twopass->stats_in)) - break; - - // How fast is the prediction quality decaying? - loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in); - - // We want to know something about the recent past... rather than - // as used elsewhere where we are concerned with decay in prediction - // quality since the last GF or KF. - recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate; - decay_accumulator = 1.0; - for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) - decay_accumulator *= recent_loop_decay[j]; - - // Special check for transition or high motion followed by a - // static scene. - if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i, - loop_decay_rate, decay_accumulator)) - break; - - // Step on to the next frame. - ++rc->frames_to_key; - - // If we don't have a real key frame within the next two - // key_freq intervals then break out of the loop. 
- if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break; - } else { - ++rc->frames_to_key; - } - ++i; - } - - // If there is a max kf interval set by the user we must obey it. - // We already breakout of the loop above at 2x max. - // This code centers the extra kf if the actual natural interval - // is between 1x and 2x. - if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) { - FIRSTPASS_STATS tmp_frame = first_frame; - - rc->frames_to_key /= 2; - - // Reset to the start of the group. - reset_fpf_position(twopass, start_position); - - kf_group_err = 0.0; - - // Rescan to get the correct error data for the forced kf group. - for (i = 0; i < rc->frames_to_key; ++i) { - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame); - input_stats(twopass, &tmp_frame); - } - rc->next_key_frame_forced = 1; - } else if (twopass->stats_in == twopass->stats_in_end || - rc->frames_to_key >= cpi->oxcf.key_freq) { - rc->next_key_frame_forced = 1; - } else { - rc->next_key_frame_forced = 0; - } - - // Special case for the last key frame of the file. - if (twopass->stats_in >= twopass->stats_in_end) { - // Accumulate kf group error. - kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); - } - - // Calculate the number of bits that should be assigned to the kf group. - if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { - // Maximum number of bits for a single normal frame (not key frame). - const int max_bits = frame_max_bits(rc, &cpi->oxcf); - - // Maximum number of bits allocated to the key frame group. - int64_t max_grp_bits; - - // Default allocation based on bits left and relative - // complexity of the section. - twopass->kf_group_bits = (int64_t)( - twopass->bits_left * (kf_group_err / twopass->modified_error_left)); - - // Clip based on maximum per frame rate defined by the user. 
- max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; - if (twopass->kf_group_bits > max_grp_bits) - twopass->kf_group_bits = max_grp_bits; - } else { - twopass->kf_group_bits = 0; - } - twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits); - - // Reset the first pass file position. - reset_fpf_position(twopass, start_position); - - // Scan through the kf group collating various stats used to determine - // how many bits to spend on it. - decay_accumulator = 1.0; - boost_score = 0.0; - const double kf_max_boost = - cpi->oxcf.rc_mode == AOM_Q - ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST), - KF_MAX_FRAME_BOOST) - : KF_MAX_FRAME_BOOST; - for (i = 0; i < (rc->frames_to_key - 1); ++i) { - if (EOF == input_stats(twopass, &next_frame)) break; - - // Monitor for static sections. - // For the first frame in kf group, the second ref indicator is invalid. - if (i > 0) { - zero_motion_accumulator = AOMMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); - } else { - zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion; - } - - // Not all frames in the group are necessarily used in calculating boost. - if ((i <= rc->max_gf_interval) || - ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) { - const double frame_boost = - calc_frame_boost(cpi, this_frame, 0, kf_max_boost); - - // How fast is prediction quality decaying. 
- if (!detect_flash(twopass, 0)) { - const double loop_decay_rate = - get_prediction_decay_rate(cpi, &next_frame); - decay_accumulator *= loop_decay_rate; - decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR); - av_decay_accumulator += decay_accumulator; - ++loop_decay_counter; - } - boost_score += (decay_accumulator * frame_boost); - } - } - if (loop_decay_counter > 0) - av_decay_accumulator /= (double)loop_decay_counter; - - reset_fpf_position(twopass, start_position); - - // Store the zero motion percentage - twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); - - // Calculate a section intra ratio used in setting max loop filter. - twopass->section_intra_rating = calculate_section_intra_ratio( - start_position, twopass->stats_in_end, rc->frames_to_key); - - rc->kf_boost = (int)(av_decay_accumulator * boost_score); - - // Special case for static / slide show content but don't apply - // if the kf group is very short. - if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) && - (rc->frames_to_key > 8)) { - rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST); - } else { - // Apply various clamps for min and max boost - rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3)); - rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST); - } - - // Work out how many bits to allocate for the key frame itself. - kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, - twopass->kf_group_bits); - // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost, - // kf_bits, twopass->kf_zeromotion_pct); - - // Work out the fraction of the kf group bits reserved for the inter frames - // within the group after discounting the bits for the kf itself. 
- if (twopass->kf_group_bits) { - twopass->kfgroup_inter_fraction = - (double)(twopass->kf_group_bits - kf_bits) / - (double)twopass->kf_group_bits; - } else { - twopass->kfgroup_inter_fraction = 1.0; - } - - twopass->kf_group_bits -= kf_bits; - - // Save the bits to spend on the key frame. - gf_group->bit_allocation[0] = kf_bits; - gf_group->update_type[0] = KF_UPDATE; - gf_group->rf_level[0] = KF_STD; - - // Note the total error score of the kf group minus the key frame itself. - twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); - - // Adjust the count of total modified error left. - // The count of bits left is adjusted elsewhere based on real coded frame - // sizes. - twopass->modified_error_left -= kf_group_err; -} - -void av1_configure_buffer_updates_firstpass(AV1_COMP *cpi, - FRAME_UPDATE_TYPE update_type) { - RATE_CONTROL *rc = &cpi->rc; - - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - - rc->is_bwd_ref_frame = 0; - - switch (update_type) { - case ARF_UPDATE: - cpi->refresh_alt_ref_frame = 1; - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - - rc->is_src_frame_alt_ref = 0; - break; - case INTNL_ARF_UPDATE: - cpi->refresh_alt2_ref_frame = 1; - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - rc->is_src_frame_alt_ref = 0; - rc->is_src_frame_ext_arf = 0; - - break; - case BIPRED_UPDATE: - cpi->refresh_bwd_ref_frame = 1; - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - - rc->is_bwd_ref_frame = 1; - break; - default: break; - } -} - -static int is_skippable_frame(const AV1_COMP *cpi) { - // If the current frame does not have non-zero motion vector detected in the - // first pass, 
and so do its previous and forward frames, then this frame - // can be skipped for partition check, and the partition size is assigned - // according to the variance - const TWO_PASS *const twopass = &cpi->twopass; - - return (!frame_is_intra_only(&cpi->common) && - twopass->stats_in - 2 > twopass->stats_in_start && - twopass->stats_in < twopass->stats_in_end && - (twopass->stats_in - 1)->pcnt_inter - - (twopass->stats_in - 1)->pcnt_motion == - 1 && - (twopass->stats_in - 2)->pcnt_inter - - (twopass->stats_in - 2)->pcnt_motion == - 1 && - twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); -} - -void av1_rc_get_second_pass_params(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - CurrentFrame *const current_frame = &cm->current_frame; - RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - int frames_left; - FIRSTPASS_STATS this_frame; - - int target_rate; - - frames_left = (int)(twopass->total_stats.count - current_frame->frame_number); - - if (!twopass->stats_in) return; - - // If this is an arf frame then we dont want to read the stats file or - // advance the input pointer as we already have what we need. - if (gf_group->update_type[gf_group->index] == ARF_UPDATE || - gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) { - av1_configure_buffer_updates(cpi); - target_rate = gf_group->bit_allocation[gf_group->index]; - target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate); - rc->base_frame_target = target_rate; - - if (cpi->no_show_kf) { - assert(gf_group->update_type[gf_group->index] == ARF_UPDATE); - current_frame->frame_type = KEY_FRAME; - } else { - current_frame->frame_type = INTER_FRAME; - } - - // Do the firstpass stats indicate that this frame is skippable for the - // partition search? 
- if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) { - cpi->partition_search_skippable_frame = is_skippable_frame(cpi); - } - - return; - } - - aom_clear_system_state(); - - if (cpi->oxcf.rc_mode == AOM_Q) { - twopass->active_worst_quality = cpi->oxcf.cq_level; - } else if (current_frame->frame_number == 0) { - // Special case code for first frame. - const int section_target_bandwidth = - (int)(twopass->bits_left / frames_left); - const double section_length = twopass->total_left_stats.count; - const double section_error = - twopass->total_left_stats.coded_error / section_length; - const double section_intra_skip = - twopass->total_left_stats.intra_skip_pct / section_length; - const double section_inactive_zone = - (twopass->total_left_stats.inactive_zone_rows * 2) / - ((double)cm->mb_rows * section_length); - const int tmp_q = get_twopass_worst_quality( - cpi, section_error, section_intra_skip + section_inactive_zone, - section_target_bandwidth, DEFAULT_GRP_WEIGHT); - - twopass->active_worst_quality = tmp_q; - twopass->baseline_active_worst_quality = tmp_q; - rc->ni_av_qi = tmp_q; - rc->last_q[INTER_FRAME] = tmp_q; - rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth); - rc->avg_frame_qindex[INTER_FRAME] = tmp_q; - rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2; - rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME]; - } - - av1_zero(this_frame); - if (EOF == input_stats(twopass, &this_frame)) return; - - // Set the frame content type flag. - if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH) - twopass->fr_content_type = FC_GRAPHICS_ANIMATION; - else - twopass->fr_content_type = FC_NORMAL; - - // Keyframe and section processing. - if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) { - FIRSTPASS_STATS this_frame_copy; - this_frame_copy = this_frame; - // Define next KF group and assign bits to it. 
- find_next_key_frame(cpi, &this_frame); - this_frame = this_frame_copy; - } else { - current_frame->frame_type = INTER_FRAME; - } - - // Define a new GF/ARF group. (Should always enter here for key frames). - if (rc->frames_till_gf_update_due == 0) { - define_gf_group(cpi, &this_frame); - - rc->frames_till_gf_update_due = rc->baseline_gf_interval; - -#if ARF_STATS_OUTPUT - { - FILE *fpfile; - fpfile = fopen("arf.stt", "a"); - ++arf_count; - fprintf(fpfile, "%10d %10d %10d %10d %10d\n", current_frame->frame_number, - rc->frames_till_gf_update_due, rc->kf_boost, arf_count, - rc->gfu_boost); - - fclose(fpfile); - } -#endif - } - - av1_configure_buffer_updates(cpi); - - // Do the firstpass stats indicate that this frame is skippable for the - // partition search? - if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) { - cpi->partition_search_skippable_frame = is_skippable_frame(cpi); - } - - target_rate = gf_group->bit_allocation[gf_group->index]; - - if (cpi->common.current_frame.frame_type == KEY_FRAME) - target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate); - else - target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate); - - rc->base_frame_target = target_rate; - - { - const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) - ? cpi->initial_mbs - : cpi->common.MBs; - // The multiplication by 256 reverses a scaling factor of (>> 8) - // applied when combining MB error values for the frame. - twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0); - twopass->frame_avg_haar_energy = - log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0); - } - - // Update the total stats remaining structure. 
- subtract_stats(&twopass->total_left_stats, &this_frame); -} - -#define MINQ_ADJ_LIMIT 48 -#define MINQ_ADJ_LIMIT_CQ 20 -#define HIGH_UNDERSHOOT_RATIO 2 -void av1_twopass_postencode_update(AV1_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; - RATE_CONTROL *const rc = &cpi->rc; - const int bits_used = rc->base_frame_target; - - // VBR correction is done through rc->vbr_bits_off_target. Based on the - // sign of this value, a limited % adjustment is made to the target rate - // of subsequent frames, to try and push it back towards 0. This method - // is designed to prevent extreme behaviour at the end of a clip - // or group of frames. - rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; - twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0); - - // Calculate the pct rc error. - if (rc->total_actual_bits) { - rc->rate_error_estimate = - (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits); - rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100); - } else { - rc->rate_error_estimate = 0; - } - - if (cpi->common.current_frame.frame_type != KEY_FRAME) { - twopass->kf_group_bits -= bits_used; - twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct; - } - twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0); - - // If the rate control is drifting consider adjustment to min or maxq. - if ((cpi->oxcf.rc_mode != AOM_Q) && - (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) && - !cpi->rc.is_src_frame_alt_ref) { - const int maxq_adj_limit = - rc->worst_quality - twopass->active_worst_quality; - const int minq_adj_limit = - (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT); - - // Undershoot. - if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) { - --twopass->extend_maxq; - if (rc->rolling_target_bits >= rc->rolling_actual_bits) - ++twopass->extend_minq; - // Overshoot. 
- } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) { - --twopass->extend_minq; - if (rc->rolling_target_bits < rc->rolling_actual_bits) - ++twopass->extend_maxq; - } else { - // Adjustment for extreme local overshoot. - if (rc->projected_frame_size > (2 * rc->base_frame_target) && - rc->projected_frame_size > (2 * rc->avg_frame_bandwidth)) - ++twopass->extend_maxq; - - // Unwind undershoot or overshoot adjustment. - if (rc->rolling_target_bits < rc->rolling_actual_bits) - --twopass->extend_minq; - else if (rc->rolling_target_bits > rc->rolling_actual_bits) - --twopass->extend_maxq; - } - - twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit); - twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit); - - // If there is a big and undexpected undershoot then feed the extra - // bits back in quickly. One situation where this may happen is if a - // frame is unexpectedly almost perfectly predicted by the ARF or GF - // but not very well predcited by the previous frame. - if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) { - int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO; - if (rc->projected_frame_size < fast_extra_thresh) { - rc->vbr_bits_off_target_fast += - fast_extra_thresh - rc->projected_frame_size; - rc->vbr_bits_off_target_fast = - AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth)); - - // Fast adaptation of minQ if necessary to use up the extra bits. 
- if (rc->avg_frame_bandwidth) { - twopass->extend_minq_fast = - (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth); - } - twopass->extend_minq_fast = AOMMIN( - twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); - } else if (rc->vbr_bits_off_target_fast) { - twopass->extend_minq_fast = AOMMIN( - twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); - } else { - twopass->extend_minq_fast = 0; - } - } - } -} diff --git a/libaom/av1/encoder/firstpass.h b/libaom/av1/encoder/firstpass.h index 7c40615..1b8636c 100644 --- a/libaom/av1/encoder/firstpass.h +++ b/libaom/av1/encoder/firstpass.h @@ -21,35 +21,7 @@ extern "C" { #endif -#if CONFIG_FP_MB_STATS - -#define FPMB_DCINTRA_MASK 0x01 - -#define FPMB_MOTION_ZERO_MASK 0x02 -#define FPMB_MOTION_LEFT_MASK 0x04 -#define FPMB_MOTION_RIGHT_MASK 0x08 -#define FPMB_MOTION_UP_MASK 0x10 -#define FPMB_MOTION_DOWN_MASK 0x20 - -#define FPMB_ERROR_SMALL_MASK 0x40 -#define FPMB_ERROR_LARGE_MASK 0x80 -#define FPMB_ERROR_SMALL_TH 2000 -#define FPMB_ERROR_LARGE_TH 48000 - -typedef struct { - uint8_t *mb_stats_start; - uint8_t *mb_stats_end; -} FIRSTPASS_MB_STATS; -#endif - -// Length of the bi-predictive frame group (BFG) -// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain -// number of bi-predictive frames. -#define BFG_INTERVAL 2 -// The maximum number of extra ALTREF's except ALTREF_FRAME -#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME - 1) - -#define MIN_EXT_ARF_INTERVAL 4 +#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001) #define MIN_ZERO_MOTION 0.95 #define MAX_SR_CODED_ERROR 40 @@ -59,73 +31,99 @@ typedef struct { #define VLOW_MOTION_THRESHOLD 950 typedef struct { + // Frame number in display order, if stats are for a single frame. + // No real meaning for a collection of frames. double frame; + // Weight assigned to this frame (or total weight for the collection of + // frames) currently based on intra factor and brightness factor. 
This is used + // to distribute bits betweeen easier and harder frames. double weight; + // Intra prediction error. double intra_error; + // Average wavelet energy computed using Discrete Wavelet Transform (DWT). double frame_avg_wavelet_energy; + // Best of intra pred error and inter pred error using last frame as ref. double coded_error; + // Best of intra pred error and inter pred error using golden frame as ref. double sr_coded_error; + // Percentage of blocks with inter pred error < intra pred error. double pcnt_inter; + // Percentage of blocks using (inter prediction and) non-zero motion vectors. double pcnt_motion; + // Percentage of blocks where golden frame was the best reference. That is: + // inter pred error using golden frame < inter pred error using last frame and + // inter pred error using golden frame < intra pred error double pcnt_second_ref; + // Percentage of blocks where intra and inter prediction errors were very + // close. Note that this is a 'weighted count', that is, the so blocks may be + // weighted by how close the two errors were. double pcnt_neutral; + // Percentage of blocks that have almost no intra error residual + // (i.e. are in effect completely flat and untextured in the intra + // domain). In natural videos this is uncommon, but it is much more + // common in animations, graphics and screen content, so may be used + // as a signal to detect these types of content. double intra_skip_pct; - double inactive_zone_rows; // Image mask rows top and bottom. - double inactive_zone_cols; // Image mask columns at left and right edges. + // Image mask rows top and bottom. + double inactive_zone_rows; + // Image mask columns at left and right edges. + double inactive_zone_cols; + // Average of row motion vectors. double MVr; + // Mean of absolute value of row motion vectors. double mvr_abs; + // Mean of column motion vectors. double MVc; + // Mean of absolute value of column motion vectors. 
double mvc_abs; + // Variance of row motion vectors. double MVrv; + // Variance of column motion vectors. double MVcv; + // Value in range [-1,1] indicating fraction of row and column motion vectors + // that point inwards (negative MV value) or outwards (positive MV value). + // For example, value of 1 indicates, all row/column MVs are inwards. double mv_in_out_count; + // Count of unique non-zero motion vectors. double new_mv_count; + // Duration of the frame / collection of frames. double duration; + // 1.0 if stats are for a single frame, OR + // Number of frames in this collection for which the stats are accumulated. double count; // standard deviation for (0, 0) motion prediction error double raw_error_stdev; } FIRSTPASS_STATS; -typedef enum { - KF_UPDATE = 0, - LF_UPDATE = 1, - GF_UPDATE = 2, - ARF_UPDATE = 3, - OVERLAY_UPDATE = 4, - BRF_UPDATE = 5, // Backward Reference Frame - LAST_BIPRED_UPDATE = 6, // Last Bi-predictive Frame - BIPRED_UPDATE = 7, // Bi-predictive Frame, but not the last one - INTNL_OVERLAY_UPDATE = 8, // Internal Overlay Frame - INTNL_ARF_UPDATE = 9, // Internal Altref Frame (candidate for ALTREF2) - FRAME_UPDATE_TYPES = 10 -} FRAME_UPDATE_TYPE; +enum { + KF_UPDATE, + LF_UPDATE, + GF_UPDATE, + ARF_UPDATE, + OVERLAY_UPDATE, + INTNL_OVERLAY_UPDATE, // Internal Overlay Frame + INTNL_ARF_UPDATE, // Internal Altref Frame + FRAME_UPDATE_TYPES +} UENUM1BYTE(FRAME_UPDATE_TYPE); #define FC_ANIMATION_THRESH 0.15 -typedef enum { +enum { FC_NORMAL = 0, FC_GRAPHICS_ANIMATION = 1, FRAME_CONTENT_TYPES = 2 -} FRAME_CONTENT_TYPE; +} UENUM1BYTE(FRAME_CONTENT_TYPE); typedef struct { unsigned char index; - RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 1]; FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1]; unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1]; unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; - unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; -#if USE_SYMM_MULTI_LAYER unsigned char 
arf_pos_in_gf[MAX_STATIC_GF_GROUP_LENGTH + 1]; unsigned char pyramid_level[MAX_STATIC_GF_GROUP_LENGTH + 1]; unsigned char pyramid_height; unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL]; -#endif // USE_SYMM_MULTI_LAYER - unsigned char brf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1]; - unsigned char bidir_pred_enabled[MAX_STATIC_GF_GROUP_LENGTH + 1]; - unsigned char ref_fb_idx_map[MAX_STATIC_GF_GROUP_LENGTH + 1][REF_FRAMES]; - unsigned char refresh_idx[MAX_STATIC_GF_GROUP_LENGTH + 1]; - unsigned char refresh_flag[MAX_STATIC_GF_GROUP_LENGTH + 1]; int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1]; + int size; } GF_GROUP; typedef struct { @@ -144,11 +142,6 @@ typedef struct { double mb_av_energy; double frame_avg_haar_energy; -#if CONFIG_FP_MB_STATS - uint8_t *frame_mb_stats_buf; - uint8_t *this_frame_mb_stats; - FIRSTPASS_MB_STATS firstpass_mb_stats; -#endif // An indication of the content type of the current frame FRAME_CONTENT_TYPE fr_content_type; @@ -165,7 +158,6 @@ typedef struct { int kf_zeromotion_pct; int last_kfgroup_zeromotion_pct; - int gf_zeromotion_pct; int active_worst_quality; int baseline_active_worst_quality; int extend_minq; @@ -176,30 +168,15 @@ typedef struct { } TWO_PASS; struct AV1_COMP; +struct EncodeFrameParams; +struct AV1EncoderConfig; void av1_init_first_pass(struct AV1_COMP *cpi); void av1_rc_get_first_pass_params(struct AV1_COMP *cpi); -void av1_first_pass(struct AV1_COMP *cpi, const struct lookahead_entry *source); +void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration); void av1_end_first_pass(struct AV1_COMP *cpi); -void av1_init_second_pass(struct AV1_COMP *cpi); -void av1_rc_get_second_pass_params(struct AV1_COMP *cpi); -void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi, - FRAME_UPDATE_TYPE update_type); - -// Post encode update of the rate control parameters for 2-pass -void av1_twopass_postencode_update(struct AV1_COMP *cpi); - -static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) { 
- if (arf_pending && MAX_EXT_ARFS > 0) - return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1) - ? MAX_EXT_ARFS - : interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS - ? MAX_EXT_ARFS - 1 - : 0; - else - return 0; -} +void av1_twopass_zero_stats(FIRSTPASS_STATS *section); #ifdef __cplusplus } // extern "C" diff --git a/libaom/av1/encoder/global_motion.c b/libaom/av1/encoder/global_motion.c index e35a208..b8b13c3 100644 --- a/libaom/av1/encoder/global_motion.c +++ b/libaom/av1/encoder/global_motion.c @@ -32,17 +32,24 @@ #define MIN_INLIER_PROB 0.1 #define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR) -#define USE_GM_FEATURE_BASED 1 // Border over which to compute the global motion #define ERRORADV_BORDER 0 // Number of pyramid levels in disflow computation -#define N_LEVELS 5 +#define N_LEVELS 2 // Size of square patches in the disflow dense grid -#define PATCH_SIZE 5 +#define PATCH_SIZE 8 +// Center point of square patch +#define PATCH_CENTER ((PATCH_SIZE + 1) >> 1) +// Step size between patches, lower value means greater patch overlap +#define PATCH_STEP 1 // Minimum size of border padding for disflow #define MIN_PAD 7 +// Warp error convergence threshold for disflow +#define DISFLOW_ERROR_TR 0.01 +// Max number of iterations if warp convergence is not found +#define DISFLOW_MAX_ITR 10 // Struct for an image pyramid typedef struct { @@ -104,7 +111,7 @@ static void convert_to_params(const double *params, int32_t *model) { void av1_convert_model_to_params(const double *params, WarpedMotionParams *model) { convert_to_params(params, model->wmmat); - model->wmtype = get_gmtype(model); + model->wmtype = get_wmtype(model); model->invalid = 0; } @@ -237,7 +244,7 @@ int64_t av1_refine_integerized_param(WarpedMotionParams *wm, } } force_wmtype(wm, wmtype); - wm->wmtype = get_gmtype(wm); + wm->wmtype = get_wmtype(wm); return best_error; } @@ -268,7 +275,6 @@ static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm, return buf_8bit; } -#if USE_GM_FEATURE_BASED 
static int compute_global_motion_feature_based( TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion, double *params_by_motion, @@ -323,7 +329,7 @@ static int compute_global_motion_feature_based( } return 0; } -#else + static INLINE RansacFuncDouble get_ransac_double_prec_type(TransformationType type) { switch (type) { @@ -334,6 +340,35 @@ get_ransac_double_prec_type(TransformationType type) { } } +// Don't use points around the frame border since they are less reliable +static INLINE int valid_point(int x, int y, int width, int height) { + return (x > (PATCH_SIZE + PATCH_CENTER)) && + (x < (width - PATCH_SIZE - PATCH_CENTER)) && + (y > (PATCH_SIZE + PATCH_CENTER)) && + (y < (height - PATCH_SIZE - PATCH_CENTER)); +} + +static int determine_disflow_correspondence(int *frm_corners, + int num_frm_corners, double *flow_u, + double *flow_v, int width, + int height, int stride, + double *correspondences) { + int num_correspondences = 0; + int x, y; + for (int i = 0; i < num_frm_corners; ++i) { + x = frm_corners[2 * i]; + y = frm_corners[2 * i + 1]; + if (valid_point(x, y, width, height)) { + correspondences[4 * num_correspondences] = x; + correspondences[4 * num_correspondences + 1] = y; + correspondences[4 * num_correspondences + 2] = x + flow_u[y * stride + x]; + correspondences[4 * num_correspondences + 3] = y + flow_v[y * stride + x]; + num_correspondences++; + } + } + return num_correspondences; +} + double getCubicValue(double p[4], double x) { return p[1] + 0.5 * x * (p[2] - p[0] + @@ -436,21 +471,24 @@ unsigned char interpolate(unsigned char *ref, double x, double y, int width, // Warps a block using flow vector [u, v] and computes the mse double compute_warp_and_error(unsigned char *ref, unsigned char *frm, int width, - int height, int stride, double u, double v) { + int height, int stride, int x, int y, double u, + double v, int16_t *dt) { int i, j; - double warped, x, y; + unsigned char warped; + 
double x_w, y_w; double mse = 0; - double err = 0; - for (i = 0; i < height; ++i) - for (j = 0; j < width; ++j) { - x = (double)j - u; - y = (double)i - v; - warped = interpolate(ref, x, y, width, height, stride); + int16_t err = 0; + for (i = y; i < y + PATCH_SIZE; ++i) + for (j = x; j < x + PATCH_SIZE; ++j) { + x_w = (double)j + u; + y_w = (double)i + v; + warped = interpolate(ref, x_w, y_w, width, height, stride); err = warped - frm[j + i * stride]; mse += err * err; + dt[(i - y) * PATCH_SIZE + (j - x)] = err; } - mse /= (width * height); + mse /= (PATCH_SIZE * PATCH_SIZE); return mse; } @@ -465,19 +503,21 @@ double compute_warp_and_error(unsigned char *ref, unsigned char *frm, int width, // 2.) b = |sum(dx * dt)| // |sum(dy * dt)| // Where the sums are computed over a square window of PATCH_SIZE. -static INLINE void compute_flow_system(const double *dx, const double *dy, - const double *dt, int stride, double *M, - double *b) { +static INLINE void compute_flow_system(const double *dx, int dx_stride, + const double *dy, int dy_stride, + const int16_t *dt, int dt_stride, + double *M, double *b) { for (int i = 0; i < PATCH_SIZE; i++) { for (int j = 0; j < PATCH_SIZE; j++) { - M[0] += dx[i * stride + j] * dx[i * stride + j]; - M[1] += dx[i * stride + j] * dy[i * stride + j]; - M[3] += dy[i * stride + j] * dy[i * stride + j]; + M[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j]; + M[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j]; + M[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j]; - b[0] += dx[i * stride + j] * dt[i * stride + j]; - b[1] += dy[i * stride + j] * dt[i * stride + j]; + b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j]; + b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j]; } } + M[2] = M[1]; } @@ -501,6 +541,7 @@ static INLINE void solve_2x2_system(const double *M, const double *b, output_vec[1] = -M[2] * mult_b0 + M_0 * mult_b1; } +/* static INLINE void image_difference(const uint8_t *src, int src_stride, const uint8_t *ref, int 
ref_stride, int16_t *dst, int dst_stride, int height, @@ -515,6 +556,7 @@ static INLINE void image_difference(const uint8_t *src, int src_stride, } } } +*/ // Compute an image gradient using a sobel filter. // If dir == 1, compute the x gradient. If dir == 0, compute y. This function @@ -523,7 +565,7 @@ static INLINE void image_difference(const uint8_t *src, int src_stride, static INLINE void sobel_xy_image_gradient(const uint8_t *src, int src_stride, double *dst, int dst_stride, int height, int width, int dir) { - double norm = 1.0 / 8; + double norm = 1.0; // TODO(sarahparker) experiment with doing this over larger block sizes const int block_unit = 8; // Filter in 8x8 blocks to eventually make use of optimized convolve function @@ -606,6 +648,24 @@ static void compute_flow_pyramids(unsigned char *frm, const int frm_width, frm_pyr->heights[0], frm_pyr->widths[0], frm_pyr->strides[0]); + if (compute_grad) { + cur_width = frm_pyr->widths[0]; + cur_height = frm_pyr->heights[0]; + cur_stride = frm_pyr->strides[0]; + cur_loc = frm_pyr->level_loc[0]; + assert(frm_pyr->has_gradient && frm_pyr->level_dx_buffer != NULL && + frm_pyr->level_dy_buffer != NULL); + // Computation x gradient + sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride, + frm_pyr->level_dx_buffer + cur_loc, cur_stride, + cur_height, cur_width, 1); + + // Computation y gradient + sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride, + frm_pyr->level_dy_buffer + cur_loc, cur_stride, + cur_height, cur_width, 0); + } + // Start at the finest level and resize down to the coarsest level for (int level = 1; level < n_levels; ++level) { update_level_dims(frm_pyr, level); @@ -636,6 +696,86 @@ static void compute_flow_pyramids(unsigned char *frm, const int frm_width, } } +static INLINE void compute_flow_at_point(unsigned char *frm, unsigned char *ref, + double *dx, double *dy, int x, int y, + int width, int height, int stride, + double *u, double *v) { + double M[4] = { 0 }; + 
double b[2] = { 0 }; + double tmp_output_vec[2] = { 0 }; + double error = 0; + int16_t dt[PATCH_SIZE * PATCH_SIZE]; + double o_u = *u; + double o_v = *v; + + for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { + error = compute_warp_and_error(ref, frm, width, height, stride, x, y, *u, + *v, dt); + if (error <= DISFLOW_ERROR_TR) break; + compute_flow_system(dx, stride, dy, stride, dt, PATCH_SIZE, M, b); + solve_2x2_system(M, b, tmp_output_vec); + *u += tmp_output_vec[0]; + *v += tmp_output_vec[1]; + } + if (fabs(*u - o_u) > PATCH_SIZE || fabs(*v - o_u) > PATCH_SIZE) { + *u = o_u; + *v = o_v; + } +} + +// make sure flow_u and flow_v start at 0 +static void compute_flow_field(ImagePyramid *frm_pyr, ImagePyramid *ref_pyr, + double *flow_u, double *flow_v) { + int cur_width, cur_height, cur_stride, cur_loc, patch_loc, patch_center; + double *u_upscale = + aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u)); + double *v_upscale = + aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v)); + + assert(frm_pyr->n_levels == ref_pyr->n_levels); + + // Compute flow field from coarsest to finest level of the pyramid + for (int level = frm_pyr->n_levels - 1; level >= 0; --level) { + cur_width = frm_pyr->widths[level]; + cur_height = frm_pyr->heights[level]; + cur_stride = frm_pyr->strides[level]; + cur_loc = frm_pyr->level_loc[level]; + + for (int i = PATCH_SIZE; i < cur_height - PATCH_SIZE; i += PATCH_STEP) { + for (int j = PATCH_SIZE; j < cur_width - PATCH_SIZE; j += PATCH_STEP) { + patch_loc = i * cur_stride + j; + patch_center = patch_loc + PATCH_CENTER * cur_stride + PATCH_CENTER; + compute_flow_at_point(frm_pyr->level_buffer + cur_loc, + ref_pyr->level_buffer + cur_loc, + frm_pyr->level_dx_buffer + cur_loc + patch_loc, + frm_pyr->level_dy_buffer + cur_loc + patch_loc, j, + i, cur_width, cur_height, cur_stride, + flow_u + patch_center, flow_v + patch_center); + } + } + // TODO(sarahparker) Replace this with upscale function in resize.c + if 
(level > 0) { + int h_upscale = frm_pyr->heights[level - 1]; + int w_upscale = frm_pyr->widths[level - 1]; + int s_upscale = frm_pyr->strides[level - 1]; + for (int i = 0; i < h_upscale; ++i) { + for (int j = 0; j < w_upscale; ++j) { + u_upscale[j + i * s_upscale] = + flow_u[(int)(j >> 1) + (int)(i >> 1) * cur_stride]; + v_upscale[j + i * s_upscale] = + flow_v[(int)(j >> 1) + (int)(i >> 1) * cur_stride]; + } + } + memcpy(flow_u, u_upscale, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u)); + memcpy(flow_v, v_upscale, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v)); + } + } + aom_free(u_upscale); + aom_free(v_upscale); +} + static int compute_global_motion_disflow_based( TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref, int bit_depth, int *num_inliers_by_motion, double *params_by_motion, @@ -647,6 +787,11 @@ static int compute_global_motion_disflow_based( const int ref_width = ref->y_width; const int ref_height = ref->y_height; const int pad_size = AOMMAX(PATCH_SIZE, MIN_PAD); + int num_frm_corners; + int num_correspondences; + double *correspondences; + int frm_corners[2 * MAX_CORNERS]; + RansacFuncDouble ransac = get_ransac_double_prec_type(type); assert(frm_width == ref_width); assert(frm_height == ref_height); @@ -683,29 +828,63 @@ static int compute_global_motion_disflow_based( compute_flow_pyramids(ref_buffer, ref_width, ref_height, ref->y_stride, n_levels, pad_size, compute_gradient, ref_pyr); - // TODO(sarahparker) Implement the rest of DISFlow, currently only the image - // pyramid is implemented. 
- (void)num_inliers_by_motion; - (void)params_by_motion; - (void)num_motions; - (void)type; + double *flow_u = + aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u)); + double *flow_v = + aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v)); + + memset(flow_u, 0, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u)); + memset(flow_v, 0, + frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v)); + + compute_flow_field(frm_pyr, ref_pyr, flow_u, flow_v); + + // compute interest points in images using FAST features + num_frm_corners = fast_corner_detect(frm_buffer, frm_width, frm_height, + frm->y_stride, frm_corners, MAX_CORNERS); + // find correspondences between the two images using the flow field + correspondences = aom_malloc(num_frm_corners * 4 * sizeof(*correspondences)); + num_correspondences = determine_disflow_correspondence( + frm_corners, num_frm_corners, flow_u, flow_v, frm_width, frm_height, + frm_pyr->strides[0], correspondences); + ransac(correspondences, num_correspondences, num_inliers_by_motion, + params_by_motion, num_motions); + free_pyramid(frm_pyr); free_pyramid(ref_pyr); + aom_free(correspondences); + aom_free(flow_u); + aom_free(flow_v); + // Set num_inliers = 0 for motions with too few inliers so they are ignored. + for (int i = 0; i < num_motions; ++i) { + if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) { + num_inliers_by_motion[i] = 0; + } + } + + // Return true if any one of the motions has inliers. 
+ for (int i = 0; i < num_motions; ++i) { + if (num_inliers_by_motion[i] > 0) return 1; + } return 0; } -#endif int av1_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref, int bit_depth, + GlobalMotionEstimationType gm_estimation_type, int *num_inliers_by_motion, double *params_by_motion, int num_motions) { -#if USE_GM_FEATURE_BASED - return compute_global_motion_feature_based(type, frm, ref, bit_depth, - num_inliers_by_motion, - params_by_motion, num_motions); -#else - return compute_global_motion_disflow_based(type, frm, ref, bit_depth, - num_inliers_by_motion, - params_by_motion, num_motions); -#endif + switch (gm_estimation_type) { + case GLOBAL_MOTION_FEATURE_BASED: + return compute_global_motion_feature_based(type, frm, ref, bit_depth, + num_inliers_by_motion, + params_by_motion, num_motions); + case GLOBAL_MOTION_DISFLOW_BASED: + return compute_global_motion_disflow_based(type, frm, ref, bit_depth, + num_inliers_by_motion, + params_by_motion, num_motions); + default: assert(0 && "Unknown global motion estimation type"); + } + return 0; } diff --git a/libaom/av1/encoder/global_motion.h b/libaom/av1/encoder/global_motion.h index 42cf221..2cfddad 100644 --- a/libaom/av1/encoder/global_motion.h +++ b/libaom/av1/encoder/global_motion.h @@ -22,6 +22,11 @@ extern "C" { #define RANSAC_NUM_MOTIONS 1 +typedef enum { + GLOBAL_MOTION_FEATURE_BASED, + GLOBAL_MOTION_DISFLOW_BASED, +} GlobalMotionEstimationType; + void av1_convert_model_to_params(const double *params, WarpedMotionParams *model); @@ -56,6 +61,7 @@ int64_t av1_refine_integerized_param(WarpedMotionParams *wm, */ int av1_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref, int bit_depth, + GlobalMotionEstimationType gm_estimation_type, int *num_inliers_by_motion, double *params_by_motion, int num_motions); #ifdef __cplusplus diff --git a/libaom/av1/encoder/gop_structure.c b/libaom/av1/encoder/gop_structure.c new file mode 
100644 index 0000000..73cb0ed --- /dev/null +++ b/libaom/av1/encoder/gop_structure.c @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom/aom_encoder.h" + +#include "aom_ports/system_state.h" + +#include "av1/common/onyxc_int.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/gop_structure.h" + +// Set parameters for frames between 'start' and 'end' (excluding both). +static void set_multi_layer_params(GF_GROUP *const gf_group, int start, int end, + int *frame_ind, int arf_ind, int level) { + assert(level >= MIN_PYRAMID_LVL); + const int num_frames_to_process = end - start - 1; + assert(num_frames_to_process >= 0); + if (num_frames_to_process == 0) return; + + // Either we are at the last level of the pyramid, or we don't have enough + // frames between 'l' and 'r' to create one more level. + if (level == MIN_PYRAMID_LVL || num_frames_to_process < 3) { + // Leaf nodes. 
+ while (++start < end) { + gf_group->update_type[*frame_ind] = LF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->arf_pos_in_gf[*frame_ind] = 0; + gf_group->arf_update_idx[*frame_ind] = arf_ind; + gf_group->pyramid_level[*frame_ind] = MIN_PYRAMID_LVL; + ++gf_group->pyramid_lvl_nodes[MIN_PYRAMID_LVL]; + ++(*frame_ind); + } + } else { + const int m = (start + end) / 2; + const int arf_pos_in_gf = *frame_ind; + + // Internal ARF. + gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE; + gf_group->arf_src_offset[*frame_ind] = m - start - 1; + gf_group->arf_pos_in_gf[*frame_ind] = 0; + gf_group->arf_update_idx[*frame_ind] = 1; // mark all internal ARF 1 + gf_group->pyramid_level[*frame_ind] = level; + ++gf_group->pyramid_lvl_nodes[level]; + ++(*frame_ind); + + // Frames displayed before this internal ARF. + set_multi_layer_params(gf_group, start, m, frame_ind, 1, level - 1); + + // Overlay for internal ARF. + gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE; + gf_group->arf_src_offset[*frame_ind] = 0; + gf_group->arf_pos_in_gf[*frame_ind] = arf_pos_in_gf; // For bit allocation. + gf_group->arf_update_idx[*frame_ind] = 1; + gf_group->pyramid_level[*frame_ind] = MIN_PYRAMID_LVL; + ++(*frame_ind); + + // Frames displayed after this internal ARF. + set_multi_layer_params(gf_group, m, end, frame_ind, arf_ind, level - 1); + } +} + +static int construct_multi_layer_gf_structure( + GF_GROUP *const gf_group, int gf_interval, int pyr_height, + FRAME_UPDATE_TYPE first_frame_update_type) { + gf_group->pyramid_height = pyr_height; + av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL); + int frame_index = 0; + + // Keyframe / Overlay frame / Golden frame. 
+ assert(gf_interval >= 1); + assert(first_frame_update_type == KF_UPDATE || + first_frame_update_type == OVERLAY_UPDATE || + first_frame_update_type == GF_UPDATE); + gf_group->update_type[frame_index] = first_frame_update_type; + gf_group->arf_src_offset[frame_index] = 0; + gf_group->arf_pos_in_gf[frame_index] = 0; + gf_group->arf_update_idx[frame_index] = 0; + gf_group->pyramid_level[frame_index] = MIN_PYRAMID_LVL; + ++frame_index; + + // ALTREF. + const int use_altref = (gf_group->pyramid_height > 0); + if (use_altref) { + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->arf_src_offset[frame_index] = gf_interval - 1; + gf_group->arf_pos_in_gf[frame_index] = 0; + gf_group->arf_update_idx[frame_index] = 0; + gf_group->pyramid_level[frame_index] = gf_group->pyramid_height; + ++frame_index; + } + + // Rest of the frames. + const int next_height = + use_altref ? gf_group->pyramid_height - 1 : gf_group->pyramid_height; + assert(next_height >= MIN_PYRAMID_LVL); + set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0, + next_height); + return frame_index; +} + +#define CHECK_GF_PARAMETER 0 +#if CHECK_GF_PARAMETER +void check_frame_params(GF_GROUP *const gf_group, int gf_interval) { + static const char *update_type_strings[FRAME_UPDATE_TYPES] = { + "KF_UPDATE", "LF_UPDATE", "GF_UPDATE", + "ARF_UPDATE", "OVERLAY_UPDATE", "INTNL_OVERLAY_UPDATE", + "INTNL_ARF_UPDATE" + }; + FILE *fid = fopen("GF_PARAMS.txt", "a"); + + fprintf(fid, "\ngf_interval = {%d}\n", gf_interval); + for (int i = 0; i <= gf_group->size; ++i) { + fprintf(fid, "#%2d : %s %d %d %d %d\n", i, + update_type_strings[gf_group->update_type[i]], + gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i], + gf_group->arf_update_idx[i], gf_group->pyramid_level[i]); + } + + fprintf(fid, "number of nodes in each level: \n"); + for (int i = 0; i < gf_group->pyramid_height; ++i) { + fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]); + } + fprintf(fid, "\n"); + fclose(fid); +} +#endif 
// CHECK_GF_PARAMETER + +static INLINE int max_pyramid_height_from_width(int pyramid_width) { + if (pyramid_width > 12) return 4; + if (pyramid_width > 6) return 3; + if (pyramid_width > 3) return 2; + if (pyramid_width > 1) return 1; + return 0; +} + +static int get_pyramid_height(const AV1_COMP *const cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + assert(IMPLIES(cpi->oxcf.gf_max_pyr_height == MIN_PYRAMID_LVL, + !rc->source_alt_ref_pending)); // define_gf_group() enforced. + if (!rc->source_alt_ref_pending) { + return MIN_PYRAMID_LVL; + } + assert(cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL); + if (!cpi->internal_altref_allowed) { + assert(MIN_PYRAMID_LVL + 1 <= cpi->oxcf.gf_max_pyr_height); + return MIN_PYRAMID_LVL + 1; + } + return AOMMIN(max_pyramid_height_from_width(rc->baseline_gf_interval), + cpi->oxcf.gf_max_pyr_height); +} + +void av1_gop_setup_structure(AV1_COMP *cpi, + const EncodeFrameParams *const frame_params) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const int key_frame = (frame_params->frame_type == KEY_FRAME); + const FRAME_UPDATE_TYPE first_frame_update_type = + key_frame ? KF_UPDATE + : rc->source_alt_ref_active ? OVERLAY_UPDATE : GF_UPDATE; + gf_group->size = construct_multi_layer_gf_structure( + gf_group, rc->baseline_gf_interval, get_pyramid_height(cpi), + first_frame_update_type); + + // We need to configure the frame at the end of the sequence + 1 that + // will be the start frame for the next group. Otherwise prior to the + // call to av1_get_second_pass_params(), the data will be undefined. + gf_group->update_type[gf_group->size] = + (rc->source_alt_ref_pending) ? 
OVERLAY_UPDATE : GF_UPDATE; + gf_group->arf_update_idx[gf_group->size] = 0; + gf_group->arf_pos_in_gf[gf_group->size] = 0; + +#if CHECK_GF_PARAMETER + check_frame_params(gf_group, rc->baseline_gf_interval); +#endif +} diff --git a/libaom/av1/encoder/gop_structure.h b/libaom/av1/encoder/gop_structure.h new file mode 100644 index 0000000..d9d5ae7 --- /dev/null +++ b/libaom/av1/encoder/gop_structure.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_ +#define AOM_AV1_ENCODER_GOP_STRUCTURE_H_ + +#include "av1/common/onyxc_int.h" +#include "av1/encoder/ratectrl.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct EncodeFrameParams; + +// Set up the Group-Of-Pictures structure for this GF_GROUP. This involves +// deciding where to place the various FRAME_UPDATE_TYPEs in the group. It does +// this primarily by setting the contents of +// cpi->twopass.gf_group.update_type[]. 
+void av1_gop_setup_structure( + struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_GOP_STRUCTURE_H_ diff --git a/libaom/av1/encoder/hash_motion.c b/libaom/av1/encoder/hash_motion.c index e85a516..00915e5 100644 --- a/libaom/av1/encoder/hash_motion.c +++ b/libaom/av1/encoder/hash_motion.c @@ -147,7 +147,8 @@ static void hash_table_add_to_table(hash_table *p_hash_table, } } -int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value) { +int32_t av1_hash_table_count(const hash_table *p_hash_table, + uint32_t hash_value) { if (p_hash_table->p_lookup_table[hash_value] == NULL) { return 0; } else { @@ -392,8 +393,9 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size, uint32_t *hash_value1, uint32_t *hash_value2, int use_highbitdepth, MACROBLOCK *x) { uint32_t to_hash[4]; - const int add_value = hash_block_size_to_index(block_size) << crc_bits; + int add_value = hash_block_size_to_index(block_size); assert(add_value >= 0); + add_value <<= crc_bits; const int crc_mask = (1 << crc_bits) - 1; // 2x2 subblock hash values in current CU diff --git a/libaom/av1/encoder/hash_motion.h b/libaom/av1/encoder/hash_motion.h index df3ec32..ed9bb6e 100644 --- a/libaom/av1/encoder/hash_motion.h +++ b/libaom/av1/encoder/hash_motion.h @@ -37,7 +37,8 @@ typedef struct _hash_table { void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x); void av1_hash_table_destroy(hash_table *p_hash_table); void av1_hash_table_create(hash_table *p_hash_table); -int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value); +int32_t av1_hash_table_count(const hash_table *p_hash_table, + uint32_t hash_value); Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, uint32_t hash_value); int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1, diff --git a/libaom/av1/encoder/level.c b/libaom/av1/encoder/level.c new 
file mode 100644 index 0000000..1668bdf --- /dev/null +++ b/libaom/av1/encoder/level.c @@ -0,0 +1,647 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "aom_ports/system_state.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/level.h" + +#define UNDEFINED_LEVEL \ + { \ + .level = SEQ_LEVEL_MAX, .max_picture_size = 0, .max_h_size = 0, \ + .max_v_size = 0, .max_display_rate = 0, .max_decode_rate = 0, \ + .max_header_rate = 0, .main_mbps = 0, .high_mbps = 0, .main_cr = 0, \ + .high_cr = 0, .max_tiles = 0, .max_tile_cols = 0 \ + } + +static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = { + { .level = SEQ_LEVEL_2_0, + .max_picture_size = 147456, + .max_h_size = 2048, + .max_v_size = 1152, + .max_display_rate = 4423680L, + .max_decode_rate = 5529600L, + .max_header_rate = 150, + .main_mbps = 1.5, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 8, + .max_tile_cols = 4 }, + { .level = SEQ_LEVEL_2_1, + .max_picture_size = 278784, + .max_h_size = 2816, + .max_v_size = 1584, + .max_display_rate = 8363520L, + .max_decode_rate = 10454400L, + .max_header_rate = 150, + .main_mbps = 3.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 8, + .max_tile_cols = 4 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_3_0, + .max_picture_size = 665856, + .max_h_size = 4352, + .max_v_size = 2448, + .max_display_rate = 19975680L, + .max_decode_rate = 24969600L, + .max_header_rate = 150, + .main_mbps = 6.0, + .high_mbps = 0, + .main_cr 
= 2.0, + .high_cr = 0, + .max_tiles = 16, + .max_tile_cols = 6 }, + { .level = SEQ_LEVEL_3_1, + .max_picture_size = 1065024, + .max_h_size = 5504, + .max_v_size = 3096, + .max_display_rate = 31950720L, + .max_decode_rate = 39938400L, + .max_header_rate = 150, + .main_mbps = 10.0, + .high_mbps = 0, + .main_cr = 2.0, + .high_cr = 0, + .max_tiles = 16, + .max_tile_cols = 6 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_4_0, + .max_picture_size = 2359296, + .max_h_size = 6144, + .max_v_size = 3456, + .max_display_rate = 70778880L, + .max_decode_rate = 77856768L, + .max_header_rate = 300, + .main_mbps = 12.0, + .high_mbps = 30.0, + .main_cr = 4.0, + .high_cr = 4.0, + .max_tiles = 32, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_4_1, + .max_picture_size = 2359296, + .max_h_size = 6144, + .max_v_size = 3456, + .max_display_rate = 141557760L, + .max_decode_rate = 155713536L, + .max_header_rate = 300, + .main_mbps = 20.0, + .high_mbps = 50.0, + .main_cr = 4.0, + .high_cr = 4.0, + .max_tiles = 32, + .max_tile_cols = 8 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + { .level = SEQ_LEVEL_5_0, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 267386880L, + .max_decode_rate = 273715200L, + .max_header_rate = 300, + .main_mbps = 30.0, + .high_mbps = 100.0, + .main_cr = 6.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_1, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 534773760L, + .max_decode_rate = 547430400L, + .max_header_rate = 300, + .main_mbps = 40.0, + .high_mbps = 160.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_2, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 1069547520L, + .max_decode_rate = 1094860800L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + 
.max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_5_3, + .max_picture_size = 8912896, + .max_h_size = 8192, + .max_v_size = 4352, + .max_display_rate = 1069547520L, + .max_decode_rate = 1176502272L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 64, + .max_tile_cols = 8 }, + { .level = SEQ_LEVEL_6_0, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 1069547520L, + .max_decode_rate = 1176502272L, + .max_header_rate = 300, + .main_mbps = 60.0, + .high_mbps = 240.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_1, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 2139095040L, + .max_decode_rate = 2189721600L, + .max_header_rate = 300, + .main_mbps = 100.0, + .high_mbps = 480.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_2, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 4278190080L, + .max_decode_rate = 4379443200L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + { .level = SEQ_LEVEL_6_3, + .max_picture_size = 35651584, + .max_h_size = 16384, + .max_v_size = 8704, + .max_display_rate = 4278190080L, + .max_decode_rate = 4706009088L, + .max_header_rate = 300, + .main_mbps = 160.0, + .high_mbps = 800.0, + .main_cr = 8.0, + .high_cr = 4.0, + .max_tiles = 128, + .max_tile_cols = 16 }, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, + UNDEFINED_LEVEL, +}; + +typedef enum { + LUMA_PIC_SIZE_TOO_LARGE, + LUMA_PIC_H_SIZE_TOO_LARGE, + LUMA_PIC_V_SIZE_TOO_LARGE, + LUMA_PIC_H_SIZE_TOO_SMALL, + LUMA_PIC_V_SIZE_TOO_SMALL, + TOO_MANY_TILE_COLUMNS, + TOO_MANY_TILES, + TILE_RATE_TOO_HIGH, + TILE_TOO_LARGE, + 
SUPERRES_TILE_WIDTH_TOO_LARGE, + CROPPED_TILE_WIDTH_TOO_SMALL, + CROPPED_TILE_HEIGHT_TOO_SMALL, + TILE_WIDTH_INVALID, + FRAME_HEADER_RATE_TOO_HIGH, + DISPLAY_RATE_TOO_HIGH, + DECODE_RATE_TOO_HIGH, + CR_TOO_SMALL, + + TARGET_LEVEL_FAIL_IDS, + TARGET_LEVEL_OK, +} TARGET_LEVEL_FAIL_ID; + +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { + "The picture size is too large.", + "The picture width is too large.", + "The picture height is too large.", + "The picture width is too small.", + "The picture height is too small.", + "Too many tile columns are used.", + "Too many tiles are used.", + "The tile rate is too high.", + "The tile size is too large.", + "The superres tile width is too large.", + "The cropped tile width is less than 8.", + "The cropped tile height is less than 8.", + "The tile width is invalid.", + "The frame header rate is too high.", + "The display luma sample rate is too high.", + "The decoded luma sample rate is too high.", + "The compression ratio is too small.", +}; + +static double get_min_cr(const AV1LevelSpec *const level_spec, int tier, + int is_still_picture, int64_t decoded_sample_rate) { + if (is_still_picture) return 0.8; + const double min_cr_basis = tier ? 
level_spec->high_cr : level_spec->main_cr; + const double speed_adj = + (double)decoded_sample_rate / level_spec->max_display_rate; + return AOMMAX(min_cr_basis * speed_adj, 0.8); +} + +static TARGET_LEVEL_FAIL_ID check_level_constraints( + const AV1LevelSpec *const target_level_spec, + const AV1LevelSpec *const level_spec, + const AV1LevelStats *const level_stats, int tier, int is_still_picture) { + const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture, + level_spec->max_decode_rate); + TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK; + + do { + if (level_spec->max_picture_size > target_level_spec->max_picture_size) { + fail_id = LUMA_PIC_SIZE_TOO_LARGE; + break; + } + + if (level_spec->max_h_size > target_level_spec->max_h_size) { + fail_id = LUMA_PIC_H_SIZE_TOO_LARGE; + break; + } + + if (level_spec->max_v_size > target_level_spec->max_v_size) { + fail_id = LUMA_PIC_V_SIZE_TOO_LARGE; + break; + } + + if (level_spec->max_tile_cols > target_level_spec->max_tile_cols) { + fail_id = TOO_MANY_TILE_COLUMNS; + break; + } + + if (level_spec->max_tiles > target_level_spec->max_tiles) { + fail_id = TOO_MANY_TILES; + break; + } + + if (level_spec->max_header_rate > target_level_spec->max_header_rate) { + fail_id = FRAME_HEADER_RATE_TOO_HIGH; + break; + } + + if (level_spec->max_display_rate > target_level_spec->max_display_rate) { + fail_id = DISPLAY_RATE_TOO_HIGH; + break; + } + + if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) { + fail_id = DECODE_RATE_TOO_HIGH; + break; + } + + if (level_spec->max_tile_rate > target_level_spec->max_tiles * 120) { + fail_id = TILE_RATE_TOO_HIGH; + break; + } + + if (level_stats->max_tile_size > 4096 * 2304) { + fail_id = TILE_TOO_LARGE; + break; + } + + if (level_stats->max_superres_tile_width > MAX_TILE_WIDTH) { + fail_id = SUPERRES_TILE_WIDTH_TOO_LARGE; + break; + } + + if (level_stats->min_cropped_tile_width < 8) { + fail_id = CROPPED_TILE_WIDTH_TOO_SMALL; + break; + } + + if 
(level_stats->min_cropped_tile_height < 8) { + fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL; + break; + } + + if (level_stats->min_frame_width < 16) { + fail_id = LUMA_PIC_H_SIZE_TOO_SMALL; + break; + } + + if (level_stats->min_frame_height < 16) { + fail_id = LUMA_PIC_V_SIZE_TOO_SMALL; + break; + } + + if (!level_stats->tile_width_is_valid) { + fail_id = TILE_WIDTH_INVALID; + break; + } + + if (level_stats->min_cr < min_cr) { + fail_id = CR_TOO_SMALL; + break; + } + } while (0); + + return fail_id; +} + +static INLINE int is_in_operating_point(int operating_point, + int temporal_layer_id, + int spatial_layer_id) { + if (!operating_point) return 1; + + return ((operating_point >> temporal_layer_id) & 1) && + ((operating_point >> (spatial_layer_id + 8)) & 1); +} + +static void get_tile_stats(const AV1_COMP *const cpi, int *max_tile_size, + int *max_superres_tile_width, + int *min_cropped_tile_width, + int *min_cropped_tile_height, + int *tile_width_valid) { + const AV1_COMMON *const cm = &cpi->common; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; + const int superres_scale_denominator = cm->superres_scale_denominator; + + *max_tile_size = 0; + *max_superres_tile_width = 0; + *min_cropped_tile_width = INT_MAX; + *min_cropped_tile_height = INT_MAX; + *tile_width_valid = 1; + + for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { + const TileInfo *const tile_info = + &cpi->tile_data[tile_row * cm->tile_cols + tile_col].tile_info; + const int tile_width = + (tile_info->mi_col_end - tile_info->mi_col_start) * MI_SIZE; + const int tile_height = + (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE; + const int tile_size = tile_width * tile_height; + *max_tile_size = AOMMAX(*max_tile_size, tile_size); + + const int supperres_tile_width = + tile_width * superres_scale_denominator / SCALE_NUMERATOR; + *max_superres_tile_width = + AOMMAX(*max_superres_tile_width, 
supperres_tile_width); + + const int cropped_tile_width = + cm->width - tile_info->mi_col_start * MI_SIZE; + const int cropped_tile_height = + cm->height - tile_info->mi_row_start * MI_SIZE; + *min_cropped_tile_width = + AOMMIN(*min_cropped_tile_width, cropped_tile_width); + *min_cropped_tile_height = + AOMMIN(*min_cropped_tile_height, cropped_tile_height); + + const int is_right_most_tile = tile_info->mi_col_end == cm->mi_cols; + if (!is_right_most_tile) { + if (av1_superres_scaled(cm)) + *tile_width_valid &= tile_width >= 128; + else + *tile_width_valid &= tile_width >= 64; + } + } + } +} + +static int store_frame_record(int64_t ts_start, int64_t ts_end, int pic_size, + int frame_header_count, int tiles, int show_frame, + int show_existing_frame, + FrameWindowBuffer *const buffer) { + if (buffer->num < FRAME_WINDOW_SIZE) { + ++buffer->num; + } else { + buffer->start = (buffer->start + 1) % FRAME_WINDOW_SIZE; + } + const int new_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE; + FrameRecord *const record = &buffer->buf[new_idx]; + record->ts_start = ts_start; + record->ts_end = ts_end; + record->pic_size = pic_size; + record->frame_header_count = frame_header_count; + record->tiles = tiles; + record->show_frame = show_frame; + record->show_existing_frame = show_existing_frame; + + return new_idx; +} + +// Count the number of frames encoded in the last "duration" ticks, in display +// time. +static int count_frames(const FrameWindowBuffer *const buffer, + int64_t duration) { + const int current_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE; + // Assume current frame is shown frame. 
+ assert(buffer->buf[current_idx].show_frame); + + const int64_t current_time = buffer->buf[current_idx].ts_end; + const int64_t time_limit = AOMMAX(current_time - duration, 0); + int num_frames = 1; + int index = current_idx - 1; + for (int i = buffer->num - 2; i >= 0; --i, --index, ++num_frames) { + if (index < 0) index = FRAME_WINDOW_SIZE - 1; + const FrameRecord *const record = &buffer->buf[index]; + if (!record->show_frame) continue; + const int64_t ts_start = record->ts_start; + if (ts_start < time_limit) break; + } + + return num_frames; +} + +// Scan previously encoded frames and update level metrics accordingly. +static void scan_past_frames(const FrameWindowBuffer *const buffer, + int num_frames_to_scan, + AV1LevelSpec *const level_spec) { + const int num_frames_in_buffer = buffer->num; + int index = (buffer->start + num_frames_in_buffer - 1) % FRAME_WINDOW_SIZE; + int frame_headers = 0; + int tiles = 0; + int64_t display_samples = 0; + int64_t decoded_samples = 0; + for (int i = 0; i < AOMMIN(num_frames_in_buffer, num_frames_to_scan); ++i) { + const FrameRecord *const record = &buffer->buf[index]; + if (!record->show_existing_frame) { + frame_headers += record->frame_header_count; + decoded_samples += record->pic_size; + } + if (record->show_frame) { + display_samples += record->pic_size; + } + tiles += record->tiles; + --index; + if (index < 0) index = FRAME_WINDOW_SIZE - 1; + } + level_spec->max_header_rate = + AOMMAX(level_spec->max_header_rate, frame_headers); + level_spec->max_display_rate = + AOMMAX(level_spec->max_display_rate, display_samples); + level_spec->max_decode_rate = + AOMMAX(level_spec->max_decode_rate, decoded_samples); + level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tiles); +} + +void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start, + int64_t ts_end) { + AV1_COMMON *const cm = &cpi->common; + const int upscaled_width = cm->superres_upscaled_width; + const int width = cm->width; + const int height = 
cm->height; + const int tile_cols = cm->tile_cols; + const int tile_rows = cm->tile_rows; + const int tiles = tile_cols * tile_rows; + const int luma_pic_size = upscaled_width * height; + const int frame_header_count = cpi->frame_header_count; + const int show_frame = cm->show_frame; + const int show_existing_frame = cm->show_existing_frame; + + // Store info. of current frame into FrameWindowBuffer. + FrameWindowBuffer *const buffer = &cpi->frame_window_buffer; + store_frame_record(ts_start, ts_end, luma_pic_size, frame_header_count, tiles, + show_frame, show_existing_frame, buffer); + // Count the number of frames encoded in the past 1 second. + const int encoded_frames_in_last_second = + show_frame ? count_frames(buffer, TICKS_PER_SEC) : 0; + + int max_tile_size; + int min_cropped_tile_width; + int min_cropped_tile_height; + int max_superres_tile_width; + int tile_width_is_valid; + get_tile_stats(cpi, &max_tile_size, &max_superres_tile_width, + &min_cropped_tile_width, &min_cropped_tile_height, + &tile_width_is_valid); + + const SequenceHeader *const seq_params = &cm->seq_params; + const BITSTREAM_PROFILE profile = seq_params->profile; + const int pic_size_profile_factor = + profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36); + const size_t frame_compressed_size = (size > 129 ? 
size - 128 : 1); + const size_t frame_uncompressed_size = + (luma_pic_size * pic_size_profile_factor) >> 3; + + aom_clear_system_state(); + const double compression_ratio = + frame_uncompressed_size / (double)frame_compressed_size; + const double total_time_encoded = + (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / + (double)TICKS_PER_SEC; + + const int temporal_layer_id = cm->temporal_layer_id; + const int spatial_layer_id = cm->spatial_layer_id; + const int is_still_picture = seq_params->still_picture; + // update level_stats + // TODO(kyslov@) fix the implementation according to buffer model + for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) { + if (!is_in_operating_point(seq_params->operating_point_idc[i], + temporal_layer_id, spatial_layer_id)) { + continue; + } + + AV1LevelInfo *const level_info = &cpi->level_info[i]; + AV1LevelStats *const level_stats = &level_info->level_stats; + + level_stats->max_tile_size = + AOMMAX(level_stats->max_tile_size, max_tile_size); + level_stats->max_superres_tile_width = + AOMMAX(level_stats->max_superres_tile_width, max_superres_tile_width); + level_stats->min_cropped_tile_width = + AOMMIN(level_stats->min_cropped_tile_width, min_cropped_tile_width); + level_stats->min_cropped_tile_height = + AOMMIN(level_stats->min_cropped_tile_height, min_cropped_tile_height); + level_stats->tile_width_is_valid &= tile_width_is_valid; + level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width); + level_stats->min_frame_height = + AOMMIN(level_stats->min_frame_height, height); + level_stats->total_compressed_size += frame_compressed_size; + if (show_frame) level_stats->total_time_encoded = total_time_encoded; + level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio); + + // update level_spec + // TODO(kyslov@) update all spec fields + AV1LevelSpec *const level_spec = &level_info->level_spec; + level_spec->max_picture_size = + AOMMAX(level_spec->max_picture_size, 
luma_pic_size); + level_spec->max_h_size = + AOMMAX(level_spec->max_h_size, cm->superres_upscaled_width); + level_spec->max_v_size = AOMMAX(level_spec->max_v_size, height); + level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols); + level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles); + + if (show_frame) { + scan_past_frames(buffer, encoded_frames_in_last_second, level_spec); + } + + // Check whether target level is met. + const AV1_LEVEL target_seq_level_idx = cpi->target_seq_level_idx[i]; + if (target_seq_level_idx < SEQ_LEVELS) { + const AV1LevelSpec *const target_level_spec = + av1_level_defs + target_seq_level_idx; + const int tier = seq_params->tier[i]; + const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints( + target_level_spec, level_spec, level_stats, tier, is_still_picture); + if (fail_id != TARGET_LEVEL_OK) { + const int target_level_major = 2 + (target_seq_level_idx >> 2); + const int target_level_minor = target_seq_level_idx & 3; + aom_internal_error(&cm->error, AOM_CODEC_ERROR, + "Failed to encode to the target level %d_%d. 
%s", + target_level_major, target_level_minor, + level_fail_messages[fail_id]); + } + } + } +} + +aom_codec_err_t av1_get_seq_level_idx(const AV1_COMP *cpi, int *seq_level_idx) { + const SequenceHeader *const seq_params = &cpi->common.seq_params; + if (!cpi->keep_level_stats) { + for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) { + seq_level_idx[op] = (int)SEQ_LEVEL_MAX; + } + return AOM_CODEC_OK; + } + + const int is_still_picture = seq_params->still_picture; + for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) { + seq_level_idx[op] = (int)SEQ_LEVEL_MAX; + const int tier = seq_params->tier[op]; + const AV1LevelInfo *const level_info = &cpi->level_info[op]; + const AV1LevelStats *const level_stats = &level_info->level_stats; + const AV1LevelSpec *const level_spec = &level_info->level_spec; + for (int level = 0; level < SEQ_LEVELS; ++level) { + const AV1LevelSpec *const target_level_spec = av1_level_defs + level; + const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints( + target_level_spec, level_spec, level_stats, tier, is_still_picture); + if (fail_id == TARGET_LEVEL_OK) { + seq_level_idx[op] = level; + break; + } + } + } + + return AOM_CODEC_OK; +} diff --git a/libaom/av1/encoder/level.h b/libaom/av1/encoder/level.h new file mode 100644 index 0000000..9f1664d --- /dev/null +++ b/libaom/av1/encoder/level.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_LEVEL_H_ +#define AOM_AV1_ENCODER_LEVEL_H_ + +#include "av1/common/enums.h" + +struct AV1_COMP; + +// AV1 Level Specifications +typedef struct { + AV1_LEVEL level; + int max_picture_size; + int max_h_size; + int max_v_size; + int max_header_rate; + int max_tile_rate; + int max_tiles; + int max_tile_cols; + int64_t max_display_rate; + int64_t max_decode_rate; + double main_mbps; + double high_mbps; + double main_cr; + double high_cr; +} AV1LevelSpec; + +typedef struct { + int64_t ts_start; + int64_t ts_end; + int pic_size; + int frame_header_count; + int tiles; + int show_frame; + int show_existing_frame; +} FrameRecord; + +// Record frame info. in a rolling window. +#define FRAME_WINDOW_SIZE 256 +typedef struct { + FrameRecord buf[FRAME_WINDOW_SIZE]; + int num; // Number of FrameRecord stored in the buffer. + int start; // Buffer index of the first FrameRecord. +} FrameWindowBuffer; + +// Used to keep track of AV1 Level Stats. Currently unimplemented. +typedef struct { + uint64_t total_compressed_size; + int max_tile_size; + int max_superres_tile_width; + int min_cropped_tile_width; + int min_cropped_tile_height; + int tile_width_is_valid; + int min_frame_width; + int min_frame_height; + double total_time_encoded; + double min_cr; +} AV1LevelStats; + +typedef struct { + AV1LevelStats level_stats; + AV1LevelSpec level_spec; +} AV1LevelInfo; + +void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start, + int64_t ts_end); + +// Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS]. 
+aom_codec_err_t av1_get_seq_level_idx(const struct AV1_COMP *cpi, + int *seq_level_idx); + +#endif // AOM_AV1_ENCODER_LEVEL_H_ diff --git a/libaom/av1/encoder/lookahead.c b/libaom/av1/encoder/lookahead.c index 1bf8ecb..f5298f7 100644 --- a/libaom/av1/encoder/lookahead.c +++ b/libaom/av1/encoder/lookahead.c @@ -43,7 +43,8 @@ void av1_lookahead_destroy(struct lookahead_ctx *ctx) { struct lookahead_ctx *av1_lookahead_init( unsigned int width, unsigned int height, unsigned int subsampling_x, - unsigned int subsampling_y, int use_highbitdepth, unsigned int depth) { + unsigned int subsampling_y, int use_highbitdepth, unsigned int depth, + const int border_in_pixels, int is_scale) { struct lookahead_ctx *ctx = NULL; // Clamp the lookahead queue depth @@ -61,10 +62,19 @@ struct lookahead_ctx *av1_lookahead_init( ctx->buf = calloc(depth, sizeof(*ctx->buf)); if (!ctx->buf) goto bail; for (i = 0; i < depth; i++) - if (aom_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x, - subsampling_y, use_highbitdepth, - AOM_BORDER_IN_PIXELS, legacy_byte_alignment)) - goto bail; + if (is_scale) { + if (aom_alloc_frame_buffer( + &ctx->buf[i].img, width, height, subsampling_x, subsampling_y, + use_highbitdepth, border_in_pixels, legacy_byte_alignment)) + goto bail; + } else { + aom_free_frame_buffer(&ctx->buf[i].img); + if (aom_realloc_lookahead_buffer( + &ctx->buf[i].img, width, height, subsampling_x, subsampling_y, + use_highbitdepth, AOM_ENC_LOOKAHEAD_BORDER, + legacy_byte_alignment, NULL, NULL, NULL)) + goto bail; + } } return ctx; bail: diff --git a/libaom/av1/encoder/lookahead.h b/libaom/av1/encoder/lookahead.h index e55224c..3b2d94b 100644 --- a/libaom/av1/encoder/lookahead.h +++ b/libaom/av1/encoder/lookahead.h @@ -46,7 +46,8 @@ struct lookahead_ctx { */ struct lookahead_ctx *av1_lookahead_init( unsigned int width, unsigned int height, unsigned int subsampling_x, - unsigned int subsampling_y, int use_highbitdepth, unsigned int depth); + unsigned int subsampling_y, 
int use_highbitdepth, unsigned int depth, + const int border_in_pixels, int is_scale); /**\brief Destroys the lookahead stage */ diff --git a/libaom/av1/encoder/mbgraph.c b/libaom/av1/encoder/mbgraph.c index cc50458..0cb6286 100644 --- a/libaom/av1/encoder/mbgraph.c +++ b/libaom/av1/encoder/mbgraph.c @@ -71,8 +71,8 @@ static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv, xd->mi[0]->mv[0] = x->best_mv; xd->mi[0]->ref_frame[1] = NONE_FRAME; - av1_build_inter_predictors_sby(&cpi->common, xd, mb_row, mb_col, NULL, - BLOCK_16X16); + av1_enc_build_inter_predictor(&cpi->common, xd, mb_row, mb_col, NULL, + BLOCK_16X16, AOM_PLANE_Y, AOM_PLANE_Y); /* restore UMV window */ x->mv_limits = tmp_mv_limits; @@ -364,7 +364,7 @@ static void separate_arf_mbs(AV1_COMP *cpi) { void av1_update_mbgraph_stats(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; int i, n_frames = av1_lookahead_depth(cpi->lookahead); - YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + YV12_BUFFER_CONFIG *golden_ref = &get_ref_frame_buf(cm, GOLDEN_FRAME)->buf; assert(golden_ref != NULL); diff --git a/libaom/av1/encoder/mcomp.c b/libaom/av1/encoder/mcomp.c index 63b4947..f077a4e 100644 --- a/libaom/av1/encoder/mcomp.c +++ b/libaom/av1/encoder/mcomp.c @@ -19,6 +19,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" +#include "aom_ports/system_state.h" #include "av1/common/common.h" #include "av1/common/mvref_common.h" @@ -28,6 +29,7 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/encodemv.h" #include "av1/encoder/mcomp.h" +#include "av1/encoder/partition_strategy.h" #include "av1/encoder/rdopt.h" #include "av1/encoder/reconinter_enc.h" @@ -336,7 +338,7 @@ static unsigned int setup_center_error( int *mvcost[2], unsigned int *sse1, int *distortion) { unsigned int besterr; if (second_pred != NULL) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { DECLARE_ALIGNED(16, uint16_t, 
comp_pred16[MAX_SB_SQUARE]); uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16); if (mask) { @@ -641,7 +643,7 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm, int mask_stride, int invert_mask, int w, int h, unsigned int *sse, int subpel_search) { unsigned int besterr; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]); uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16); if (second_pred != NULL) { @@ -899,7 +901,8 @@ unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x, unsigned int mse; unsigned int sse; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); mse = vfp->vf(dst, dst_stride, src, src_stride, &sse); mse += mv_err_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost, x->mv_cost_stack, x->errorperbit); @@ -1797,11 +1800,11 @@ static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, int *cost_list, const aom_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv) { + const MV *ref_mv, const search_site_config *cfg) { MV temp_mv; int thissme, n, num00 = 0; - int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param, sadpb, &n, fn_ptr, ref_mv); + int bestsme = cpi->diamond_search_sad(x, cfg, mvp_full, &temp_mv, step_param, + sadpb, &n, fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); x->best_mv.as_mv = temp_mv; @@ -1816,9 +1819,9 @@ static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x, if (num00) { num00--; } else { - thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param + n, sadpb, &num00, fn_ptr, - ref_mv); + thissme = + cpi->diamond_search_sad(x, cfg, mvp_full, &temp_mv, step_param + n, + sadpb, &num00, fn_ptr, 
ref_mv); if (thissme < INT_MAX) thissme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); @@ -2094,11 +2097,222 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) { return is_allowed; } +static int vector_match(int16_t *ref, int16_t *src, int bwl) { + int best_sad = INT_MAX; + int this_sad; + int d; + int center, offset = 0; + int bw = 4 << bwl; // redundant variable, to be changed in the experiments. + for (d = 0; d <= bw; d += 16) { + this_sad = aom_vector_var(&ref[d], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + offset = d; + } + } + center = offset; + + for (d = -8; d <= 8; d += 16) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -4; d <= 4; d += 8) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -2; d <= 2; d += 4) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + offset = center; + + for (d = -1; d <= 1; d += 2) { + int this_pos = offset + d; + // check limit + if (this_pos < 0 || this_pos > bw) continue; + this_sad = aom_vector_var(&ref[this_pos], src, bwl); + if (this_sad < best_sad) { + best_sad = this_sad; + center = this_pos; + } + } + + return (center - (bw >> 1)); +} + +static const MV search_pos[4] = { + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, +}; + +unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col, const MV *ref_mv) { 
+ MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mi = xd->mi[0]; + struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } }; + DECLARE_ALIGNED(16, int16_t, hbuf[256]); + DECLARE_ALIGNED(16, int16_t, vbuf[256]); + DECLARE_ALIGNED(16, int16_t, src_hbuf[128]); + DECLARE_ALIGNED(16, int16_t, src_vbuf[128]); + int idx; + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + const int search_width = bw << 1; + const int search_height = bh << 1; + const int src_stride = x->plane[0].src.stride; + const int ref_stride = xd->plane[0].pre[0].stride; + uint8_t const *ref_buf, *src_buf; + MV *tmp_mv = &xd->mi[0]->mv[0].as_mv; + unsigned int best_sad, tmp_sad, this_sad[4]; + MV this_mv; + const int norm_factor = 3 + (bw >> 5); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + MvLimits subpel_mv_limits; + + if (scaled_ref_frame) { + int i; + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. 
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; + av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL, + MAX_MB_PLANE); + } + + if (xd->bd != 8) { + unsigned int sad; + tmp_mv->row = 0; + tmp_mv->col = 0; + sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, + xd->plane[0].pre[0].buf, ref_stride); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + return sad; + } + + // Set up prediction 1-D reference set + ref_buf = xd->plane[0].pre[0].buf - (bw >> 1); + for (idx = 0; idx < search_width; idx += 16) { + aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh); + ref_buf += 16; + } + + ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride; + for (idx = 0; idx < search_height; ++idx) { + vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor; + ref_buf += ref_stride; + } + + // Set up src 1-D reference set + for (idx = 0; idx < bw; idx += 16) { + src_buf = x->plane[0].src.buf + idx; + aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh); + } + + src_buf = x->plane[0].src.buf; + for (idx = 0; idx < bh; ++idx) { + src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor; + src_buf += src_stride; + } + + // Find the best match per 1-D search + tmp_mv->col = vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]); + tmp_mv->row = vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]); + + this_mv = *tmp_mv; + src_buf = x->plane[0].src.buf; + ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col; + best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + + { + const uint8_t *const pos[4] = { + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, + }; + + cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); + } + + for (idx = 0; idx < 4; ++idx) { + if (this_sad[idx] < best_sad) { + best_sad = this_sad[idx]; + tmp_mv->row = search_pos[idx].row + this_mv.row; + 
tmp_mv->col = search_pos[idx].col + this_mv.col; + } + } + + if (this_sad[0] < this_sad[3]) + this_mv.row -= 1; + else + this_mv.row += 1; + + if (this_sad[1] < this_sad[2]) + this_mv.col -= 1; + else + this_mv.col += 1; + + ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col; + + tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride); + if (best_sad > tmp_sad) { + *tmp_mv = this_mv; + best_sad = tmp_sad; + } + + tmp_mv->row *= 8; + tmp_mv->col *= 8; + + set_subpel_mv_search_range( + &x->mv_limits, &subpel_mv_limits.col_min, &subpel_mv_limits.col_max, + &subpel_mv_limits.row_min, &subpel_mv_limits.row_max, ref_mv); + clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max, + subpel_mv_limits.row_min, subpel_mv_limits.row_max); + + if (scaled_ref_frame) { + int i; + for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; + } + + return best_sad; +} + int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int method, int run_mesh_search, int error_per_bit, int *cost_list, const MV *ref_mv, int var_max, int rd, - int x_pos, int y_pos, int intra) { + int x_pos, int y_pos, int intra, + const search_site_config *cfg) { const SPEED_FEATURES *const sf = &cpi->sf; const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; @@ -2138,7 +2352,7 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, case NSTEP: var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, MAX_MVSEARCH_STEPS - 1 - step_param, 1, - cost_list, fn_ptr, ref_mv); + cost_list, fn_ptr, ref_mv, cfg); // Should we allow a follow on exhaustive search? if (is_exhaustive_allowed(cpi, x)) { @@ -2209,13 +2423,12 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // for the hashMap hash_table *ref_frame_hash = - intra - ? 
&cpi->common.cur_frame->hash_table - : av1_get_ref_frame_hash_map(cpi, x->e_mbd.mi[0]->ref_frame[0]); + intra ? &cpi->common.cur_frame->hash_table + : av1_get_ref_frame_hash_map(&cpi->common, + x->e_mbd.mi[0]->ref_frame[0]); - av1_get_block_hash_value( - what, what_stride, block_width, &hash_value1, &hash_value2, - x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, x); + av1_get_block_hash_value(what, what_stride, block_width, &hash_value1, + &hash_value2, is_cur_buf_hbd(&x->e_mbd), x); const int count = av1_hash_table_count(ref_frame_hash, hash_value1); // for intra, at lest one matching can be found, itself. @@ -2334,7 +2547,7 @@ static int upsampled_obmc_pref_error( unsigned int besterr; DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred); aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h, subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd, @@ -2676,14 +2889,15 @@ static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, const aom_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, MV *dst_mv, - int is_second) { + const MV *ref_mv, MV *dst_mv, int is_second, + const search_site_config *cfg) { + (void)cpi; // to silence compiler warning const int32_t *wsrc = x->wsrc_buf; const int32_t *mask = x->mask_buf; MV temp_mv; int thissme, n, num00 = 0; int bestsme = - obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full, &temp_mv, + obmc_diamond_search_sad(x, cfg, wsrc, mask, mvp_full, &temp_mv, step_param, sadpb, &n, fn_ptr, ref_mv, is_second); if (bestsme < INT_MAX) bestsme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr, 1, @@ -2700,9 +2914,9 @@ static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x, if (num00) { num00--; } else { - thissme = obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full, - &temp_mv, 
step_param + n, sadpb, &num00, - fn_ptr, ref_mv, is_second); + thissme = obmc_diamond_search_sad(x, cfg, wsrc, mask, mvp_full, &temp_mv, + step_param + n, sadpb, &num00, fn_ptr, + ref_mv, is_second); if (thissme < INT_MAX) thissme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr, 1, is_second); @@ -2738,11 +2952,12 @@ int av1_obmc_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, const aom_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, MV *dst_mv, int is_second) { + const MV *ref_mv, MV *dst_mv, int is_second, + const search_site_config *cfg) { if (cpi->sf.obmc_full_pixel_search_level == 0) { return obmc_full_pixel_diamond(cpi, x, mvp_full, step_param, sadpb, further_steps, do_refine, fn_ptr, ref_mv, - dst_mv, is_second); + dst_mv, is_second, cfg); } else { const int32_t *wsrc = x->wsrc_buf; const int32_t *mask = x->mask_buf; @@ -2851,3 +3066,119 @@ int av1_return_min_sub_pixel_mv( lower_mv_precision(bestmv, allow_hp, 0); return besterr; } + +void av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, + int mi_col, BLOCK_SIZE bsize, int ref, + MV ref_mv_full, int num_planes, + int use_subpixel) { + assert(num_planes == 1 && + "Currently simple_motion_search only supports luma plane"); + assert(!frame_is_intra_only(&cpi->common) && + "Simple motion search only enabled for non-key frames"); + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + + set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize); + + MB_MODE_INFO *mbmi = xd->mi[0]; + mbmi->sb_type = bsize; + mbmi->ref_frame[0] = ref; + mbmi->ref_frame[1] = NONE_FRAME; + mbmi->motion_mode = SIMPLE_TRANSLATION; + + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref); + struct buf_2d backup_yv12; + // ref_mv is used to code the motion vector. ref_mv_full is the initial point. 
+ // ref_mv is in units of 1/8 pel whereas ref_mv_full is in units of pel. + MV ref_mv = { 0, 0 }; + const int step_param = cpi->mv_step_param; + const MvLimits tmp_mv_limits = x->mv_limits; + const SEARCH_METHODS search_methods = NSTEP; + const int do_mesh_search = 0; + const int sadpb = x->sadperbit16; + int cost_list[5]; + const int ref_idx = 0; + int var; + + av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col, + get_ref_scale_factors(cm, ref), num_planes); + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + if (scaled_ref_frame) { + backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx]; + av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL, + num_planes); + } + + // This overwrites the mv_limits so we will need to restore it later. + av1_set_mv_search_range(&x->mv_limits, &ref_mv); + var = av1_full_pixel_search( + cpi, x, bsize, &ref_mv_full, step_param, search_methods, do_mesh_search, + sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1, + mi_col * MI_SIZE, mi_row * MI_SIZE, 0, &cpi->ss_cfg[SS_CFG_SRC]); + // Restore + x->mv_limits = tmp_mv_limits; + + const int use_subpel_search = + var < INT_MAX && !cpi->common.cur_frame_force_integer_mv && use_subpixel; + if (scaled_ref_frame) { + xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12; + } + if (use_subpel_search) { + int not_used = 0; + if (cpi->sf.use_accurate_subpel_search) { + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + cpi->find_fractional_mv_step( + x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), + x->nmv_vec_cost, x->mv_cost_stack, ¬_used, &x->pred_sse[ref], NULL, + NULL, 0, 0, pw, ph, cpi->sf.use_accurate_subpel_search, 1); + } else { + cpi->find_fractional_mv_step( + x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], 
cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), + x->nmv_vec_cost, x->mv_cost_stack, ¬_used, &x->pred_sse[ref], NULL, + NULL, 0, 0, 0, 0, 0, 1); + } + } else { + // Manually convert from units of pixel to 1/8-pixels if we are not doing + // subpel search + x->best_mv.as_mv.row *= 8; + x->best_mv.as_mv.col *= 8; + } + + mbmi->mv[0].as_mv = x->best_mv.as_mv; + + // Get a copy of the prediction output + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + + aom_clear_system_state(); + + if (scaled_ref_frame) { + xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12; + } +} + +void av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, BLOCK_SIZE bsize, + const MV ref_mv_full, int use_subpixel, + unsigned int *sse, unsigned int *var) { + MACROBLOCKD *xd = &x->e_mbd; + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME; + + av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, ref_mv_full, 1, + use_subpixel); + + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const uint8_t *dst = xd->plane[0].dst.buf; + const int dst_stride = xd->plane[0].dst.stride; + + *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse); +} diff --git a/libaom/av1/encoder/mcomp.h b/libaom/av1/encoder/mcomp.h index 3f8b3b1..71547da 100644 --- a/libaom/av1/encoder/mcomp.h +++ b/libaom/av1/encoder/mcomp.h @@ -13,6 +13,7 @@ #define AOM_AV1_ENCODER_MCOMP_H_ #include "av1/encoder/block.h" + #include "aom_dsp/variance.h" #ifdef __cplusplus @@ -83,6 +84,11 @@ int av1_refining_search_sad(struct macroblock *x, MV *ref_mv, int sad_per_bit, int distance, const aom_variance_fn_ptr_t *fn_ptr, const MV *center_mv); +unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize, + int mi_row, int mi_col, + const MV *ref_mv); + // Runs sequence of 
diamond searches in smaller steps for RD. int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, @@ -132,13 +138,15 @@ int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int method, int run_mesh_search, int error_per_bit, int *cost_list, const MV *ref_mv, int var_max, int rd, - int x_pos, int y_pos, int intra); + int x_pos, int y_pos, int intra, + const search_site_config *cfg); int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, const aom_variance_fn_ptr_t *fn_ptr, - const MV *ref_mv, MV *dst_mv, int is_second); + const MV *ref_mv, MV *dst_mv, int is_second, + const search_site_config *cfg); int av1_find_best_obmc_sub_pixel_tree_up( MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col, MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, @@ -154,6 +162,19 @@ unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi, int mi_row, int mi_col, int *pts0, int *pts_inref0, int total_samples); +// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame +// ref. Note that this sets the offset of mbmi, so we will need to reset it +// after calling this function. 
+void av1_simple_motion_search(struct AV1_COMP *const cpi, MACROBLOCK *x, + int mi_row, int mi_col, BLOCK_SIZE bsize, int ref, + MV ref_mv_full, int num_planes, int use_subpixel); + +// Performs a simple motion search to calculate the sse and var of the residue +void av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row, + int mi_col, BLOCK_SIZE bsize, + const MV ref_mv_full, int use_subpixel, + unsigned int *sse, unsigned int *var); + static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) { for (int z = 0; z < 3; z++) { fractional_best_mv[z].as_int = INVALID_MV; diff --git a/libaom/av1/encoder/mips/msa/temporal_filter_msa.c b/libaom/av1/encoder/mips/msa/temporal_filter_msa.c index 531ae09..effa75b 100644 --- a/libaom/av1/encoder/mips/msa/temporal_filter_msa.c +++ b/libaom/av1/encoder/mips/msa/temporal_filter_msa.c @@ -267,6 +267,7 @@ static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride, } } +// TODO(yunqing) The following optimization is not used since c code changes. void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, uint32_t blk_w, uint32_t blk_h, int32_t strength, diff --git a/libaom/av1/encoder/ml.c b/libaom/av1/encoder/ml.c index ad664ac..579900a 100644 --- a/libaom/av1/encoder/ml.c +++ b/libaom/av1/encoder/ml.c @@ -65,7 +65,9 @@ void av1_nn_softmax(const float *input, float *output, int n) { for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]); float sum_out = 0.0f; for (int i = 0; i < n; i++) { - output[i] = (float)exp(input[i] - max_inp); + // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors. 
+ const float normalized_input = AOMMAX(input[i] - max_inp, -10.0f); + output[i] = (float)exp(normalized_input); sum_out += output[i]; } for (int i = 0; i < n; i++) output[i] /= sum_out; diff --git a/libaom/av1/encoder/partition_model_weights.h b/libaom/av1/encoder/partition_model_weights.h index 271764a..b754c88 100644 --- a/libaom/av1/encoder/partition_model_weights.h +++ b/libaom/av1/encoder/partition_model_weights.h @@ -2441,145 +2441,20 @@ static const NN_CONFIG av1_rect_partition_nnconfig_128 = { #undef NUM_NODES #undef LABEL_SIZE -#if CONFIG_ONE_PASS_SVM -#define FEATURE_SIZE 24 -static const float av1_op_svm_early_term_weights_128[FEATURE_SIZE + 1] = { - -4.5893036051f, 6.9065208136f, -9.1579514692f, 0.1353151366f, - -1.0271889653f, -0.0020988254f, -0.0094355949f, 0.0040209656f, - 0.0073014747f, 0.7939705382f, 0.0254545714f, 0.0557559708f, - -0.0339662064f, -0.0496818300f, 0.3053600283f, 0.3699486845f, - 0.0848271391f, 0.4091075988f, 0.1196729398f, -0.0038137193f, - -0.0773495909f, -0.0651630642f, -0.0123704995f, -0.0036697401f, - -4.1930227095f, -}; - -static const float av1_op_svm_early_term_weights_64[FEATURE_SIZE + 1] = { - -2.7600454480f, 5.6822046712f, -6.7576830133f, 0.1326457117f, - -1.0541818372f, 0.0107782654f, 0.0050469147f, -0.0021362631f, - -0.0135151040f, -0.1020115005f, -0.0283409957f, -0.0176311233f, - 0.0250648204f, 0.0196228570f, 0.5441528594f, 0.2767320141f, - 0.1261231351f, 0.2998476408f, 0.1336215695f, -0.1107823946f, - -0.0697279598f, -0.0577520545f, -0.0558441075f, -0.0699750617f, - -2.6995991503f, -}; - -static const float av1_op_svm_early_term_weights_32[FEATURE_SIZE + 1] = { - -0.8950734172f, 1.3559565008f, -2.6733642653f, 0.2661361319f, - -0.0314731140f, 0.0044943456f, 0.0006438044f, -0.0029066686f, - -0.0021903213f, 0.5845049496f, -0.0003629350f, 0.0006982840f, - 0.0014157386f, -0.0017427528f, 0.7078456733f, 0.1600998068f, - 0.0933852747f, 0.2822125876f, 0.1923826165f, -0.0905903459f, - -0.0564717590f, -0.0591007486f, 
-0.0692268554f, -0.0677411981f, - -0.7101853206f, -}; - -static const float av1_op_svm_early_term_weights_16[FEATURE_SIZE + 1] = { - -0.1719124013f, -0.3192305362f, -1.1714597182f, 0.4437770294f, - -0.0042344643f, 0.0000027764f, 0.0018827450f, -0.0015555613f, - -0.0003250050f, 0.9413693294f, 0.0076188418f, -0.0067870352f, - 0.0006329246f, -0.0013059613f, 0.8596697254f, 0.0635558018f, - 0.0447224598f, 0.0915706321f, 0.0741662273f, -0.0269096547f, - -0.0244610614f, -0.0281113318f, -0.0326108845f, -0.0350908892f, - -0.0307521675f, -}; - -static const float av1_op_svm_early_term_mean_128[FEATURE_SIZE] = { - 940540.3259649610f, 3988285.5905584921f, 575475302.3545289040f, - 0.5775348803f, 866.9828469502f, 0.2503762393f, - 0.2501466215f, 0.2513213770f, 0.2481557622f, - 521994448.3219169378f, 0.2666920631f, 0.2535864361f, - 0.2481589186f, 0.2315625823f, 100519.1049708007f, - 12.1299754840f, 0.8279971004f, 12.6664603305f, - 0.7313258998f, 935.8233056680f, 0.7436563032f, - 0.7710055018f, 0.7376516970f, 0.6859818720f, -}; - -static const float av1_op_svm_early_term_mean_64[FEATURE_SIZE] = { - 420419.7529613562f, 839754.4414347620f, 129360420.5256031156f, - 0.6525652037f, 548.8972009954f, 0.2506918565f, - 0.2488349076f, 0.2501724146f, 0.2503008213f, - 113132974.7944754064f, 0.2479344278f, 0.2471446791f, - 0.2524478512f, 0.2524730419f, 91147.9854189453f, - 10.9642508460f, 0.8936554428f, 11.3877865621f, - 0.8307555282f, 752.7787491956f, 0.7243363939f, - 0.7198362119f, 0.7329432336f, 0.7245090283f, -}; - -static const float av1_op_svm_early_term_mean_32[FEATURE_SIZE] = { - 105111.0236438536f, 184296.0939716828f, 29117017.6751756854f, - 0.6402298612f, 140.2223339218f, 0.2495860872f, - 0.2496407600f, 0.2506238629f, 0.2501492900f, - 24480304.9390618578f, 0.2494442027f, 0.2496080963f, - 0.2504881563f, 0.2504595447f, 60297.6762059058f, - 9.4279752138f, 0.9287901132f, 9.6516813792f, - 0.9009173677f, 591.5406335030f, 0.6944486917f, - 0.6983941982f, 0.6927236901f, 0.6921613649f, -}; - 
-static const float av1_op_svm_early_term_mean_16[FEATURE_SIZE] = { - 34080.7994802934f, 44108.1176228864f, 7494288.4946180154f, 0.6240636218f, - 36.4539515827f, 0.2490867417f, 0.2499231014f, 0.2505361492f, - 0.2504540077f, 5913397.2957480755f, 0.2487482536f, 0.2495500728f, - 0.2503693302f, 0.2513323434f, 36574.9686737814f, 7.4345592768f, - 0.9592429205f, 7.6001764585f, 0.9459867777f, 490.4635033056f, - 0.6626215237f, 0.6580791886f, 0.6655481064f, 0.6589010119f, -}; - -static const float av1_op_svm_early_term_std_128[FEATURE_SIZE] = { - 2054266.2732957317f, 7550554.6241466375f, 1078688147.1656334400f, - 0.4939517611f, 1414.3139592985f, 0.1504634077f, - 0.1515907199f, 0.1590329744f, 0.1515653324f, - 1006422867.8989596367f, 0.1168668155f, 0.1195725959f, - 0.1195825693f, 0.1123065533f, 195261.0940245980f, - 4.5876675121f, 0.3773829648f, 4.8017339769f, - 0.4432700397f, 973.7532938848f, 0.4790027843f, - 0.5056275222f, 0.5262278749f, 0.4685586148f, -}; - -static const float av1_op_svm_early_term_std_64[FEATURE_SIZE] = { - 1093636.0522712648f, 1749863.5221569177f, 255168612.8025657237f, - 0.4761552884f, 1084.7927994662f, 0.1099344646f, - 0.1100619440f, 0.1090853225f, 0.1115303745f, - 232084513.1365262568f, 0.0759732385f, 0.0762942913f, - 0.0785624106f, 0.0779284747f, 185687.9441778057f, - 4.4371901245f, 0.3082781088f, 4.6670562831f, - 0.3749677061f, 854.3212307408f, 0.4920531348f, - 0.5073919158f, 0.5054698298f, 0.4904895620f, -}; - -static const float av1_op_svm_early_term_std_32[FEATURE_SIZE] = { - 238229.7484988807f, 400136.8703966461f, 60267828.4581554681f, - 0.4799328974f, 268.9377064297f, 0.1122938575f, - 0.1126479260f, 0.1137018559f, 0.1126389337f, - 52174139.1477040648f, 0.0715628767f, 0.0720997035f, - 0.0728961434f, 0.0732065300f, 147785.0049793872f, - 4.2092341484f, 0.2571751131f, 4.3893075417f, - 0.2987729310f, 769.0253148602f, 0.5027558039f, - 0.4982811444f, 0.5092312751f, 0.4991214994f, -}; - -static const float av1_op_svm_early_term_std_16[FEATURE_SIZE] = { 
- 64177.9527087587f, 103729.9987511119f, 16632490.8146969266f, - 0.4843637247f, 65.8114470725f, 0.0884226846f, - 0.0912638659f, 0.0914771167f, 0.0916078800f, - 13364581.3877149168f, 0.0677468925f, 0.0689631274f, - 0.0689915367f, 0.0702648469f, 111397.2620676765f, - 3.7858187888f, 0.1977269328f, 3.9420183951f, - 0.2260437881f, 717.5336868275f, 0.5017939514f, - 0.5066633533f, 0.5086806985f, 0.5085585987f, -}; - -#undef FEATURE_SIZE -#endif // CONFIG_ONE_PASS_SVM +// Below are the models used for simple_motion_search_based_split +static const float av1_simple_motion_search_based_split_thresh_128 = 2.0f; +static const float av1_simple_motion_search_based_split_thresh_64 = 2.0f; +static const float av1_simple_motion_search_based_split_thresh_32 = 2.0f; +static const float av1_simple_motion_search_based_split_thresh_16 = 2.0f; +static const float av1_simple_motion_search_based_split_thresh_8 = 2.0f; -// Below are the models used for full_pixel_motion_search_based_split // BLOCK_128X128 #define NUM_HIDDEN_LAYERS_128 1 #define NUM_FEATURES_128 6 #define NUM_LAYER_0_UNITS_128 16 #define NUM_LOGITS_128 1 -static const float full_pixel_motion_search_based_split_layer_0_kernel_128[] = { +static const float av1_simple_motion_search_based_split_layer_0_kernel_128[] = { -0.807346f, 0.242298f, 12.9862f, -1.19161f, 5.21734f, -1.1363f, -2.39127f, 0.930915f, -2.44285f, -2.42966f, 5.73476f, 0.0506879f, -0.234878f, -0.317875f, 0.361322f, 0.431648f, -0.39105f, -0.110225f, @@ -2598,23 +2473,23 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_128[] = { 0.702545f, -0.612227f, -7.68881f, 9.52225f, -1.18581f, -2.56762f }; -static const float full_pixel_motion_search_based_split_logits_kernel_128[] = { +static const float av1_simple_motion_search_based_split_logits_kernel_128[] = { 0.364895f, 0.577553f, 0.115758f, -0.999496f, 0.124885f, 3.23193f, -0.00386642f, 0.970794f, 0.136637f, -4.28052f, -1.49234f, 0.370436f, 0.576981f, -0.469656f, -0.124071f, 1.07669f }; -static 
const float full_pixel_motion_search_based_split_layer_0_bias_128[] = { +static const float av1_simple_motion_search_based_split_layer_0_bias_128[] = { 1.32916f, 0.817212f, 0.0f, -0.921066f, 0.0f, 3.57649f, -0.0204517f, 2.97286f, 0.0f, 5.49957f, -8.14518f, 0.0f, 1.30826f, -0.349536f, -0.638933f, 5.4496f }; -static const float full_pixel_motion_search_based_split_logits_bias_128[] = { +static const float av1_simple_motion_search_based_split_logits_bias_128[] = { 0.683442f }; -static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_128 = { +static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_128 = { NUM_FEATURES_128, NUM_LOGITS_128, NUM_HIDDEN_LAYERS_128, @@ -2622,17 +2497,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_128 = { NUM_LAYER_0_UNITS_128, }, { - full_pixel_motion_search_based_split_layer_0_kernel_128, - full_pixel_motion_search_based_split_logits_kernel_128, + av1_simple_motion_search_based_split_layer_0_kernel_128, + av1_simple_motion_search_based_split_logits_kernel_128, }, { - full_pixel_motion_search_based_split_layer_0_bias_128, - full_pixel_motion_search_based_split_logits_bias_128, + av1_simple_motion_search_based_split_layer_0_bias_128, + av1_simple_motion_search_based_split_logits_bias_128, }, }; -static const float full_pixel_motion_search_based_split_thresh_128 = 2.0f; - #undef NUM_HIDDEN_LAYERS_128 #undef NUM_FEATURES_128 #undef NUM_LAYER_0_UNITS_128 @@ -2644,7 +2517,7 @@ static const float full_pixel_motion_search_based_split_thresh_128 = 2.0f; #define NUM_LAYER_0_UNITS_64 16 #define NUM_LOGITS_64 1 -static const float full_pixel_motion_search_based_split_layer_0_kernel_64[] = { +static const float av1_simple_motion_search_based_split_layer_0_kernel_64[] = { 0.0345945f, -0.394064f, 0.0919978f, 0.270358f, -0.384502f, -0.504608f, -0.25759f, 0.155981f, 2.62567f, -10.7204f, -0.709802f, 8.15948f, 0.589866f, -0.445645f, -1.68232f, 10.0061f, -3.17671f, 4.87259f, @@ -2663,23 +2536,23 @@ 
static const float full_pixel_motion_search_based_split_layer_0_kernel_64[] = { -0.217072f, -0.0984913f, -0.265515f, 0.360021f, 0.0779512f, 0.361516f }; -static const float full_pixel_motion_search_based_split_logits_kernel_64[] = { +static const float av1_simple_motion_search_based_split_logits_kernel_64[] = { 0.470821f, 0.474747f, -0.571292f, 0.403221f, 0.628966f, -0.617029f, 0.501105f, 0.499962f, -1.5451f, -0.473518f, -0.730568f, -5.55817f, 0.776761f, 0.42569f, 0.311925f, 0.469968f }; -static const float full_pixel_motion_search_based_split_layer_0_bias_64[] = { +static const float av1_simple_motion_search_based_split_layer_0_bias_64[] = { -0.134085f, 0.0758715f, 1.10419f, 0.0f, -5.75737f, 1.65494f, 0.0f, 3.44047f, 0.394852f, 3.43858f, 3.65871f, -4.84987f, 1.21207f, -1.7705f, -5.46469f, -0.0889634f }; -static const float full_pixel_motion_search_based_split_logits_bias_64[] = { +static const float av1_simple_motion_search_based_split_logits_bias_64[] = { -0.479491f }; -static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_64 = { +static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_64 = { NUM_FEATURES_64, NUM_LOGITS_64, NUM_HIDDEN_LAYERS_64, @@ -2687,17 +2560,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_64 = { NUM_LAYER_0_UNITS_64, }, { - full_pixel_motion_search_based_split_layer_0_kernel_64, - full_pixel_motion_search_based_split_logits_kernel_64, + av1_simple_motion_search_based_split_layer_0_kernel_64, + av1_simple_motion_search_based_split_logits_kernel_64, }, { - full_pixel_motion_search_based_split_layer_0_bias_64, - full_pixel_motion_search_based_split_logits_bias_64, + av1_simple_motion_search_based_split_layer_0_bias_64, + av1_simple_motion_search_based_split_logits_bias_64, }, }; -static const float full_pixel_motion_search_based_split_thresh_64 = 2.0f; - #undef NUM_HIDDEN_LAYERS_64 #undef NUM_FEATURES_64 #undef NUM_LAYER_0_UNITS_64 @@ -2709,7 +2580,7 @@ static const float 
full_pixel_motion_search_based_split_thresh_64 = 2.0f; #define NUM_LAYER_0_UNITS_32 16 #define NUM_LOGITS_32 1 -static const float full_pixel_motion_search_based_split_layer_0_kernel_32[] = { +static const float av1_simple_motion_search_based_split_layer_0_kernel_32[] = { -1.61796f, 0.0585128f, 1.57904f, 1.52703f, 0.367779f, 0.220434f, 1.66652f, -1.77782f, 6.41118f, 4.16976f, 4.97299f, 4.84111f, -0.0956536f, -0.163284f, -0.143662f, 0.129329f, 0.449659f, -0.528844f, @@ -2728,23 +2599,23 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_32[] = { -1.91327f, -0.0356497f, 1.47611f, 1.27499f, -1.76108f, -0.578954f }; -static const float full_pixel_motion_search_based_split_logits_kernel_32[] = { +static const float av1_simple_motion_search_based_split_logits_kernel_32[] = { -0.220382f, -0.693902f, 0.424827f, 0.379952f, -0.413791f, -0.326785f, -0.455086f, 0.242402f, 0.307986f, 0.175746f, 0.498901f, -0.628053f, 0.285447f, 0.230052f, 0.415151f, -0.842946f }; -static const float full_pixel_motion_search_based_split_layer_0_bias_32[] = { +static const float av1_simple_motion_search_based_split_layer_0_bias_32[] = { -1.80751f, 6.40356f, -0.0512058f, -4.59163f, -0.369933f, -0.195755f, -0.16648f, -0.599755f, -5.35975f, -1.21349f, 2.48414f, 1.07096f, -3.66684f, -6.17761f, 4.2159f, -1.05286f }; -static const float full_pixel_motion_search_based_split_logits_bias_32[] = { +static const float av1_simple_motion_search_based_split_logits_bias_32[] = { -2.58676f }; -static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_32 = { +static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_32 = { NUM_FEATURES_32, NUM_LOGITS_32, NUM_HIDDEN_LAYERS_32, @@ -2752,17 +2623,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_32 = { NUM_LAYER_0_UNITS_32, }, { - full_pixel_motion_search_based_split_layer_0_kernel_32, - full_pixel_motion_search_based_split_logits_kernel_32, + 
av1_simple_motion_search_based_split_layer_0_kernel_32, + av1_simple_motion_search_based_split_logits_kernel_32, }, { - full_pixel_motion_search_based_split_layer_0_bias_32, - full_pixel_motion_search_based_split_logits_bias_32, + av1_simple_motion_search_based_split_layer_0_bias_32, + av1_simple_motion_search_based_split_logits_bias_32, }, }; -static const float full_pixel_motion_search_based_split_thresh_32 = 2.0f; - #undef NUM_HIDDEN_LAYERS_32 #undef NUM_FEATURES_32 #undef NUM_LAYER_0_UNITS_32 @@ -2774,7 +2643,7 @@ static const float full_pixel_motion_search_based_split_thresh_32 = 2.0f; #define NUM_LAYER_0_UNITS_16 16 #define NUM_LOGITS_16 1 -static const float full_pixel_motion_search_based_split_layer_0_kernel_16[] = { +static const float av1_simple_motion_search_based_split_layer_0_kernel_16[] = { -0.611497f, -0.0422086f, -0.555957f, -0.632451f, -0.144179f, -0.152722f, -0.330265f, -0.419866f, 0.287343f, 0.385295f, -0.424486f, 0.424281f, 2.27442f, -2.47933f, 5.24731f, 4.33827f, 4.73215f, 3.41909f, @@ -2793,23 +2662,23 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_16[] = { 0.0333619f, -0.377782f, 0.160767f, -0.128169f, -0.484818f, -0.311973f }; -static const float full_pixel_motion_search_based_split_logits_kernel_16[] = { +static const float av1_simple_motion_search_based_split_logits_kernel_16[] = { -0.132207f, 0.15176f, -0.680086f, 0.605921f, -0.43294f, 0.485811f, -0.306286f, 0.551368f, 0.413904f, 0.548748f, -0.437391f, 0.560778f, -0.00685266f, -0.558657f, 0.122127f, 0.260165f }; -static const float full_pixel_motion_search_based_split_layer_0_bias_16[] = { +static const float av1_simple_motion_search_based_split_layer_0_bias_16[] = { -0.200928f, -0.074132f, 8.69963f, -9.00807f, 9.08983f, -6.83586f, -3.89329f, 10.4881f, -0.0670618f, 0.0f, 9.21614f, 8.41773f, -0.145851f, 0.0f, -1.43038f, -0.0460311f }; -static const float full_pixel_motion_search_based_split_logits_bias_16[] = { +static const float 
av1_simple_motion_search_based_split_logits_bias_16[] = { -4.19885f }; -static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_16 = { +static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_16 = { NUM_FEATURES_16, NUM_LOGITS_16, NUM_HIDDEN_LAYERS_16, @@ -2817,17 +2686,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_16 = { NUM_LAYER_0_UNITS_16, }, { - full_pixel_motion_search_based_split_layer_0_kernel_16, - full_pixel_motion_search_based_split_logits_kernel_16, + av1_simple_motion_search_based_split_layer_0_kernel_16, + av1_simple_motion_search_based_split_logits_kernel_16, }, { - full_pixel_motion_search_based_split_layer_0_bias_16, - full_pixel_motion_search_based_split_logits_bias_16, + av1_simple_motion_search_based_split_layer_0_bias_16, + av1_simple_motion_search_based_split_logits_bias_16, }, }; -static const float full_pixel_motion_search_based_split_thresh_16 = 2.0f; - #undef NUM_HIDDEN_LAYERS_16 #undef NUM_FEATURES_16 #undef NUM_LAYER_0_UNITS_16 @@ -2840,7 +2707,7 @@ static const float full_pixel_motion_search_based_split_thresh_16 = 2.0f; #define NUM_LAYER_0_UNITS_8 16 #define NUM_LOGITS_8 1 -static const float full_pixel_motion_search_based_split_layer_0_kernel_8[] = { +static const float av1_simple_motion_search_based_split_layer_0_kernel_8[] = { 0.0370236f, -0.580211f, 2.0134f, 1.69637f, 2.43181f, -0.521648f, -0.00375187f, 0.122712f, -4.74411f, 7.36187f, 5.42574f, -5.53557f, 0.0993344f, -0.358843f, 0.0765453f, -0.615987f, -0.754633f, -0.175846f, @@ -2859,23 +2726,1240 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_8[] = { 0.616966f, -0.451472f, -0.319365f, 0.00807278f, -0.303261f, -0.351679f }; -static const float full_pixel_motion_search_based_split_logits_kernel_8[] = { +static const float av1_simple_motion_search_based_split_logits_kernel_8[] = { -0.625847f, 0.381323f, 0.342475f, 0.526161f, -0.665965f, -0.515317f, -0.406218f, 0.568007f, 0.479397f, -0.426116f, 
0.615638f, 0.338572f, 0.185583f, 0.308031f, 0.260748f, 0.531619f }; -static const float full_pixel_motion_search_based_split_layer_0_bias_8[] = { +static const float av1_simple_motion_search_based_split_layer_0_bias_8[] = { 4.73775f, -1.12658f, -0.258038f, -6.06696f, 1.79131f, 2.49609f, 4.28388f, 0.0f, -4.63598f, 3.06034f, 5.31994f, -0.152142f, 0.514738f, -1.30098f, 3.00296f, -3.83481f }; -static const float full_pixel_motion_search_based_split_logits_bias_8[] = { +static const float av1_simple_motion_search_based_split_logits_bias_8[] = { -3.44508f }; -static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_8 = { +static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_8 = { + NUM_FEATURES_8, + NUM_LOGITS_8, + NUM_HIDDEN_LAYERS_8, + { + NUM_LAYER_0_UNITS_8, + }, + { + av1_simple_motion_search_based_split_layer_0_kernel_8, + av1_simple_motion_search_based_split_logits_kernel_8, + }, + { + av1_simple_motion_search_based_split_layer_0_bias_8, + av1_simple_motion_search_based_split_logits_bias_8, + }, +}; + +#endif + +// Model based on simple_motion_search + +// Thresholds for doing a single type of partition +// TODO(chiyotsai@google.com): Set the thresholds for PARTITION_SPLIT. 
+static const float av1_simple_motion_search_prune_part_only_thresh_128[10] = { + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f +}; + +static const float av1_simple_motion_search_prune_part_only_thresh_64[10] = { + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f +}; + +static const float av1_simple_motion_search_prune_part_only_thresh_32[10] = { + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f +}; + +static const float av1_simple_motion_search_prune_part_only_thresh_16[10] = { + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f +}; + +static const float av1_simple_motion_search_prune_part_only_thresh_8[10] = { + 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f +}; + +// Thresholds for pruning a partition type +static const float av1_simple_motion_search_prune_part_prune_thresh_128[10] = { + 0.0f, 0.0288721601835f, 0.0288721601835f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f +}; + +static const float av1_simple_motion_search_prune_part_prune_thresh_64[10] = { + 0.0f, 0.0281573780991f, 0.0281573780991f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f +}; + +static const float av1_simple_motion_search_prune_part_prune_thresh_32[10] = { + 0.0f, 0.0225501403434f, 0.0225501403434f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + 0.0f +}; + +static const float av1_simple_motion_search_prune_part_prune_thresh_16[10] = { + 0.0f, + 0.000961189195907f, + 0.000961189195907f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f, + 0.0f +}; + +static const float av1_simple_motion_search_prune_part_prune_thresh_8[10] = { + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f +}; + +// Mean and std +static const float av1_simple_motion_search_prune_part_mean_128[25] = { + 13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f, + 10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f, + 12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f, + 12.152315f, 11.517566f, 11.465651f, 5.383040f, 0.757934f, + 4.012611f, 4.052191f, 
0.853365f, 3.954503f, 3.944135f, +}; + +static const float av1_simple_motion_search_prune_part_std_128[25] = { + 2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f, + 3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f, + 2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f, + 1.208679f, 0.353742f, 1.228122f, 1.211777f, +}; + +static const float av1_simple_motion_search_prune_part_mean_64[25] = { + 11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f, + 9.084122f, 8.559063f, 8.499496f, 8.095865f, 8.041795f, + 10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f, + 10.484358f, 10.002225f, 9.944480f, 4.964504f, 0.897164f, + 3.306144f, 3.351039f, 0.928582f, 3.319739f, 3.287726f, +}; + +static const float av1_simple_motion_search_prune_part_std_64[25] = { + 2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 3.628949f, + 3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f, + 2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f, + 1.081292f, 0.257521f, 1.112510f, 1.089404f, +}; + +static const float av1_simple_motion_search_prune_part_mean_32[25] = { + 9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f, + 7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f, + 8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f, + 2.751266f, 0.963302f, 2.716584f, 2.709725f, +}; + +static const float av1_simple_motion_search_prune_part_std_32[25] = { + 1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f, + 1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f, + 1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f, 0.973824f, + 0.952221f, 0.188018f, 0.985295f, 0.946228f, +}; + +static const float av1_simple_motion_search_prune_part_mean_16[25] = { + 8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 
6.592615f, + 6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f, + 7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f, + 2.131698f, 0.981005f, 2.110868f, 2.106539f, +}; + +static const float av1_simple_motion_search_prune_part_std_16[25] = { + 1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f, + 1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f, + 1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f, + 0.829935f, 0.136507f, 0.828972f, 0.808563f, +}; + +static const float av1_simple_motion_search_prune_part_mean_8[25] = { + 6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f, + 4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f, + 6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f, + 1.531762f, 0.989606f, 1.496581f, 1.484139f, +}; + +static const float av1_simple_motion_search_prune_part_std_8[25] = { + 1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f, + 1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f, + 1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f, + 0.754040f, 0.101419f, 0.738239f, 0.729455f, +}; + +#define NUM_HIDDEN_LAYERS_128 1 +#define NUM_FEATURES_128 25 +#define NUM_LAYER_0_UNITS_128 8 +#define NUM_LOGITS_128 4 + +static const float av1_simple_motion_search_prune_part_logits_kernel_128[] = { + -0.129103f, 0.457758f, -0.489986f, 0.65462f, -0.184312f, 3.81202f, + -0.444407f, -0.64198f, -0.575008f, 0.0311711f, 0.525243f, -20.892f, + 1.08811f, -65.0976f, -12.3973f, -1.38278f, -0.264233f, 0.241636f, + -10.6925f, -0.725414f, -18.8987f, -40.2284f, -16.08f, 0.995331f, + 1.47614f, -0.964864f, 0.405506f, 0.140449f, 0.459534f, -1.9093f, + 0.398452f, 0.696949f +}; + +static const float av1_simple_motion_search_prune_part_layer_0_bias_128[] = { + 1.22789f, -1.34527f, 0.759048f, 0.315086f, + 
1.0834f, -1.58019f, -0.465158f, 1.20716f +}; + +static const float av1_simple_motion_search_prune_part_layer_0_kernel_128[] = { + -0.668677f, 0.58694f, -0.417094f, 0.754735f, -0.7859f, + 0.377479f, -0.0415929f, -0.0140585f, -0.730001f, 0.747528f, + -0.135247f, 0.406505f, -0.234184f, 0.956362f, -0.637555f, + 0.791884f, 0.0303722f, 1.04424f, -0.727859f, -0.274321f, + -0.122986f, 0.066312f, -0.00559175f, -0.239643f, -0.0188767f, + -0.102787f, -0.262967f, 0.071882f, -0.283398f, 0.111607f, + -0.425826f, 0.02699f, 0.108873f, -0.180558f, -0.0794057f, + 0.29665f, -0.0252969f, -0.0266213f, -0.277462f, -0.361973f, + 0.512552f, 0.395011f, -0.225876f, 0.301924f, 0.136954f, + 0.507259f, 1.23425f, 0.0137135f, 0.662572f, 0.591583f, + 0.101564f, 0.416805f, -0.645081f, -0.179086f, -0.36747f, + -0.332213f, 0.095177f, 0.220739f, -0.153256f, 0.706155f, + 0.161701f, 0.696815f, -1.21531f, -0.115059f, 0.486764f, + -0.396093f, 0.784883f, 0.535357f, -0.278021f, 0.143496f, + -0.44931f, -0.144543f, 0.319326f, 0.0190167f, -0.206295f, + 0.373995f, -0.247897f, -0.608095f, -0.41796f, -0.137129f, + -0.709562f, 0.678273f, 0.537607f, 0.557474f, 0.453308f, + 0.21405f, -0.0466495f, 0.519139f, -0.168832f, 0.902911f, + 0.681131f, -0.139876f, -0.2052f, -0.393271f, 0.262222f, + -0.246246f, -0.213993f, 0.646619f, 0.0496181f, -0.00354157f, + 0.822927f, 0.0939522f, 0.180738f, 0.118355f, 0.120456f, + -0.0472214f, -0.144958f, 0.173405f, -0.886644f, -0.0949769f, + -0.813518f, -0.3947f, -0.128021f, 0.356196f, 0.469169f, + -0.413702f, 1.04242f, 0.428853f, -0.387293f, 0.0850877f, + 0.279409f, -0.142276f, 0.0579376f, 0.211112f, 0.0703013f, + -1.9274f, -0.729147f, 0.534193f, 0.773586f, 0.922864f, + 0.642881f, 1.15127f, 0.621032f, 0.933942f, 1.01837f, + -0.660282f, -0.40059f, -1.11279f, -0.77088f, -0.43349f, + 0.202361f, -0.0840912f, 0.0935707f, 0.056333f, -0.0779369f, + 0.0173447f, -0.0104756f, 0.0115005f, -0.0195593f, 0.03592f, + -0.343454f, -0.618048f, 0.258172f, -0.412322f, -0.0463746f, + -0.0413654f, 
-0.0400194f, 0.615981f, -0.452094f, 0.644555f, + 0.0822476f, -0.359791f, -0.0904274f, 0.209427f, 0.0116338f, + -0.190978f, 0.890233f, 0.737769f, -1.66663f, -0.392605f, + 0.0785728f, -0.224553f, -0.128258f, -0.227227f, -0.0777773f, + 0.685976f, 0.347042f, -0.555325f, -0.249221f, 0.0919837f, + -0.0660016f, -0.272316f, 0.0390632f, -0.619624f, -0.0565801f, + 0.585026f, 0.597375f, 0.54114f, 0.593389f, 0.604391f, + 0.0820294f, -0.85339f, -1.40741f, -0.391675f, 0.0579205f, + -0.197626f, 0.130044f, -0.234488f, -0.0373991f, -0.0717973f +}; + +static const float av1_simple_motion_search_prune_part_logits_bias_128[] = { + 1.58571f, -4.6314f, -2.00273f, 0.543699f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_128 = { + NUM_FEATURES_128, + NUM_LOGITS_128, + NUM_HIDDEN_LAYERS_128, + { + NUM_LAYER_0_UNITS_128, + }, + { + av1_simple_motion_search_prune_part_layer_0_kernel_128, + av1_simple_motion_search_prune_part_logits_kernel_128, + }, + { + av1_simple_motion_search_prune_part_layer_0_bias_128, + av1_simple_motion_search_prune_part_logits_bias_128, + }, +}; + +#undef NUM_HIDDEN_LAYERS_128 +#undef NUM_FEATURES_128 +#undef NUM_LAYER_0_UNITS_128 +#undef NUM_LOGITS_128 + +#define NUM_HIDDEN_LAYERS_64 1 +#define NUM_FEATURES_64 25 +#define NUM_LAYER_0_UNITS_64 32 +#define NUM_LOGITS_64 10 + +static const float av1_simple_motion_search_prune_part_logits_kernel_64[] = { + 0.10424f, -0.346025f, 0.534547f, -0.385925f, 2.58341f, -0.256414f, + -0.232498f, 0.329823f, -0.0777376f, -0.590939f, 0.062657f, -0.628252f, + 0.0934588f, 2.04029f, -0.224448f, 0.371168f, -0.385348f, -0.589883f, + -3.73627f, -0.943144f, 0.346409f, -0.211215f, -0.351008f, 0.418807f, + 0.943663f, 0.173267f, 1.16585f, -0.0840888f, 0.227464f, 0.374412f, + 0.0422597f, -0.338868f, 0.222576f, 0.431713f, 1.12366f, 0.00753411f, + 0.248412f, -0.0902425f, 0.542455f, -0.665629f, -0.311245f, -0.205639f, + -0.447149f, -0.0502733f, -0.290186f, -0.794384f, 0.0940881f, -0.0686117f, + -0.0199961f, 
-0.587965f, 0.777096f, -0.083381f, -1.21282f, 0.652959f, + -1.18238f, 0.539991f, 0.352497f, -0.540076f, -0.26222f, -0.568556f, + 0.409102f, -0.131146f, -0.407161f, -0.188287f, -0.478657f, 0.000401932f, + -0.689324f, 0.351064f, -1.43704f, -0.315185f, -0.868726f, 0.376341f, + -0.0566277f, 0.364831f, 0.611298f, -0.495253f, -0.0193132f, 0.617978f, + 0.189586f, -0.236758f, -0.608246f, -0.149017f, -1.78303f, 0.143023f, + 0.698386f, -0.994086f, -0.673327f, 0.233868f, 0.360425f, 0.0294123f, + -0.248683f, -0.148392f, 0.0861829f, -0.190843f, -0.414906f, 0.607378f, + -0.756715f, -0.511713f, -0.321556f, 1.0078f, -1.18141f, 0.519751f, + 0.834629f, -0.359343f, 0.612262f, -0.0730553f, 0.262935f, 0.488276f, + 0.387071f, -1.44123f, 1.08269f, 0.554402f, -0.069f, 0.14113f, + 0.323817f, 0.824314f, -0.431417f, -0.349448f, 0.950728f, -0.587836f, + -0.83914f, -0.10844f, 0.26602f, 0.831933f, -0.271315f, 0.231563f, + 0.417049f, 0.190627f, -0.0940667f, 0.255363f, -0.0741022f, -0.0987662f, + -0.847522f, 0.00287554f, 0.0615741f, -0.0832218f, 0.0847148f, -0.392843f, + -0.938068f, -0.10621f, -0.260859f, -0.825175f, -0.401039f, 0.315213f, + -0.108269f, 0.288036f, -8.66166f, -0.970752f, -0.66678f, -0.593405f, + -0.518294f, -0.138722f, -0.454698f, -0.22969f, -0.553006f, -0.440111f, + 0.462661f, -0.536854f, 0.0108295f, -0.522888f, 0.00111157f, 0.229999f, + 0.0267768f, 0.176266f, -1.57043f, 0.0318106f, 0.257534f, -0.198583f, + 0.175564f, -0.251465f, -0.262441f, -1.65283f, -0.319603f, -0.875282f, + -0.301303f, 0.0170948f, -0.227075f, 0.0299545f, -4.98346f, 0.470046f, + -1.28051f, -0.213809f, -0.486585f, -0.906463f, -0.169984f, -0.333153f, + -0.376733f, 0.108016f, 0.486744f, -0.186936f, -0.429259f, 0.056501f, + -0.266545f, 0.265447f, -0.137718f, -0.490687f, -0.935668f, -0.16229f, + -0.696932f, 0.173157f, 0.434959f, -0.140595f, 0.345845f, -1.08013f, + -0.0205929f, -0.815874f, -0.179812f, 0.02767f, -0.141727f, 0.471936f, + -7.29453f, -1.04362f, -0.745482f, -0.28725f, -0.214997f, -0.0850651f, + 
-0.748471f, 0.161325f, -1.04387f, -0.705305f, 0.489427f, -0.765373f, + -0.301576f, 0.0742467f, -0.331282f, 0.0372328f, -0.90298f, -0.0608646f, + -2.18756f, 0.170384f, -0.258357f, 0.106287f, -0.161684f, -0.103799f, + -0.127774f, -0.156313f, 0.0705286f, -0.977908f, -0.281191f, -0.056757f, + -0.309474f, 0.050476f, -9.78198f, -2.42795f, -0.289626f, -1.07579f, + -0.439256f, -1.09948f, -0.564671f, 0.0913182f, -0.417216f, -1.19909f, + 0.287063f, 0.402315f, -0.17646f, 0.540488f, 0.00840239f, 0.397492f, + 0.702393f, -0.10566f, 0.655296f, -0.0443876f, 0.154918f, -0.760479f, + -0.0523153f, -0.366199f, -1.08212f, -0.398556f, -0.415203f, -1.10488f, + 0.208349f, 0.27079f, 0.101546f, -0.205752f, -13.7923f, -0.218637f, + -1.10077f, 0.355735f, -0.306196f, 0.627434f, -0.473101f, -0.308027f, + -1.12724f, 0.301597f, 0.660785f, 0.0576217f, -0.155925f, -0.56107f, + -0.223537f, 0.114299f, -0.53803f, -0.252674f, -2.66103f, -0.185245f, + -0.314673f, 0.403337f, 0.679821f, -0.69231f, 0.506264f, -0.999705f, + -0.549097f, 0.353745f, 0.188249f, 0.414484f, -0.615853f, 0.525681f, + -5.23065f, -3.05174f, 1.02074f, -0.965499f, -0.158947f, 0.0436088f, + -0.485824f, 0.0375094f, -1.39985f, -0.481392f, 0.485785f, -0.24874f, + -0.359633f, 0.668108f +}; + +static const float av1_simple_motion_search_prune_part_layer_0_bias_64[] = { + 0.0735592f, -0.045064f, -0.0114103f, 1.39246f, -0.683467f, 0.155765f, + -0.667652f, -0.202425f, -0.585433f, -0.146752f, -0.0812931f, 0.580642f, + 0.578542f, -0.831916f, 0.610063f, 0.0101856f, -0.235863f, 0.538141f, + -2.91334f, -1.71887f, 0.126616f, 0.582497f, -0.438879f, 0.221833f, + 0.850773f, -0.280886f, 0.443233f, -0.0964873f, -0.216161f, 0.34413f, + 0.656818f, 0.0169274f +}; + +static const float av1_simple_motion_search_prune_part_layer_0_kernel_64[] = { + -0.310947f, -0.232675f, 0.0171092f, 0.0834474f, 0.373977f, + 0.300429f, 0.215072f, -0.454074f, 0.187565f, 0.282742f, + 0.562562f, -0.0419322f, 0.000978486f, -0.298267f, 0.216934f, + -0.388722f, -0.146866f, 
-0.275946f, 0.202361f, 0.225847f, + 1.42868f, 0.473127f, -0.145747f, -0.104986f, 0.153459f, + 0.69382f, 0.162266f, 0.0207715f, -0.45095f, -0.412071f, + -0.235109f, -0.130199f, 0.231741f, 0.460193f, 0.0378202f, + 0.429516f, 0.387691f, -0.272479f, 0.0723884f, -0.453914f, + -0.150618f, -0.10745f, -0.258615f, 0.0838312f, -0.00554958f, + 0.105377f, -0.0415479f, 0.13228f, 1.09044f, -0.73053f, + -0.422553f, -0.435842f, 0.211416f, 0.420332f, 0.0181353f, + -0.030891f, 0.522788f, 0.613526f, 0.374032f, 0.287986f, + -0.403118f, -0.287362f, -1.11523f, -0.577713f, -0.020228f, + 0.86465f, -0.0590579f, 0.341274f, -0.0115644f, -0.260236f, + 0.192123f, -0.0849825f, 0.0501709f, 0.444382f, 0.0762727f, + 0.0926596f, -0.101157f, -0.142787f, 0.40861f, 0.555805f, + -0.00614654f, -0.122846f, 0.203163f, 0.234266f, 0.409795f, + -0.0206245f, -0.224679f, 0.025081f, 0.518044f, -0.287186f, + 0.016494f, -0.0886331f, 0.236438f, -1.01032f, 0.118332f, + 0.364217f, 0.061438f, 0.0381303f, 0.128418f, 0.0257077f, + -0.975751f, -0.694894f, 0.00351914f, 0.278179f, 0.29363f, + 0.525576f, 0.0604849f, 0.531734f, 0.406643f, 0.812497f, + -0.403196f, -0.16664f, -0.620887f, -0.428194f, 0.275401f, + 0.432063f, -0.00378342f, 0.295758f, 0.105615f, -0.00683626f, + 0.00396146f, 0.00598654f, -0.0131701f, -0.0115787f, 0.00386643f, + -0.69686f, -0.139623f, -0.440817f, 0.0542873f, 0.217962f, + 0.527035f, -0.0201046f, 0.0471354f, 0.0271858f, -0.0775197f, + -0.309797f, 0.184879f, -0.232854f, -0.407081f, 0.706227f, + -0.0877534f, 0.306843f, 0.455075f, -0.333961f, 0.0759148f, + 0.0444791f, -0.0693626f, -0.0850289f, -0.513063f, -0.643971f, + -0.630279f, -0.153889f, 0.123315f, 0.00548238f, 0.170707f, + 0.734339f, -0.176988f, 0.322519f, 0.178365f, 0.183519f, + -0.698683f, -0.12043f, -0.349914f, -0.0696762f, -0.53986f, + -0.104738f, 1.05264f, 0.983568f, -0.109035f, 0.0113748f, + 0.0815189f, -0.0628812f, 0.0769389f, 0.010261f, 0.146573f, + -0.433194f, -0.211572f, -0.000397392f, 0.445325f, 0.145091f, + -0.0625902f, 0.29394f, 
0.302315f, 0.0892226f, -0.209504f, + -0.0150374f, 0.242608f, 0.216223f, 0.366857f, 0.209829f, + -0.540035f, 0.117599f, -0.329315f, 0.0471133f, -0.0115449f, + -0.0638235f, 0.0527461f, 0.348149f, 0.360802f, 1.06624f, + -0.615991f, -0.341396f, 0.18972f, 0.0709888f, -0.0414466f, + -0.0193809f, 0.0938933f, 0.209058f, 0.575042f, 0.483608f, + -0.285875f, -0.115905f, -0.363637f, 0.375425f, 0.336217f, + 0.0336358f, -0.00265618f, -0.406854f, -0.792959f, -0.219354f, + 0.0331615f, 0.0298859f, -0.211446f, -0.00280773f, -0.194011f, + 0.262109f, 0.548076f, 0.120183f, -0.661603f, 0.241855f, + -0.501428f, 0.00102718f, -0.347331f, -0.58306f, 0.0977254f, + 0.117491f, 0.0840667f, 0.00693675f, 0.000600294f, 0.649569f, + -0.0553811f, -0.197198f, 0.397236f, -0.523737f, -0.564192f, + -0.374679f, -0.249344f, 0.00861428f, 0.00393439f, -0.0834608f, + 0.124389f, -0.0393049f, 0.0425391f, -0.153383f, -0.182346f, + 0.420953f, 0.464221f, 0.288984f, 0.570921f, -0.239965f, + 0.247239f, -0.083434f, 0.714418f, 0.986323f, -0.460244f, + -0.260993f, -0.947743f, -1.0789f, -0.0391231f, 0.612407f, + -0.0306767f, 0.281419f, 0.0072426f, -0.37623f, 0.188744f, + 0.221666f, -0.424914f, 0.29703f, 0.261715f, 0.277809f, + -0.0617616f, -0.000611999f, -0.0547053f, -0.0901018f, -0.347669f, + 0.856072f, 0.596675f, -0.467639f, -1.09324f, -0.184224f, + -0.56051f, -0.0144704f, 0.102894f, -0.122982f, -0.0020749f, + -0.0423487f, 0.0328702f, -0.0154263f, 0.0349021f, -0.00315595f, + 0.0254802f, -0.729191f, 0.207296f, -0.0212349f, -0.207078f, + 0.20636f, -0.156883f, 0.429765f, -0.42672f, 0.138775f, + -0.0267343f, 0.631528f, 0.300646f, -0.4793f, -0.273833f, + -0.0135367f, -0.530819f, -0.534881f, 0.830896f, 0.0266992f, + 0.473744f, 0.210334f, 0.0234739f, 0.255394f, 0.123531f, + -0.489341f, -0.796627f, 0.372617f, 0.190136f, 0.275342f, + 0.739505f, 0.402354f, 0.782806f, 0.437374f, 1.04948f, + -0.55963f, 0.382704f, -0.698321f, 0.0817868f, -0.440108f, + -0.0635004f, -0.277851f, -0.524194f, 0.286157f, -0.01097f, + -0.0293145f, 
-0.0405071f, -0.035662f, -0.012871f, -0.0516409f, + -0.406671f, 0.709259f, -0.525177f, 0.521123f, -0.44813f, + 0.48412f, -0.0546513f, 0.305253f, -0.468328f, 0.316453f, + -0.36307f, 0.497515f, -0.0606276f, 0.315764f, -0.422066f, + 0.554025f, -0.679183f, 0.616914f, 0.00283324f, -0.000643824f, + 0.0639999f, 0.0488285f, -0.141031f, 0.068003f, -0.0792678f, + -0.425307f, -0.152235f, 0.269917f, -0.352327f, 0.44792f, + -0.116514f, -0.465868f, 0.154287f, 0.0161028f, -0.16848f, + -0.255487f, 0.189832f, 0.254883f, 0.0240822f, 0.432638f, + -0.136564f, 0.137036f, 0.0375734f, 0.989246f, -0.126287f, + 0.111416f, -0.0271002f, 0.718755f, -0.0412969f, 0.00645681f, + 0.253811f, -0.0186998f, 0.691971f, -0.282042f, -0.0783915f, + 0.274592f, -0.358449f, 0.34155f, -0.186374f, -0.136907f, + -0.192334f, -0.251168f, -0.100874f, -0.166578f, -0.336507f, + 0.402373f, 0.173695f, 0.108788f, 0.00885581f, -0.310063f, + 1.05545f, 0.0295867f, 0.180785f, -0.173469f, -0.469924f, + -0.224155f, 0.665862f, -0.126546f, 0.240691f, -0.0415301f, + -0.598534f, 0.0012723f, -0.122297f, -0.558947f, 0.268844f, + 0.241193f, 0.0524422f, -0.1683f, 0.575588f, -0.139012f, + 0.0636691f, -0.446709f, -0.094532f, 0.883809f, -0.112981f, + -0.224047f, 0.0811193f, -0.140571f, -0.09683f, -0.0796143f, + -0.102246f, -0.863392f, -0.0755124f, 0.23125f, -0.0301361f, + -0.153029f, -0.172238f, -0.0286382f, -0.338495f, -0.317216f, + -0.146629f, -0.242264f, -0.702306f, -0.285052f, 0.0623479f, + 0.265735f, 0.00674475f, 0.666196f, 0.883586f, 0.278416f, + -0.341692f, -0.509931f, -0.156263f, 0.635885f, -0.544143f, + -0.572632f, -0.213285f, 0.443396f, -0.268329f, 0.0638439f, + -0.185397f, 0.071126f, 0.386503f, -0.402212f, -0.140784f, + -0.411661f, 0.049398f, -0.0672907f, -0.267034f, -0.0560875f, + 0.0607937f, 0.0445484f, -0.547651f, 0.574718f, 0.417189f, + -0.0610166f, 0.0632293f, 0.391619f, -0.00671215f, -0.136883f, + -0.339346f, 0.0356183f, 0.511993f, 0.178676f, 0.286998f, + 0.136511f, -0.00796929f, 0.203985f, 0.0423532f, -0.175196f, + 
0.378534f, 0.770417f, 0.593778f, 0.0256067f, -0.82394f, + -0.500691f, -0.425725f, -0.623708f, -0.0406241f, -0.00226464f, + 0.0207836f, 0.30732f, -0.00784268f, 0.0065445f, -0.0991039f, + -0.20871f, -0.206835f, 0.281219f, 0.119361f, 0.259346f, + -0.102713f, 0.186488f, -0.034455f, -0.00198392f, -0.279107f, + -0.638993f, -0.374404f, -0.48601f, -0.262345f, 0.624532f, + 0.620632f, -0.227014f, 0.433579f, -0.0455096f, 1.22123f, + -0.429156f, 0.12396f, 0.0815152f, -0.0837355f, 0.0282623f, + -0.407475f, 0.787321f, -0.434974f, 0.312904f, -0.230805f, + 0.213042f, -0.250929f, 0.302997f, -0.354709f, 0.0504905f, + -0.561706f, 0.595558f, 0.374951f, 0.802969f, -0.674902f, + 0.33136f, 0.156606f, 0.0218968f, -0.694188f, -0.0221949f, + -0.00639123f, 0.0146536f, 0.0104145f, 0.021635f, -0.0499428f, + -0.575116f, -0.239035f, -0.0588276f, 0.599722f, 0.541932f, + 0.437433f, 0.716268f, 0.193207f, 0.548351f, 0.326951f, + -0.197124f, 0.0355353f, -0.0952009f, -0.217265f, -0.389789f, + 0.0528124f, -0.21334f, -0.190296f, -1.17367f, 0.108905f, + 0.109397f, -0.0192577f, 0.0343813f, 0.085004f, -0.0556737f, + -0.0411158f, -0.534989f, 0.0361896f, 0.124415f, 0.291603f, + -0.0311974f, -0.326726f, 0.343131f, 0.0276456f, -0.231827f, + -0.373894f, -0.208898f, -0.273011f, 0.061323f, -0.0910538f, + -0.30746f, -0.108644f, -0.190736f, 1.58048f, -0.0739711f, + -0.0623489f, -0.137967f, -0.0601359f, -0.133004f, -0.0857153f, + 0.00955987f, -0.365561f, -0.0329051f, 0.463463f, 0.14758f, + -0.512256f, -0.227463f, -0.26008f, -0.567777f, 0.0646234f, + 1.02161f, 0.66157f, -0.16733f, 0.264921f, -0.242036f, + 0.214622f, 0.0712054f, -0.260377f, 0.0849665f, 0.735094f, + 0.11001f, 0.297301f, -0.333342f, 0.066978f, -0.123625f, + 1.07596f, 0.401263f, 0.0800875f, -0.340862f, -0.115587f, + -0.32692f, -0.300842f, 0.0277397f, 0.0630788f, -0.261198f, + 0.428695f, -0.0544757f, -0.124511f, 0.036992f, 0.126322f, + 0.0317603f, 0.0820762f, 0.117277f, -1.14594f, -0.108076f, + -0.0258198f, -0.00337525f, -0.00512531f, 0.1274f, 
-0.0660535f, + -0.640733f, 0.197142f, 0.147278f, 0.489271f, 0.226507f, + -0.0668414f, 0.0946318f, 0.0994164f, -0.820516f, 0.512939f, + -0.305172f, -0.715187f, -0.195125f, 0.279346f, 0.462144f, + 0.913882f, -0.453879f, 0.0582033f, -0.462866f, 0.0538736f, + 0.0115737f, 0.00626993f, -0.0185185f, 0.0114601f, -0.0181164f, + 0.41588f, -0.0447331f, 0.611756f, 0.43385f, 0.834465f, + 0.122019f, -0.352983f, 0.340429f, -0.245425f, -0.365328f, + -0.521825f, 0.0371057f, 0.172188f, -0.387949f, 0.221054f, + 0.0126359f, 0.422958f, 0.584198f, -0.581498f, -0.019466f, + -0.0271737f, -0.0740885f, 0.00540879f, 0.186086f, -0.0324402f, + -0.563462f, -0.458759f, -0.425296f, -0.0118862f, -0.641508f, + 0.0132084f, 0.0581128f, 0.0231444f, 0.468587f, 0.258838f, + 0.0296665f, 0.0562801f, 0.630014f, 0.381816f, -0.269761f, + -0.135515f, 0.046186f, 1.07632f, -0.050616f, 0.104987f, + 0.29991f, 0.119316f, 0.117248f, 0.0795009f, 0.242573f, + 0.0416634f, -0.0577639f, -0.0974078f, 0.106255f, -0.13098f, + 0.0141486f, -0.00418257f, 0.144848f, -0.463934f, 0.0452591f, + 0.252617f, 0.205222f, -0.189843f, 0.0652245f, -0.135386f, + 0.0500646f, -0.200368f, -0.0142312f, -0.0286832f, -0.254355f, + -1.02752f, -0.73549f, 0.0364518f, 0.0416227f, -0.13185f, + -0.0886515f, -0.502314f, -0.102916f, 0.410911f, -0.355655f, + 0.400416f, -0.340217f, 0.208829f, 0.245972f, 0.149739f, + -0.49458f, 0.589482f, 0.550827f, 0.912709f, -0.351275f, + -0.128076f, -0.285172f, -0.672752f, 0.090583f, -0.245286f, + -0.737297f, -0.201515f, -0.025122f, -0.109854f, 0.36738f +}; + +static const float av1_simple_motion_search_prune_part_logits_bias_64[] = { + 0.346819f, 0.442965f, -0.0216032f, 0.0229235f, -0.402797f, + -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_64 = { + NUM_FEATURES_64, + NUM_LOGITS_64, + NUM_HIDDEN_LAYERS_64, + { + NUM_LAYER_0_UNITS_64, + }, + { + av1_simple_motion_search_prune_part_layer_0_kernel_64, + 
av1_simple_motion_search_prune_part_logits_kernel_64, + }, + { + av1_simple_motion_search_prune_part_layer_0_bias_64, + av1_simple_motion_search_prune_part_logits_bias_64, + }, +}; + +#undef NUM_HIDDEN_LAYERS_64 +#undef NUM_FEATURES_64 +#undef NUM_LAYER_0_UNITS_64 +#undef NUM_LOGITS_64 + +#define NUM_HIDDEN_LAYERS_32 1 +#define NUM_FEATURES_32 25 +#define NUM_LAYER_0_UNITS_32 28 +#define NUM_LOGITS_32 10 + +static const float av1_simple_motion_search_prune_part_logits_kernel_32[] = { + 0.486581f, 0.340847f, -0.109226f, 0.467224f, -0.541561f, + 0.0943619f, -0.429442f, -0.207442f, 0.959963f, 0.618666f, + -0.0636751f, 0.144508f, -0.0278289f, 0.332293f, -0.751493f, + 0.245438f, -0.917758f, 0.612128f, -0.32648f, 0.534618f, + -0.615239f, 2.71641f, 0.233759f, 0.820558f, -0.249758f, + -0.427783f, -0.359361f, 0.0375732f, 0.806973f, 0.352512f, + -0.0532192f, 0.0576861f, -0.464178f, -0.334877f, -0.697042f, + 0.0538218f, 0.0919659f, -0.00765812f, 0.0603847f, -0.460315f, + 0.37979f, -0.0867612f, -0.670683f, -0.188619f, -0.570586f, + 0.233418f, 0.153581f, 0.290905f, -0.624885f, -0.557842f, + -0.555567f, 0.463773f, -0.123909f, -0.277731f, 0.0374468f, + 0.409903f, 0.287638f, -0.593066f, -0.223434f, 0.154263f, + -0.250464f, -0.077696f, 0.229652f, -0.304174f, 0.308053f, + 0.33155f, -0.502825f, 0.361216f, -0.499294f, 0.00595444f, + -0.307201f, 0.5766f, -0.438384f, -0.093701f, -0.118586f, + 0.202337f, -0.486623f, 0.261552f, 0.139756f, -0.655642f, + -0.0627001f, -0.213053f, -0.243037f, 0.205918f, 0.0718368f, + 0.188041f, 0.141529f, -0.132239f, 0.425827f, -0.218353f, + 0.153114f, 0.33268f, 0.0226116f, 0.167394f, 0.269854f, + -0.457001f, 0.1973f, -0.526087f, 0.467528f, 0.290934f, + 1.16267f, 0.0823663f, -0.754389f, -0.83716f, 0.270157f, + -1.41229f, 0.148511f, -0.286832f, 0.664796f, 0.492254f, + 0.360567f, -0.533993f, 0.0435672f, -0.103001f, 0.220668f, + 0.594621f, -0.0213356f, -0.347638f, -0.694457f, 0.0759505f, + 0.161358f, -0.389384f, -0.0455192f, -0.61252f, -0.174173f, + 
-0.00788878f, -1.22487f, 0.332233f, -0.0457021f, -0.225918f, + -0.197657f, -0.115408f, -0.240589f, -2.05681f, 0.00914629f, + -1.92213f, 0.0268578f, -0.49076f, -0.0120123f, 0.291157f, + 0.267116f, -0.0775724f, 0.181115f, -0.392441f, -0.488114f, + -0.28842f, -0.115465f, 0.128974f, -0.0829899f, -0.14096f, + -0.140145f, -0.700281f, 0.0368945f, -0.437598f, 0.243485f, + -1.00301f, 0.332324f, 0.125014f, -0.0604481f, -0.0652028f, + -0.207295f, -1.0209f, -0.341525f, 0.191326f, -0.147578f, + 0.0878327f, 0.129827f, -0.0848319f, 0.187381f, -1.28663f, + 0.00537885f, -0.134277f, -0.0411126f, -0.3434f, -0.0456494f, + 0.37861f, 0.409095f, 0.237177f, -0.396855f, -0.205418f, + -1.31701f, -0.319032f, -0.123404f, -0.240005f, -0.305206f, + -0.0258176f, -0.26367f, -0.142396f, 0.191672f, -1.44061f, + 0.0554776f, -0.571839f, -0.284789f, -0.425677f, -0.0307376f, + 0.20275f, -0.223146f, 0.144612f, 0.0212636f, 0.0238303f, + -0.253802f, -0.188922f, -0.0637066f, -0.340836f, 0.124774f, + 0.130474f, -0.154099f, -0.0292733f, 0.158148f, -0.246989f, + -0.259059f, 0.220224f, 0.228449f, -0.41956f, -0.321848f, + -0.2396f, -0.316449f, -1.3363f, 0.0264099f, -1.46865f, + 0.113073f, 0.0722885f, -0.166986f, -0.164877f, 0.0360911f, + 0.534472f, -0.551152f, -0.328501f, 0.0781121f, -0.378112f, + -0.459502f, 0.28015f, -0.212302f, -0.521641f, 0.618993f, + -0.347709f, 0.266253f, -0.0280894f, 0.348511f, -0.0155031f, + -0.100693f, 0.0447673f, 0.277519f, -0.233998f, -0.0796738f, + -1.73644f, -0.160776f, 0.53092f, -0.180406f, 0.056447f, + 0.385356f, -0.262337f, -0.241479f, -0.271426f, -0.457354f, + -0.266788f, 0.367371f, -0.103065f, 0.47783f, -0.188327f, + -0.159636f, 0.00142907f, -0.409756f, 0.454889f, -0.24566f, + -0.0760084f, 0.286355f, 0.462102f, 0.0431695f, -0.127395f, + -0.200476f, -0.350557f, 0.217275f, -0.23975f, 0.255148f, + -0.280626f, 0.42476f, 0.157411f, 0.0358675f, -0.192591f +}; + +static const float av1_simple_motion_search_prune_part_layer_0_bias_32[] = { + 0.940498f, 0.15602f, -0.234831f, 
0.0268585f, 0.144769f, 0.243081f, + 0.611406f, 0.366093f, 0.361868f, 0.39668f, 0.401479f, 0.369467f, + 0.0909503f, 0.710595f, 0.032786f, 0.525891f, -1.0232f, 0.732557f, + -0.064425f, 0.865222f, -0.042917f, -0.237191f, -0.527006f, -0.0172101f, + 0.59681f, -0.472405f, 0.0969218f, -0.250624f +}; + +static const float av1_simple_motion_search_prune_part_layer_0_kernel_32[] = { + 0.355607f, 0.126701f, -0.0825159f, 0.200675f, -0.011308f, + -0.280057f, 0.559816f, 0.142689f, 0.0422419f, -0.151692f, + -0.0275637f, -0.283101f, -0.20822f, -0.200394f, 0.465427f, + 0.344491f, -0.525319f, -0.358813f, -0.39767f, 0.0974486f, + 0.00559058f, -0.00546089f, 0.0506486f, 0.114475f, -0.0436463f, + -0.574152f, -0.376294f, 0.16563f, -0.0967032f, 0.00579838f, + 0.0639909f, -0.037129f, 0.407574f, -0.231428f, 0.489326f, + -0.221566f, -0.270382f, -0.784628f, -0.155502f, 0.481698f, + -0.0296057f, 0.431855f, 0.840807f, 0.112291f, 0.773874f, + -0.0610936f, -0.012892f, 0.365154f, 0.0267687f, -0.0751114f, + 0.25043f, 0.516472f, -0.186133f, -0.12762f, -0.168804f, + -0.146309f, 0.139314f, -0.367113f, -0.601079f, 0.0559856f, + 0.176081f, 0.22397f, 0.434113f, 0.0363256f, 0.313051f, + 0.0143976f, 0.190076f, 0.474607f, -0.681134f, -0.0709097f, + -0.253289f, -0.216277f, -0.0593789f, -0.107795f, -0.194842f, + 0.513945f, 0.239171f, -0.720561f, 0.0136723f, -0.391147f, + -0.272043f, -0.164766f, 0.124248f, 0.147178f, -0.35497f, + 0.397725f, -0.117603f, 0.262937f, -0.331964f, 0.182418f, + 0.315671f, -0.0385649f, 0.488769f, -0.334568f, 0.00596018f, + 0.0661557f, -0.0446985f, -0.0928255f, -0.0221032f, -0.019045f, + -0.20881f, 0.197907f, -0.381881f, 0.0598071f, -0.0434551f, + 0.159283f, -0.110631f, 0.266996f, -0.0265494f, 0.135199f, + -0.00833162f, 0.804482f, -0.114698f, -0.15066f, -0.479553f, + 0.448407f, -0.344069f, -0.0280952f, -0.208211f, -0.102269f, + -0.679066f, -0.37476f, -0.0228875f, 0.0535049f, 0.111015f, + -0.18125f, -0.167584f, 0.0110497f, 0.262723f, -0.413839f, + -0.0611238f, 0.358499f, 0.0807514f, 
0.208254f, 0.214499f, + 0.11137f, -0.14262f, -0.0513973f, 0.243718f, -0.373716f, + -0.00413366f, 0.216501f, -0.164149f, -0.064935f, -0.0840282f, + 0.0566148f, 0.0377686f, 0.289835f, 0.769388f, 0.891198f, + -0.592739f, 0.40744f, -0.153095f, 0.657311f, 0.140737f, + 0.28209f, 0.158344f, 0.353546f, 0.0868246f, 0.116887f, + 0.402004f, 0.437184f, 0.589219f, 0.760594f, -0.575419f, + -0.754308f, -0.709219f, -0.297814f, -0.418609f, -0.0262104f, + 0.0411959f, 0.0597708f, -0.143728f, -0.136642f, 0.099614f, + -0.257601f, -0.2404f, 0.305893f, 0.254009f, -0.0301398f, + -0.0653091f, -0.459002f, -0.163404f, 0.123152f, -0.0284252f, + -0.457272f, 0.00788622f, -0.828399f, -0.0534199f, 0.586877f, + 0.982728f, 0.424581f, 0.0891856f, 0.383182f, -0.122053f, + 0.0808408f, -0.00384914f, -0.0560201f, -0.0524772f, -0.263444f, + -0.239287f, -0.882777f, 0.0180592f, -0.0948711f, -0.177946f, + 0.0296473f, 0.096082f, 0.0455604f, -0.108608f, 0.00777951f, + -0.140896f, 0.117187f, -0.342467f, -0.0691604f, 0.0761611f, + -0.0892053f, 0.111386f, -0.167456f, 1.40616f, -0.00478793f, + 0.00547665f, -0.0441829f, 0.0151323f, -0.0674099f, -0.0380578f, + 0.16072f, 0.31882f, 0.245486f, -0.424318f, 0.101845f, + -0.203343f, -0.197402f, -0.163025f, -0.0771961f, -0.264435f, + 0.319429f, 0.250076f, 0.782726f, 0.386003f, 0.00700673f, + -0.375715f, 0.151453f, -0.296265f, -0.560183f, -0.00767249f, + -0.109593f, -0.119419f, -0.0161516f, 0.0380283f, -0.156417f, + 0.131708f, 0.396268f, -0.221796f, 0.232099f, 0.128852f, + 0.0567268f, 0.297297f, 0.173269f, 0.213411f, 0.0384426f, + -0.290985f, -0.0426841f, -0.488292f, -0.087101f, -0.311582f, + 0.83009f, -0.153163f, 0.903335f, -1.15644f, -0.0378635f, + -0.0552129f, -0.126362f, -0.176945f, 0.0653115f, 0.0989368f, + -0.333543f, -0.330586f, 0.29775f, -0.103535f, 0.210824f, + -0.00300509f, 0.317105f, 0.216852f, 0.479718f, 0.0485808f, + -0.15662f, 0.718199f, 0.327513f, 0.115169f, -0.423598f, + -0.456633f, -0.575814f, -0.494454f, 0.304411f, 0.0493055f, + -0.381171f, 0.467251f, 
-0.122872f, -0.167441f, 0.017253f, + -0.0583646f, -0.1586f, 0.214046f, -0.0284424f, -0.217112f, + 0.606567f, -0.107533f, 0.36615f, -0.0709227f, 0.604761f, + -0.244657f, -0.296651f, -0.595611f, -0.156629f, -0.693468f, + -0.310603f, 0.499272f, 0.282941f, 0.295043f, -0.178704f, + 0.281186f, 0.014329f, -0.120819f, 0.154234f, 0.0131325f, + -0.472231f, -0.631281f, 0.422955f, 0.711432f, -0.118025f, + 0.0864996f, 0.343971f, -0.301477f, -0.246638f, 0.165068f, + 0.218044f, 0.224236f, -0.0848522f, 0.00671216f, 0.401141f, + -0.218857f, -0.0298495f, -0.135725f, -0.377618f, 0.022473f, + 0.106955f, -0.0582005f, 0.0468484f, -0.0217442f, 0.130911f, + -0.0926905f, 0.383007f, -0.159353f, -0.222711f, -0.0286419f, + 0.372315f, -0.469095f, 0.797571f, -0.301315f, 0.239327f, + -0.997507f, -0.363409f, 0.353717f, 0.676686f, -0.0500028f, + 0.0638539f, -0.431927f, 0.243852f, 0.000884826f, -0.00166585f, + 0.0613292f, -0.029558f, -0.0248432f, -0.0125607f, -0.0309674f, + -0.743308f, 0.0409806f, 0.0921015f, 0.167816f, 0.406849f, + 0.095677f, 0.0308913f, 0.139956f, -0.400472f, 0.396617f, + 0.936517f, 0.355057f, -0.423816f, -0.232472f, -0.220188f, + -0.399746f, -0.409623f, -0.158797f, 0.361153f, 0.0327019f, + 0.0690844f, -0.032197f, 0.0248558f, 0.00438518f, 0.0222724f, + -0.326832f, -0.314295f, 0.156563f, 0.0562703f, 0.332694f, + 0.299424f, 0.228206f, 0.322038f, 0.0136098f, 0.0060297f, + -0.165851f, -0.306512f, 0.0796508f, -0.37158f, 0.239395f, + -0.349442f, 0.198515f, -0.253854f, -1.13694f, 0.0202873f, + -0.0504009f, -0.130528f, -0.017126f, -0.0370001f, -0.087458f, + -0.119952f, -0.130404f, 0.0333733f, -0.184736f, 0.182162f, + 0.227776f, -0.166563f, -0.156162f, 0.118215f, -0.220183f, + 0.00474779f, -0.107792f, 0.260493f, 0.11884f, 0.156587f, + 0.303936f, -0.131788f, -0.314774f, 0.310606f, 0.0935523f, + 0.790767f, 0.26461f, 0.0236426f, 0.0629469f, 0.0344072f, + -0.151513f, 0.211498f, 0.0245435f, 0.0629973f, 0.052019f, + -0.03308f, 0.123487f, 0.0885027f, 0.159172f, -0.0510615f, + 0.0298033f, 
-0.130515f, -0.121799f, -0.104915f, 0.208822f, + -0.310496f, -0.314106f, 0.303307f, -0.0196736f, 0.0420045f, + 0.461777f, -0.433699f, 0.00345407f, 0.703139f, -0.655637f, + -0.210767f, -0.201278f, 0.163694f, -0.236534f, 0.300877f, + 0.0769982f, -0.282453f, 0.149721f, -0.0303466f, -0.191473f, + -0.406056f, -0.213472f, 0.1619f, -0.245953f, 0.00544399f, + -0.121434f, 0.193012f, -0.307165f, 1.45431f, -0.161468f, + -0.12444f, -0.146129f, -0.0528212f, -0.0925165f, -0.134528f, + -0.479475f, 0.315525f, 0.133845f, 0.382158f, -0.0799693f, + -0.151041f, 0.255772f, 0.409536f, -0.240663f, -0.323741f, + -0.205876f, 0.03699f, -0.217541f, 0.108511f, 0.640628f, + 0.705993f, -0.423899f, -0.78314f, -0.100733f, -0.00859087f, + 0.0251879f, 0.0458335f, 0.00210128f, -0.047576f, -0.0560518f, + -1.23869f, -0.829914f, 0.0346551f, 0.350505f, 0.193688f, + 0.459154f, 0.137898f, 0.503818f, 0.260867f, 0.649539f, + 0.0150802f, 0.0239274f, -0.276069f, -0.0621478f, -0.193106f, + -0.0375665f, -0.654529f, 0.189493f, 0.446625f, -0.0208265f, + 0.019838f, -0.0201955f, 0.00180428f, -0.0110678f, -0.0172414f, + 0.0276489f, -0.252882f, -0.0351807f, -0.0518874f, 0.279098f, + -0.245122f, 0.101287f, -0.114202f, -0.0812187f, 0.572429f, + -0.0821731f, 0.564183f, 0.0222552f, 0.190111f, -0.0417497f, + -0.00385925f, -0.182995f, -0.240482f, -0.291572f, -0.0450444f, + 0.0962974f, -0.165973f, -0.0954637f, -0.163841f, -0.833405f, + -1.31541f, -0.336473f, -0.0920702f, 0.816105f, 0.393377f, + 0.0340241f, -0.0844545f, 0.61729f, -0.17596f, 0.241149f, + -0.42825f, -0.59091f, -0.290702f, 0.0796465f, 0.0982819f, + 0.466934f, 0.261666f, 0.0373333f, 0.332509f, -0.0266694f, + -0.0476951f, -0.00642167f, -0.0132542f, -0.000320841f, 0.00475532f, + 0.000502778f, 0.296534f, -0.13297f, -0.113082f, -0.327923f, + 0.35901f, -0.302246f, 0.189799f, -0.37994f, 0.16107f, + -0.20414f, 0.548575f, -0.460821f, 0.591878f, -0.213113f, + -0.169373f, -0.07332f, 0.228841f, 0.682302f, -0.0665316f, + -0.142456f, -0.0873117f, 0.00607451f, 0.0376443f, 
0.0536673f, + -0.0109536f, -0.400279f, 0.550058f, 0.820871f, -0.666373f, + -0.471962f, -0.315925f, -0.313142f, 0.952742f, 0.473928f, + -0.119006f, 0.153241f, -0.0383078f, 0.631869f, -0.343423f, + -0.233473f, -0.218195f, -0.077688f, -0.728291f, 0.0382408f, + -0.00662886f, -0.0419666f, 0.0309776f, -0.0281592f, 0.0154229f, + -0.198534f, 0.0206324f, 0.0152272f, -0.235067f, 0.0330486f, + 0.139198f, -0.0612118f, 0.133154f, -0.258675f, 0.0900275f, + -0.127771f, 0.157322f, -0.00767807f, -0.329258f, 0.327458f, + 0.0528581f, -0.181125f, 0.409995f, -0.162979f, -0.0193475f, + 0.186009f, 0.0519501f, 0.651877f, -0.37821f, -1.10341f, + -0.189776f, -0.0922788f, 0.460256f, 0.168011f, 0.440295f, + 0.478135f, 0.374573f, 0.384048f, 0.116953f, 0.68886f, + -0.427727f, -0.36676f, -0.500013f, -0.228685f, -0.218859f, + 0.208396f, -0.0173765f, -0.0680241f, -0.00538013f, -0.0674409f, + -0.092764f, 0.0295707f, -0.0462887f, -0.00636006f, 0.0334169f +}; + +static const float av1_simple_motion_search_prune_part_logits_bias_32[] = { + 0.176459f, 0.154405f, 0.281821f, 0.375264f, -0.882863f, + -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_32 = { + NUM_FEATURES_32, + NUM_LOGITS_32, + NUM_HIDDEN_LAYERS_32, + { + NUM_LAYER_0_UNITS_32, + }, + { + av1_simple_motion_search_prune_part_layer_0_kernel_32, + av1_simple_motion_search_prune_part_logits_kernel_32, + }, + { + av1_simple_motion_search_prune_part_layer_0_bias_32, + av1_simple_motion_search_prune_part_logits_bias_32, + }, +}; + +#undef NUM_HIDDEN_LAYERS_32 +#undef NUM_FEATURES_32 +#undef NUM_LAYER_0_UNITS_32 +#undef NUM_LOGITS_32 + +#define NUM_HIDDEN_LAYERS_16 1 +#define NUM_FEATURES_16 25 +#define NUM_LAYER_0_UNITS_16 32 +#define NUM_LOGITS_16 10 + +static const float av1_simple_motion_search_prune_part_logits_kernel_16[] = { + -0.520913f, 0.395611f, 0.0369091f, -0.318591f, -0.463252f, + 0.134992f, -0.43154f, -0.0739112f, -0.118817f, 0.476373f, + -0.281406f, 
0.3413f, 0.456255f, 0.33307f, 0.2942f, + 0.1317f, 0.498113f, 1.95406f, -0.165726f, -0.219306f, + -0.302656f, -1.31157f, -0.433662f, 0.151716f, -0.214817f, + 0.504523f, -0.710049f, 0.359616f, -0.412695f, -0.103193f, + 0.341912f, 0.351378f, -0.181486f, 0.573862f, -0.0396254f, + -0.17855f, -0.276163f, 0.0367465f, -0.353905f, -0.204689f, + 0.309581f, -0.0439686f, -0.147855f, 0.152745f, 0.290871f, + 0.131049f, -0.27808f, -0.142997f, 0.207843f, -1.23074f, + -0.267714f, -0.336923f, 0.313781f, -0.61488f, -0.161984f, + 0.238059f, -0.0879942f, -0.085543f, -0.260156f, -0.13614f, + -0.242196f, 0.201216f, -0.248691f, 0.0936671f, -0.350522f, + -0.35002f, -0.156583f, -0.00579001f, 0.300578f, -0.341269f, + -0.290712f, 0.354802f, -0.31629f, 0.509107f, -0.236953f, + -0.0923519f, 0.544509f, -0.280991f, -0.017437f, -0.202721f, + -0.116388f, -0.7191f, 0.324586f, 0.254249f, 0.125505f, + 0.00658697f, -0.333322f, -0.126537f, -0.140004f, -0.0241202f, + -0.172466f, 0.210035f, -0.270833f, 0.0579044f, 0.0950352f, + -0.120382f, 0.063292f, -0.394925f, 0.482165f, 0.147753f, + 0.331465f, -0.187444f, 0.1083f, 0.414028f, 0.279238f, + -0.486889f, -0.674349f, -0.313656f, -0.131186f, -0.100662f, + 0.238191f, -1.19083f, -0.30667f, -2.4324f, 0.235311f, + 0.108605f, 1.67197f, 0.476157f, 0.30055f, 0.0839538f, + 0.408469f, -0.473517f, 0.560283f, -0.0188136f, 0.273824f, + -0.43707f, -0.0346978f, -0.438315f, -0.0196275f, -0.0567921f, + -0.220166f, 0.216175f, -0.0180461f, 0.0116429f, -0.0096949f, + -0.32613f, 0.176829f, -0.243563f, -0.240972f, -0.621819f, + -0.00619648f, -0.145525f, 0.124324f, -0.0306925f, 0.172208f, + -2.04631f, -0.200087f, -0.594135f, -0.352303f, -0.309826f, + 0.0922786f, -0.698371f, -0.0366823f, 0.0244036f, 0.338775f, + -0.115947f, 0.144971f, -0.0607037f, -0.762412f, 0.0125584f, + -0.262427f, -0.0830273f, -0.291252f, -0.176059f, -0.203983f, + 0.0871455f, -0.0894925f, 0.0426263f, -0.060001f, -0.542355f, + -0.407837f, -0.0419273f, 0.226608f, -0.114844f, 0.158733f, + -0.187237f, 0.113163f, 
-1.86337f, -0.367544f, -0.547048f, + -0.24192f, -0.226764f, 0.090912f, 0.819604f, 0.433766f, + -0.841657f, 0.446987f, -0.622761f, -0.0296385f, -0.130176f, + -0.0518136f, -0.640326f, -0.330107f, -0.137832f, -0.0119033f, + 0.39401f, 0.111331f, -0.141367f, -0.230289f, 0.171054f, + -0.924059f, -0.107317f, -0.347983f, 0.0261109f, 0.423002f, + -0.305817f, 0.247696f, 0.0436002f, 0.0305862f, -1.52448f, + -0.595587f, -0.155552f, -1.11949f, -0.513937f, 0.138347f, + -0.301487f, 0.352144f, -0.615801f, 0.0326701f, -0.215322f, + -0.0608176f, -0.416557f, -0.306073f, -0.441512f, -0.0569277f, + -0.709768f, -0.602527f, -0.311134f, 0.152471f, -0.255299f, + 0.354505f, 0.194464f, 0.0144251f, 0.110732f, -0.4452f, + -0.804814f, 0.205325f, -0.0957486f, 0.502684f, 0.09112f, + -0.533087f, -1.77979f, 0.556992f, -0.176157f, -0.642633f, + 0.11553f, -0.232561f, 0.161277f, -0.0631125f, -0.20759f, + 0.489253f, -0.067533f, 0.0231024f, -0.179831f, -0.272985f, + -0.390059f, 0.3089f, 0.185733f, -0.257065f, -0.508838f, + -0.550028f, 0.0665621f, -0.138288f, -0.413188f, 0.191193f, + -1.32969f, -0.431025f, 0.270242f, -0.340062f, 0.0817257f, + 0.0376051f, -0.18633f, 0.0828274f, 0.00670051f, -0.431295f, + -0.450316f, -0.173042f, -0.322248f, 0.370628f, 0.10019f, + 0.317293f, -0.266613f, 0.0752441f, -0.425656f, -0.112223f, + 0.557991f, -0.324368f, -0.195261f, -0.0526129f, -0.807472f, + -0.387466f, 0.192186f, 0.353213f, -0.120238f, 0.107686f, + 0.200678f, -0.75363f, 0.466857f, -0.282345f, -0.0849236f, + -0.0490695f, -0.00643182f, 0.123047f, -0.207805f, -0.130456f, + -1.09455f, 0.340973f, 0.334784f, 0.0706643f, -1.65681f, + -0.319952f, -0.198514f, -0.0787972f, 0.089524f, 0.0531034f, + -0.202705f, -0.0852339f, -0.62572f, -0.0734234f, -0.838088f +}; + +static const float av1_simple_motion_search_prune_part_layer_0_bias_16[] = { + -0.0616197f, 0.939947f, 0.521161f, 0.213886f, 0.130324f, -0.127443f, + -0.0538715f, 0.708746f, 0.445031f, 0.418781f, -0.114539f, 0.521941f, + 1.13719f, 0.606545f, -0.32193f, -0.150788f, 
0.158487f, -0.224005f, + 0.654715f, 0.115729f, -0.286506f, -2.06223f, 0.0117697f, 0.503905f, + -0.102339f, 0.653256f, -0.813561f, 0.905235f, -0.417269f, -0.206265f, + 0.661496f, 0.95533f +}; + +static const float av1_simple_motion_search_prune_part_layer_0_kernel_16[] = { + -0.203489f, 0.00686229f, -0.161414f, 0.0637276f, 0.27516f, + 0.512219f, 0.164205f, 0.00326062f, -0.41914f, -0.400334f, + 0.554419f, 0.715772f, -0.295569f, -0.703503f, 0.0137744f, + -0.0934259f, 0.174234f, -0.148618f, -0.0360558f, -0.0986598f, + -0.138502f, -0.0770713f, 0.122922f, -0.00784415f, 0.0953234f, + -0.255754f, -0.310967f, 0.185306f, 0.464554f, 0.147338f, + -0.0612304f, 0.164783f, 0.301097f, 0.161364f, -0.12723f, + -0.0265984f, -0.471361f, 0.0578776f, -0.362865f, 0.425789f, + 0.402758f, -0.190235f, 0.00549738f, -0.570908f, 1.27206f, + 0.048868f, -0.0097675f, 0.0708324f, 0.0456103f, 0.0149062f, + -0.563032f, -0.420573f, 0.107278f, 0.0938258f, 0.142712f, + -0.00251036f, -0.250583f, 0.522272f, 0.0113175f, 0.126751f, + -0.433028f, -0.035542f, -0.536686f, -0.0668722f, 0.253094f, + 0.254007f, -0.435505f, 0.343001f, 0.0531542f, -0.361914f, + -0.102664f, 0.0404874f, 0.132686f, 0.0762298f, 0.0236971f, + -0.419454f, 0.230877f, -0.223714f, 0.037813f, 0.0818604f, + 0.383705f, -0.235028f, -0.0554801f, 0.429851f, 0.0845829f, + 0.166295f, 0.355111f, -0.421197f, 0.298949f, 0.0218224f, + 0.445705f, -0.392217f, -0.429578f, -0.076276f, -0.0963531f, + -0.631425f, -0.225977f, 8.06349e-06f, 0.0676679f, 0.0779651f, + 0.0706891f, 0.101377f, 0.517103f, 0.0945502f, -0.52522f, + -0.312022f, 0.0358089f, 0.616509f, -0.0507444f, -0.465814f, + -0.0326024f, 0.591298f, 0.188544f, -0.0633316f, -0.199987f, + 0.403118f, -0.511281f, -0.696263f, 0.112996f, 0.103875f, + 0.0495595f, -0.0107449f, 0.521539f, -0.0123823f, -0.0642751f, + 0.08548f, -0.0679207f, 0.526558f, 0.0651114f, -0.342643f, + -0.349934f, 0.307437f, 0.368763f, -0.194851f, -0.134117f, + 0.102448f, -0.0520666f, 0.0415824f, -0.175085f, 0.272685f, + 0.0675856f, 
0.120627f, 0.391408f, -0.135249f, -0.357024f, + 0.019666f, -0.0622677f, 0.407427f, 0.22655f, -0.129432f, + -0.165327f, 0.004893f, 0.5479f, 0.0613981f, -0.479682f, + -0.144228f, -0.130106f, 0.206458f, -0.342086f, 0.12691f, + -0.113554f, 0.231164f, -0.051419f, 0.0401286f, -0.560429f, + -0.070609f, 0.420232f, 0.442465f, -0.237501f, -0.000293732f, + -1.017f, -0.210222f, 0.0157063f, 0.0488178f, 0.0734721f, + -0.52626f, -0.276441f, -0.521579f, 0.443532f, -0.0819051f, + -0.0732633f, -0.17999f, 0.258525f, -0.0374872f, 0.150115f, + 0.0510939f, 0.168116f, 0.473372f, 0.824489f, 0.302195f, + -0.348613f, 0.238569f, 0.176444f, -0.633945f, -0.0567195f, + -0.0305827f, -0.0551851f, 0.85822f, -0.0628099f, 0.0364294f, + -0.234823f, 0.179067f, 0.143208f, -0.0511014f, -0.404191f, + 0.428035f, 0.0235506f, 0.371991f, -0.312909f, 0.550933f, + -0.389265f, -0.271813f, -0.293461f, -0.583752f, 0.179991f, + 0.191698f, 0.659094f, 1.07941f, -0.509555f, -0.100638f, + 0.079988f, -0.0519107f, -0.112723f, -0.0663326f, 0.0353569f, + -0.795055f, -0.465999f, 0.283579f, 0.340913f, 0.152738f, + 0.294664f, 0.527839f, 0.187735f, 0.359461f, 0.164629f, + 0.107512f, 0.390402f, 0.236702f, 0.114674f, -0.525655f, + -0.555476f, -0.6589f, -0.266601f, -0.0946547f, 0.6306f, + 0.0248513f, 0.038497f, 0.432706f, -0.0715465f, 0.0410172f, + -0.115313f, -0.428684f, 0.136283f, 0.0913185f, 0.11277f, + 0.0968689f, -0.00437052f, 0.0888981f, 0.10304f, 0.02442f, + -0.211315f, 0.00981596f, -0.0974827f, 0.208611f, 0.140644f, + 0.0315567f, 0.350332f, -0.291049f, -0.0715449f, -0.352992f, + -0.858004f, 0.828658f, 0.439092f, 0.0151291f, 0.0503828f, + 0.0656112f, -0.710749f, -0.0951757f, 0.193908f, 0.00908018f, + 0.141486f, -0.0657711f, 0.099791f, 0.153729f, -0.419576f, + -0.892636f, -0.0449268f, -0.170786f, -0.156564f, 0.384511f, + 0.296565f, 0.0569815f, -0.103938f, 1.27479f, -0.0406475f, + 0.154083f, -0.186442f, 0.0282588f, 0.0312102f, -0.188994f, + 0.284243f, -0.564693f, 0.425525f, -0.00924596f, 0.810003f, + 0.233812f, -0.0180273f, 
0.121082f, -0.209096f, 0.151437f, + 0.286921f, -0.348095f, 0.174813f, -0.413798f, 0.108994f, + -0.34266f, -0.0337981f, -0.459f, -0.409812f, -0.0890104f, + 0.0834802f, -0.00259191f, -0.105914f, -0.164207f, 0.0697689f, + -0.312098f, -0.00650536f, -0.486758f, -0.248486f, 0.24314f, + -0.0857144f, 0.0884781f, -0.65615f, -0.121744f, 0.0709335f, + -0.0237193f, 0.10764f, -0.0409452f, -0.0824305f, 0.42329f, + 0.138258f, 0.502607f, 0.228545f, 0.0687789f, 0.0361586f, + 0.39074f, 0.0722654f, -0.0133148f, 0.283278f, 0.0743384f, + 0.310292f, -0.297675f, -0.359935f, 0.521021f, -0.10082f, + -0.272333f, 0.0120283f, 0.138118f, -0.123711f, -0.0711386f, + 0.0170747f, 0.831039f, 0.0509626f, 0.790608f, -0.0863406f, + -0.31962f, 0.0631013f, 0.0873453f, -0.472331f, -0.0826027f, + -0.241722f, 0.148835f, -0.131611f, 0.000195347f, -0.0615804f, + -0.838663f, -0.586979f, 0.247713f, 0.362254f, 0.492727f, + -0.132163f, 0.0516545f, 0.477838f, -0.0395182f, 0.0124993f, + -0.771514f, 0.0386912f, -0.118525f, -0.346172f, -0.265905f, + -0.175257f, -0.406287f, 0.393837f, 0.409096f, -0.408501f, + -0.0207146f, 0.0487809f, 0.0636982f, 0.0276368f, 0.0878249f, + 0.0425889f, 0.0868633f, 0.17423f, -0.128217f, -0.477068f, + -0.321294f, 0.0393771f, 0.00812823f, -0.350529f, -0.129012f, + 0.439953f, 0.396662f, 0.410475f, -0.123129f, -0.565966f, + 0.0298635f, -0.614611f, -0.477514f, 0.453651f, 0.0617068f, + 0.0530563f, 0.0479074f, 0.213551f, 0.039034f, 0.0449095f, + -1.06868f, -1.2654f, -0.175482f, 0.595068f, -0.230095f, + 0.719838f, -0.272148f, 0.696564f, 0.0485396f, 0.468584f, + 0.0695439f, -0.0842122f, -0.228978f, 0.161397f, -0.000441421f, + -0.0297514f, -0.250599f, 0.196656f, 0.608423f, -0.0112096f, + 0.0236881f, -0.00167311f, 0.0040709f, 0.015495f, 0.00757698f, + -0.165886f, 0.359767f, -0.0214696f, 0.377208f, 0.0303547f, + 0.0657094f, 0.140775f, 0.21867f, -0.203922f, 0.263878f, + -0.0529099f, 0.202438f, -0.243226f, 0.156659f, -0.627056f, + -0.845036f, -0.500873f, 0.172588f, 0.402972f, -0.147734f, + 0.151792f, 
-0.075579f, 0.443519f, 0.0311335f, -0.0328222f, + -0.0299781f, 0.435956f, -0.0987376f, 0.288402f, 0.135902f, + -0.173584f, -0.186255f, 0.224524f, -0.249645f, 0.123702f, + -0.0846244f, 0.491317f, 0.544846f, 0.338677f, -0.258885f, + -0.617434f, -0.629003f, -0.347233f, 0.181262f, -0.0606015f, + -0.537766f, 0.215089f, -0.334527f, 0.0488534f, 0.0577997f, + -1.12431f, -0.932292f, -0.11559f, 0.573715f, 0.151128f, + 0.693818f, -0.16956f, 0.802591f, -0.231531f, 1.04318f, + -0.476417f, 0.293452f, -0.610136f, 0.27506f, -0.384012f, + 0.305366f, -0.0540464f, -0.337583f, -0.174285f, 0.157248f, + 0.0477345f, -0.0229535f, 0.0475766f, -0.00603319f, 0.00856119f, + -0.702893f, -0.0579673f, 0.183024f, -0.166222f, 0.109763f, + -0.148019f, -0.258873f, -0.0820157f, -0.186716f, -0.449265f, + -0.0534138f, 0.15732f, 0.46357f, 0.00502591f, -0.0282085f, + 0.152277f, -0.855199f, -0.357115f, 0.0366159f, 0.0131101f, + -0.0407758f, 0.0462835f, 0.146309f, -0.00276278f, -0.0591814f, + -0.109437f, 0.506764f, -0.044421f, 0.465907f, 0.114444f, + -0.241053f, -0.362649f, -0.432615f, 0.199989f, -0.00635866f, + -0.521886f, 0.0958924f, -0.485725f, 0.0430527f, 0.069746f, + 0.681091f, -0.288144f, 0.505671f, 0.0489065f, -0.0373836f, + 0.266079f, 0.145173f, -0.011481f, -0.225074f, -0.754501f, + -0.122939f, -0.294213f, 0.334738f, 0.281561f, 0.558977f, + -0.21551f, -0.346507f, -0.0625635f, 0.0782034f, -0.236999f, + -0.803783f, -0.601117f, 0.091192f, 0.636122f, -0.250626f, + 0.0354961f, 0.103915f, 0.508571f, 0.329911f, -0.0425999f, + -0.0867587f, -0.0385824f, 1.13914f, -0.0261992f, 0.00484478f, + 0.124603f, -0.012173f, -0.377358f, -0.243563f, 0.236094f, + 0.145663f, -0.132752f, 0.347497f, -0.529315f, 0.271632f, + -0.372805f, 0.0261836f, 0.126169f, 0.0941008f, 0.283773f, + 0.765701f, -0.226477f, -0.181549f, -0.306896f, 0.110165f, + -0.0784234f, -0.0827892f, -0.0374252f, -0.0950872f, -0.451015f, + -0.995793f, -0.452663f, 0.293338f, -0.380865f, 0.032683f, + 0.0178248f, 0.0699194f, -0.0811722f, -0.0866096f, 
0.139289f, + 0.296604f, 0.192293f, -0.0589607f, -0.179878f, 0.00360266f, + -0.0905794f, 0.136744f, -0.191555f, 1.31877f, -0.0592033f, + -0.158766f, 0.0214746f, -0.190113f, -0.116671f, 0.0449292f, + -0.109533f, -0.709307f, 0.386424f, 0.40201f, 0.262211f, + -0.155244f, 0.233988f, -0.0166317f, 0.462665f, 0.0484462f, + 0.210902f, -0.352798f, 0.38698f, -0.228261f, -0.084309f, + -0.220751f, -0.170879f, -0.352617f, -1.24277f, 0.266004f, + -0.0125749f, -0.0380073f, 0.101838f, -0.0483024f, -0.0629178f, + -0.0695577f, -0.103439f, 0.242131f, -0.0796858f, 0.349718f, + -0.332045f, 0.0138352f, -0.380235f, -0.28717f, -0.176276f, + 0.865903f, 0.36593f, 0.243925f, -0.422289f, -0.117327f, + 0.21876f, 0.245393f, -0.426134f, -0.186077f, 0.0352515f, + -0.123742f, 0.249376f, 1.3281f, 0.0707771f, 0.071415f, + -0.286827f, -0.131691f, -0.270881f, -0.434378f, 0.376064f, + 0.35966f, 0.513374f, 0.439378f, -0.222716f, -0.5874f, + 0.487997f, -0.293271f, -0.184245f, -0.037256f, 0.17723f, + -0.438651f, 0.428184f, 0.112983f, -0.449287f, -0.0451963f, + 0.0854929f, 0.0735442f, -0.0148642f, -0.0586782f, -0.176455f, + -0.438979f, -0.127109f, 0.211478f, 0.388035f, -0.0372021f, + 0.220575f, 0.382144f, 0.302121f, 0.0857121f, 0.193445f, + -0.488858f, -0.195288f, -0.316184f, -0.314026f, -0.111956f, + 0.0744768f, 0.292709f, 0.30187f, -0.285506f, -0.105006f, + 0.0851402f, -0.082318f, 0.277518f, 0.725294f, -0.756304f, + 0.0155309f, -0.378542f, 0.293377f, -0.347252f, -0.338458f, + 0.221449f, -0.176443f, -0.131972f, 0.0129163f, -0.290649f, + 0.198596f, -0.0721333f, 0.620591f, 0.568736f, 0.174001f, + -0.205186f, -0.265606f, -0.249155f, 0.299163f, 1.11842f, + 0.17423f, 0.196417f, -0.014484f, 0.0735422f, 0.26329f, + 0.12284f, -0.750305f, -0.351337f, 0.121994f, -0.00542878f, + -0.295707f, -0.094124f, 0.300993f, 0.412408f, -0.170761f, + -0.0676329f, -0.106638f, -0.419785f, -0.43878f, 0.22421f, + 0.0339903f, 0.619851f, 0.0615381f, 0.514631f, 1.35424f, + -0.0679228f, -0.203457f, 0.131948f, -0.0041251f, -0.209054f +}; 
+ +static const float av1_simple_motion_search_prune_part_logits_bias_16[] = { + 0.304025f, 0.131887f, 0.259279f, -0.561564f, -0.161729f, + -0.208036f, 0.102206f, -0.162937f, -1.42311f, -0.708305f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_16 = { + NUM_FEATURES_16, + NUM_LOGITS_16, + NUM_HIDDEN_LAYERS_16, + { + NUM_LAYER_0_UNITS_16, + }, + { + av1_simple_motion_search_prune_part_layer_0_kernel_16, + av1_simple_motion_search_prune_part_logits_kernel_16, + }, + { + av1_simple_motion_search_prune_part_layer_0_bias_16, + av1_simple_motion_search_prune_part_logits_bias_16, + }, +}; + +#undef NUM_HIDDEN_LAYERS_16 +#undef NUM_FEATURES_16 +#undef NUM_LAYER_0_UNITS_16 +#undef NUM_LOGITS_16 + +#define NUM_HIDDEN_LAYERS_8 1 +#define NUM_FEATURES_8 25 +#define NUM_LAYER_0_UNITS_8 32 +#define NUM_LOGITS_8 4 + +static const float av1_simple_motion_search_prune_part_logits_kernel_8[] = { + -0.266303f, -0.387676f, 0.204501f, -0.120842f, -0.0752326f, 0.0337739f, + 0.0243477f, -0.356748f, 0.0143051f, -0.16403f, -0.139013f, 0.175003f, + -0.206754f, 0.349059f, 0.181763f, 0.212768f, -0.313783f, 0.182829f, + 0.00205376f, -0.939525f, -0.0992424f, 0.306254f, 0.083329f, -0.133137f, + -0.179022f, -0.0237902f, 0.0601026f, -0.216698f, -0.551149f, 0.081711f, + -0.442191f, 0.0680832f, -0.0353678f, 0.237704f, 0.23155f, -0.36097f, + 0.123389f, -0.288927f, 0.178133f, -0.152222f, -0.235648f, -0.0495293f, + -0.316522f, 0.034207f, 0.0463139f, -0.817825f, 0.417443f, -0.110984f, + -0.402371f, 0.0341694f, -0.37383f, 0.414532f, 0.093993f, 0.0039505f, + 0.0803175f, -0.511859f, -0.0154802f, 0.0979595f, 0.0909049f, -0.120938f, + -0.577382f, -0.155041f, -0.404295f, 0.122223f, -0.084703f, 0.00415336f, + 0.149135f, 0.113219f, 0.124236f, -0.240905f, 0.163909f, -0.154202f, + -0.208917f, 0.00200158f, -0.71796f, 0.105984f, -0.131996f, -0.539603f, + 0.223768f, -0.0710733f, -0.346679f, -0.0745909f, 0.171032f, 0.215701f, + 0.218519f, 0.105981f, -0.096209f, -0.166453f, -0.468894f, 
-0.401578f, + -0.239222f, 0.111382f, 0.38747f, -0.164734f, -0.175955f, 0.336621f, + -0.0305501f, -0.0576765f, 0.0672671f, -0.183692f, 0.412082f, -0.262951f, + -0.153429f, -0.128589f, -0.530472f, 0.0936412f, -1.08296f, -0.45147f, + 0.0714904f, -3.96842f, 0.438125f, -0.313945f, 0.231104f, -0.00183851f, + -0.0192768f, -0.637531f, -0.109296f, 0.0531702f, 0.00262162f, -0.615951f, + -0.546241f, -0.635305f, -0.0762367f, 0.0122019f, 0.423693f, -0.129142f, + -0.112242f, 0.295184f +}; + +static const float av1_simple_motion_search_prune_part_layer_0_bias_8[] = { + -2.16023f, -3.12831f, -0.213206f, -2.97875f, -1.83791f, -2.84713f, + -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f, + -0.853224f, -3.29503f, -0.537517f, 0.923106f, -3.18665f, -1.29905f, + 1.64506f, -1.99848f, -2.24315f, 0.408613f, 0.503671f, -3.83393f, + -2.88388f, -3.52337f, 1.46818f, -1.67169f, -3.83253f, 1.52644f, + -0.490783f, -0.415782f +}; + +static const float av1_simple_motion_search_prune_part_layer_0_kernel_8[] = { + -0.702198f, -0.102148f, 0.0564545f, -0.0555548f, 0.16184f, + 0.0950792f, 0.136974f, -0.00824146f, 0.05746f, 0.0447542f, + 0.145978f, 0.0855769f, -0.041449f, 0.301347f, -0.0206691f, + -0.0662514f, -0.0525079f, -0.0998387f, -0.0891438f, 0.110545f, + -0.863098f, -1.83798f, 0.238818f, 0.127797f, 0.116872f, + -0.270655f, -0.21057f, 0.197013f, -0.123332f, 0.137104f, + -0.174766f, -0.00803025f, 0.0234369f, -0.0894175f, -0.0380927f, + 0.00827928f, -0.134148f, 0.110575f, -0.250173f, 0.116273f, + 0.0197749f, 0.270391f, 0.108437f, 0.173197f, -0.0650348f, + 0.0884626f, 0.262792f, 0.0649228f, 0.5573f, -2.81315f, + -0.479801f, -1.15825f, 0.0807932f, -0.19144f, 0.404016f, + -0.211521f, 0.233269f, -0.391414f, 0.160381f, -0.277233f, + 0.426354f, 0.156839f, 0.494315f, -0.214259f, -0.0132062f, + 0.148628f, -0.0899568f, 0.161845f, 0.467689f, 0.229474f, + 0.590634f, -0.705793f, -0.0486113f, -0.439088f, 0.994566f, + 0.679065f, 0.777869f, -0.225291f, -0.0303006f, -0.638782f, + -0.0824632f, 
-0.128561f, -0.327603f, 0.105624f, 0.567581f, + -0.396135f, -0.471028f, 0.181286f, 0.274604f, 0.180169f, + 0.0612144f, -0.865004f, 0.0306804f, 0.142985f, -0.0914358f, + -0.243284f, 0.358359f, -0.443847f, -0.371978f, 0.606933f, + -0.900408f, -0.52076f, 0.472118f, 0.0610973f, 0.152526f, + -0.550379f, 0.309331f, -0.141573f, 0.203046f, -0.231485f, + 0.505156f, 0.393224f, 0.435487f, -0.218681f, 0.123707f, + -0.270383f, -0.033565f, 0.210373f, -2.33967f, 0.367434f, + 0.0308118f, -0.205771f, 0.546141f, 0.19837f, 0.035648f, + -0.467007f, -1.50995f, -0.0314176f, 0.11762f, -0.15307f, + 0.618257f, -0.139502f, 0.303386f, -0.00758681f, 0.228107f, + -0.594499f, -0.201984f, -0.239666f, 0.114878f, -0.922174f, + -0.530137f, -0.379366f, -0.319582f, 0.0889624f, -0.00544663f, + 0.316264f, -0.204262f, -0.0959358f, 0.23552f, 0.141369f, + -0.207129f, -1.04067f, -0.0780501f, 0.226768f, -0.246752f, + 0.0823105f, 0.114783f, 0.49315f, 0.0197732f, 0.705433f, + 0.158076f, -0.250584f, -0.157326f, -0.0439547f, -0.139047f, + 0.090531f, -0.38833f, 0.743143f, -1.47418f, -0.155009f, + 0.511466f, -0.726716f, -0.181075f, 0.450133f, -0.390204f, + 0.292725f, 0.00811462f, -0.347738f, 0.613381f, -0.237124f, + 0.750748f, -0.383123f, 0.410309f, -0.204166f, 0.667199f, + -0.313197f, 0.436059f, -0.607571f, 0.193681f, 0.409399f, + 0.631747f, -0.0454149f, 0.198232f, 0.345591f, -0.0137374f, + -0.307014f, -0.535515f, 0.764678f, -0.225686f, -0.451621f, + -2.75564f, -1.52877f, 0.0511933f, 0.905979f, 0.145029f, + 0.759615f, 0.130166f, 0.83827f, 0.0655081f, 1.07555f, + -0.529777f, 0.682967f, -0.412052f, 0.611947f, -0.83676f, + 0.940695f, -0.465681f, 0.51505f, -0.883659f, -0.105524f, + -0.0344173f, -0.0683618f, -0.00698688f, -0.139349f, 0.135741f, + -0.294455f, -0.377834f, -0.602084f, -1.00128f, 0.483291f, + 1.25327f, 0.178987f, 0.75068f, -0.520731f, -0.325517f, + 0.272032f, 0.144144f, -0.279453f, 0.564907f, 0.144036f, + 0.297448f, -0.504243f, -0.250508f, -1.26395f, 0.4816f, + 0.392771f, -0.389961f, -0.261585f, 
-0.127124f, -0.202945f, + -0.709716f, -0.174719f, 0.113613f, 0.477753f, -0.226659f, + 0.0697828f, -0.177994f, 0.300726f, -0.185504f, 0.339424f, + -0.316746f, 0.369693f, -0.339723f, -0.143886f, -0.0326589f, + -0.268761f, -0.241094f, 0.284876f, -0.0270867f, -0.207397f, + -1.42738f, 0.495612f, -0.0277732f, 0.199675f, 1.48638f, + -0.659257f, -1.28199f, 0.498702f, 0.140695f, 0.571152f, + 0.416368f, 0.14153f, 0.126876f, 0.521114f, -0.00150571f, + 0.375581f, 0.00537624f, 0.1286f, -0.332227f, 0.417663f, + -0.539023f, 0.217124f, -0.787111f, -0.0335266f, 1.56751f, + 0.0640563f, -0.158791f, 0.118195f, 0.000970493f, -0.0403852f, + -0.0572557f, -0.0201181f, -0.10255f, 0.63237f, 0.156662f, + 0.418696f, -0.274802f, -0.663923f, -0.375232f, -0.40846f, + 0.462092f, 1.2176f, -0.301532f, -0.779704f, -0.112876f, + 0.0806591f, -0.0141923f, 0.00960801f, -0.663557f, 0.0979948f, + -0.0575999f, -0.012847f, 0.0403853f, -0.133666f, -0.00330217f, + -0.931518f, -0.774599f, -0.21391f, 0.377601f, -0.183365f, + 0.299094f, 0.0238552f, 0.206716f, -0.18959f, 0.346013f, + -0.150991f, -0.192817f, -0.293962f, -0.0537604f, -0.0648171f, + -0.275941f, -0.144854f, -0.224092f, 2.43113f, 0.0422494f, + -0.047236f, -0.0262028f, 0.0282119f, -0.175553f, 0.0888502f, + 0.580682f, 0.951055f, -0.284441f, -0.120133f, -0.268058f, + -0.312083f, -0.411556f, 0.21431f, -0.28033f, 0.324851f, + -1.02787f, -0.936816f, -0.577628f, 0.544743f, 0.295807f, + 0.406157f, 0.447927f, 0.25369f, -0.811421f, -0.0424979f, + -0.189867f, 0.00778673f, -0.113587f, -0.116175f, -0.0542222f, + -1.80089f, -1.44175f, -0.35332f, 0.191314f, -0.236691f, + -0.0261926f, -0.502363f, 0.252278f, -0.485478f, 0.296495f, + 0.455612f, -0.0489631f, 0.227255f, 0.170975f, 0.473487f, + 0.257812f, 0.178048f, 0.2506f, 2.04637f, -0.173857f, + 0.0583379f, 0.00765589f, -0.025772f, -0.162666f, -0.016214f, + -0.607486f, -0.0808025f, 0.0551611f, -0.0772291f, 0.126421f, + 0.10869f, -0.0877463f, -0.111527f, -0.0775766f, 0.503886f, + -0.002757f, -0.0421354f, -0.247857f, 
0.140827f, 0.383576f, + 0.228232f, -0.157877f, -0.0927911f, 0.344687f, 0.191181f, + 0.236533f, 0.00102869f, -0.0184502f, -1.4509f, -1.15945f, + -0.521978f, -0.643225f, 0.133139f, 0.0660321f, 0.0851957f, + 0.0303648f, 0.0296239f, 0.0455713f, 0.175647f, 0.080532f, + 0.0445691f, -0.257356f, -0.125602f, -0.138829f, -0.167057f, + -0.0992552f, -0.13944f, 0.507531f, 0.444997f, 0.221452f, + -0.308384f, -0.327554f, 0.13235f, 2.1487f, -1.15453f, + -0.280239f, -0.363582f, -0.00358745f, 0.012866f, 0.251088f, + 0.0676416f, 0.178492f, -0.136631f, 0.197938f, -0.078198f, + 0.812439f, 1.1173f, 0.712113f, 1.10124f, -0.836503f, + -1.22433f, -1.07894f, -1.29215f, 0.56057f, 2.23928f, + -0.419029f, 0.282178f, -0.0719266f, -0.172192f, 0.28034f, + -2.99124f, -2.01481f, 0.0688982f, 0.697466f, 0.00635555f, + 0.566069f, 0.047534f, 0.507755f, -0.00690707f, 0.712594f, + -0.191467f, 0.355733f, -0.480016f, 0.664669f, -0.390619f, + 0.351199f, -0.482342f, 0.325005f, 1.9089f, 0.155987f, + 0.17032f, 0.132729f, 0.0402649f, 0.146991f, 0.0314905f, + -0.775316f, -0.208892f, -0.105993f, 0.0181653f, -0.12735f, + 0.0897852f, 0.0470231f, 0.25807f, 0.127406f, -0.0893252f, + -0.279776f, 0.190844f, 0.110384f, -0.148833f, 0.025293f, + 0.239838f, 0.00932245f, 0.35103f, -0.128268f, -0.0536754f, + 0.506899f, -0.16793f, 0.0955582f, -2.01108f, 0.721433f, + -2.31413f, -2.08646f, 0.033315f, 0.689828f, -0.271213f, + 0.790425f, -0.114234f, 0.755325f, -0.211533f, 0.774544f, + -0.263268f, 0.795762f, -0.551455f, 0.953602f, -0.168454f, + 0.529055f, -0.768991f, 0.882371f, 0.29763f, -0.155017f, + 0.00464101f, 0.121093f, 0.948271f, 0.113138f, -0.110332f, + -2.0492f, -1.31322f, -0.129212f, 0.464778f, -0.181465f, + 0.618403f, 0.0627984f, 0.465228f, 0.165729f, 0.278277f, + -0.563276f, -0.358358f, -0.590638f, 0.0104993f, 0.731206f, + 0.752569f, 0.631615f, 0.811822f, 0.129804f, -0.0558327f, + 0.570081f, -0.417922f, -0.168275f, 0.0703671f, 0.269127f, + 0.240457f, -0.197159f, -0.00179261f, 0.220065f, 0.463511f, + 0.0714626f, 
-0.716477f, -0.441865f, -0.717028f, -0.149176f, + 0.452182f, 0.662699f, -0.906534f, -0.817133f, 0.237747f, + 0.26024f, -7.7441e-05f, 0.0934616f, 0.824641f, -0.0404494f, + -0.088297f, -0.157899f, 0.037408f, 0.132435f, -0.316155f, + -0.276785f, 0.0117868f, 0.185008f, 0.32369f, -0.465855f, + -0.302127f, 0.303289f, 0.338597f, -0.665408f, -0.507594f, + 0.526979f, 0.532091f, 0.234395f, 0.754063f, 0.116769f, + 0.0800309f, -0.939344f, -1.51269f, 1.4583f, 0.178444f, + 0.0106756f, -0.213468f, -0.00369439f, 0.071015f, -0.192798f, + -0.0933147f, -0.129901f, -0.368279f, -0.246564f, 0.126966f, + 0.478565f, -0.476246f, -0.762863f, 0.168883f, 0.536136f, + -0.272969f, 0.2573f, -0.161577f, 0.311428f, -0.777994f, + -1.29752f, 0.216046f, 0.329016f, 1.57265f, 0.168075f, + -0.192518f, 0.0829308f, -0.073533f, -0.0202034f, 0.114716f, + -0.34888f, -0.519215f, 0.190809f, 0.0138507f, 0.133635f, + 0.14194f, 0.410618f, -0.165106f, 0.214438f, 0.0438265f, + -0.8481f, -1.19182f, -1.07878f, -0.882217f, 0.45616f, + 0.977385f, 0.74929f, 0.918466f, 0.904704f, 0.041938f, + 0.0362776f, 0.0757255f, 1.14007f, 0.0516825f, -0.160068f, + 0.219535f, 0.638634f, -0.0284544f, -0.222849f, -0.0344915f, + -0.0350256f, -0.0504452f, -0.0458416f, 0.146099f, 0.0783083f, + 0.206579f, 0.241264f, 0.28401f, 0.0425312f, -0.802049f, + -0.746271f, -0.578969f, -0.078218f, 0.436176f, -0.281465f, + -2.5539f, 0.237868f, -0.121796f, 0.0715619f, 0.106992f, + -0.621862f, -0.167142f, 0.153716f, 0.0570912f, -0.06525f, + -0.923773f, 0.130759f, 0.0517066f, 0.0729862f, -0.873064f, + 0.0403328f, -0.186499f, -0.0831918f, -0.223723f, 0.144697f, + 0.212845f, 0.416876f, 0.361598f, 0.138229f, 0.0728777f, + -1.95419f, -0.00382816f, -0.0440387f, 0.433627f, 0.44781f, + -1.05229f, -1.54506f, 0.564827f, -0.263456f, 0.296105f, + -0.158055f, 0.388274f, -0.366639f, 0.212006f, -0.245619f, + 0.593064f, 0.088727f, 0.410632f, -0.263462f, 0.507075f, + -0.0974155f, 0.275268f, -0.1293f, 0.136679f, 1.98276f, + 0.411766f, 0.391987f, 0.34283f, -0.114077f, 
0.258462f, + -0.302443f, 0.301138f, -0.00726621f, 0.276441f, -0.291582f, + 0.66498f, -0.321451f, -0.332805f, 0.0943272f, 0.572253f, + -0.45818f, -0.0219593f, -0.151679f, 0.402033f, -1.15502f, + -0.882955f, 0.772904f, 0.88126f, -0.149555f, 0.709525f, + 0.350116f, -0.21531f, 0.797893f, 0.0230234f, 0.0203034f, + 0.2744f, 1.08273f, 0.039349f, 0.503909f, -0.45892f, + -0.579516f, -0.344058f, 0.390628f, -0.386941f, -0.430317f, + -0.0807066f, 0.435906f, 0.522996f, 0.724476f, -0.74371f, + -0.05376f, -0.340898f, -0.962646f, -0.0278005f, 0.0981149f, + -0.0811161f, 0.00237994f, 0.850042f, 0.0665473f, 0.134413f +}; + +static const float av1_simple_motion_search_prune_part_logits_bias_8[] = { + 1.63404f, -0.715866f, -1.0132f, -2.08745f +}; + +static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_8 = { NUM_FEATURES_8, NUM_LOGITS_8, NUM_HIDDEN_LAYERS_8, @@ -2883,22 +3967,839 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_8 = { NUM_LAYER_0_UNITS_8, }, { - full_pixel_motion_search_based_split_layer_0_kernel_8, - full_pixel_motion_search_based_split_logits_kernel_8, + av1_simple_motion_search_prune_part_layer_0_kernel_8, + av1_simple_motion_search_prune_part_logits_kernel_8, }, { - full_pixel_motion_search_based_split_layer_0_bias_8, - full_pixel_motion_search_based_split_logits_bias_8, + av1_simple_motion_search_prune_part_layer_0_bias_8, + av1_simple_motion_search_prune_part_logits_bias_8, }, }; -static const float full_pixel_motion_search_based_split_thresh_8 = 2.0f; +#undef NUM_HIDDEN_LAYERS_8 +#undef NUM_FEATURES_8 +#undef NUM_LAYER_0_UNITS_8 +#undef NUM_LOGITS_8 + +#define FEATURE_SIZE 19 +static const float av1_2pass_split_partition_weights_128[FEATURE_SIZE + 1] = { + 2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f, + 0.125296f, -1.134961f, 0.862757f, -0.418799f, -0.637666f, + 0.016232f, 0.345013f, 0.018823f, -0.393394f, -1.130700f, + 0.695357f, 0.112569f, -0.341975f, -0.513882f, 5.7488966f, +}; + +static const float 
av1_2pass_split_partition_weights_64[FEATURE_SIZE + 1] = { + 2.990993f, 0.423273f, -0.926544f, 0.454646f, -0.292698f, + -1.311632f, -0.284432f, 0.717141f, -0.419257f, -0.574760f, + -0.674444f, 0.669047f, -0.374255f, 0.380624f, -0.804036f, + 0.264021f, 0.004163f, 1.896802f, 0.924287f, 0.13490619f, +}; + +static const float av1_2pass_split_partition_weights_32[FEATURE_SIZE + 1] = { + 2.795181f, -0.136943f, -0.924842f, 0.405330f, -0.463505f, + -0.584076f, -0.831472f, 0.382985f, -0.597544f, -0.138915f, + -1.354350f, 0.466035f, -0.553961f, 0.213202f, -1.166429f, + 0.010776f, -0.096236f, 2.335084f, 1.699857f, -0.58178353f, +}; + +static const float av1_2pass_split_partition_weights_16[FEATURE_SIZE + 1] = { + 1.987888f, -0.431100f, -1.687703f, 0.262602f, -0.425298f, + -0.463870f, -1.493457f, 0.470917f, -0.528457f, -0.087700f, + -1.815092f, 0.152883f, -0.337908f, 0.093679f, -1.548267f, + -0.042387f, -0.000861f, 2.556746f, 1.619192f, 0.03643292f, +}; + +static const float av1_2pass_split_partition_weights_8[FEATURE_SIZE + 1] = { + 2.188344f, -0.817528f, -2.119219f, 0.000000f, -0.348167f, + -0.658074f, -1.960362f, 0.000000f, -0.403080f, 0.282699f, + -2.061088f, 0.000000f, -0.431919f, -0.127960f, -1.099550f, + 0.000000f, 0.121622f, 2.017455f, 2.058228f, -0.15475988f, +}; + +static const float av1_2pass_none_partition_weights_128[FEATURE_SIZE + 1] = { + -1.006689f, 0.777908f, 4.461072f, -0.395782f, -0.014610f, + -0.853863f, 0.729997f, -0.420477f, 0.282429f, -1.194595f, + 3.181220f, -0.511416f, 0.117084f, -1.149348f, 1.507990f, + -0.477212f, 0.202963f, -1.469581f, 0.624461f, -0.89081228f, +}; + +static const float av1_2pass_none_partition_weights_64[FEATURE_SIZE + 1] = { + -1.241117f, 0.844878f, 5.638803f, -0.489780f, -0.108796f, + -4.576821f, 1.540624f, -0.477519f, 0.227791f, -1.443968f, + 1.586911f, -0.505125f, 0.140764f, -0.464194f, 1.466658f, + -0.641166f, 0.195412f, 1.427905f, 2.080007f, -1.98272777f, +}; + +static const float 
av1_2pass_none_partition_weights_32[FEATURE_SIZE + 1] = { + -2.130825f, 0.476023f, 5.907343f, -0.516002f, -0.097471f, + -2.662754f, 0.614858f, -0.576728f, 0.085261f, -0.031901f, + 0.727842f, -0.600034f, 0.079326f, 0.324328f, 0.504502f, + -0.547105f, -0.037670f, 0.304995f, 0.369018f, -2.66299987f, +}; + +static const float av1_2pass_none_partition_weights_16[FEATURE_SIZE + 1] = { + -1.626410f, 0.872047f, 5.414965f, -0.554781f, -0.084514f, + -3.020550f, 0.467632f, -0.382280f, 0.199568f, 0.426220f, + 0.829426f, -0.467100f, 0.153098f, 0.662994f, 0.327545f, + -0.560106f, -0.141610f, 0.403372f, 0.523991f, -3.02891231f, +}; + +static const float av1_2pass_none_partition_weights_8[FEATURE_SIZE + 1] = { + -1.463349f, 0.375376f, 4.751430f, 0.000000f, -0.184451f, + -1.655447f, 0.443214f, 0.000000f, 0.127961f, 0.152435f, + 0.083288f, 0.000000f, 0.143105f, 0.438012f, 0.073238f, + 0.000000f, -0.278137f, 0.186134f, 0.073737f, -1.6494962f, +}; +#undef FEATURE_SIZE + +// nn model for predicting max square partition level of a superblock +#define NUM_HIDDEN_LAYERS 1 +#define NUM_FEATURES 13 +#define NUM_LAYER_0_UNITS 48 +#define NUM_LOGITS 4 + +static const float av1_max_part_pred_logits_kernel[] = { + -0.304561f, 0.0885596f, -0.988539f, 1.08147f, 0.215213f, + 0.202965f, -0.828457f, -0.233945f, -0.0866977f, -0.115521f, + 0.02079f, 0.196491f, -0.0285075f, 0.05067f, -0.00872862f, + 0.00281844f, -0.238954f, 0.0253801f, 0.0257775f, 0.339269f, + 0.176174f, -0.152545f, -0.0588704f, -1.62275f, -0.189329f, + 0.0808033f, 0.233844f, -4.53798f, 0.674968f, -0.0361688f, + -0.0754075f, 1.16129f, -0.0188879f, 0.113255f, -3.04378f, + 0.814728f, -0.568517f, -0.00179383f, -3.61223f, -1.67535f, + -2.20417f, -0.197196f, 0.0507745f, -0.0909394f, -0.0507879f, + -1.27999f, -0.055623f, 0.0318497f, 0.192867f, 0.138726f, + 0.0443392f, -0.595075f, -0.166774f, 0.0882958f, -0.348161f, + 0.0214428f, -0.0599275f, -0.0995385f, -0.82358f, 0.141205f, + -0.053232f, 0.00508296f, -1.90872f, 1.15004f, -0.194219f, + 
0.0229019f, -0.00354318f, 0.22016f, 0.154101f, -0.159231f, + -0.0446647f, -0.197503f, 0.0408453f, 0.197659f, 0.797858f, + -0.189722f, 0.343653f, 0.124666f, -1.03083f, 0.603059f, + 0.101565f, 0.0932993f, 0.462484f, 0.295984f, 1.11198f, + 0.143709f, -0.846232f, -0.464392f, -1.06058f, -0.124889f, + 0.0727475f, 1.18446f, -0.100302f, 0.0641918f, -0.101622f, + 0.10219f, 0.130189f, 0.0915623f, -0.166904f, -1.10606f, + -0.16726f, -0.146152f, 0.145443f, -0.177091f, -0.0215214f, + 0.0158506f, -0.553294f, 0.0784749f, -0.0416628f, -0.027785f, + 0.280027f, 0.484898f, -0.164225f, 0.0238317f, -0.0345254f, + 0.0410244f, 0.131529f, 0.0239622f, -0.0749436f, -0.0224914f, + 0.128926f, 0.224539f, 0.413297f, 0.0638572f, 0.103308f, + 0.0913242f, -0.119274f, 0.0163103f, 0.113828f, 0.119809f, + 0.297057f, -0.124889f, -0.533108f, -0.181408f, -0.129896f, + 0.0221064f, -0.0773281f, -0.0386467f, 0.0342961f, 0.126575f, + -0.24114f, 0.0735576f, 0.0524791f, 0.246896f, -0.130674f, + -0.03979f, 0.173639f, 1.95193f, -0.113029f, -0.0305852f, + -0.00671737f, 0.157159f, -0.00102858f, -0.543688f, 0.566772f, + 0.124124f, -0.0294064f, -0.0699021f, -0.0704103f, -0.766097f, + -0.0625802f, -0.0906173f, -0.0520414f, -0.0272724f, 0.283064f, + 0.236213f, -0.127319f, 0.019392f, 0.170042f, -0.0214542f, + 0.0740938f, 0.356578f, -0.236257f, 0.269021f, 0.114759f, + -0.641166f, 0.136308f, -0.0386959f, -0.112024f, -0.361209f, + 0.686095f, 0.183906f, 0.288656f, 0.182007f, 0.337458f, + 0.058974f, -0.305512f, -0.841708f, -0.243779f, -0.0614058f, + 0.208747f, 0.448697f +}; + +static const float av1_max_part_pred_layer_0_bias[] = { + -0.776544f, -2.0022f, -0.330294f, 2.47665f, 1.90206f, -1.61571f, + 0.536246f, 1.00455f, 5.24561f, 1.55111f, -0.816399f, -4.88703f, + -1.06417f, -1.15359f, -0.145289f, 1.91831f, 0.630915f, -1.94256f, + -3.35239f, -1.05007f, -1.05186f, 1.36824f, -5.2878f, 1.10482f, + -5.00077f, -0.0445198f, 3.41427f, 2.3439f, -0.413306f, -1.88152f, + -2.28638f, 8.24783f, -1.91961f, -1.49324f, 1.96599f, 
-6.32309f, + -0.332426f, -0.425506f, 4.06511f, 5.84386f, 4.15747f, 1.22402f, + 2.8512f, 2.53027f, 0.0170272f, -1.43966f, -0.997785f, 5.43064f +}; + +static const float av1_max_part_pred_logits_bias[] = { -4.25432f, 0.144758f, + 1.96217f, 0.728905f }; + +static const float av1_max_part_pred_layer_0_kernel[] = { + 0.992471f, 0.533006f, 0.143743f, -2.51788f, -0.468337f, + -0.201376f, -0.151834f, 0.479883f, 1.16061f, -0.278878f, + -0.814954f, -0.152405f, -0.0521608f, 0.797104f, -2.08912f, + 0.385839f, -2.22889f, -0.106858f, -0.239766f, -0.951128f, + -0.698753f, 0.0831051f, 1.1702f, 0.342834f, -0.0352795f, + -0.0847639f, -0.802086f, 0.258982f, 1.14174f, 0.645885f, + -1.19226f, -0.592888f, -0.343659f, 1.1912f, 1.45411f, + -1.22927f, 0.152858f, 0.00373585f, -1.60637f, 0.592611f, + 0.0857475f, -0.346147f, -0.150784f, -0.0817408f, -0.189918f, + -0.804952f, -1.33036f, -1.03307f, 0.0248769f, 0.16607f, + -2.896f, -2.1293f, 0.12293f, -0.173179f, -0.212128f, + -6.76221f, 0.033188f, 0.0231787f, 0.905957f, 0.0551327f, + -0.356276f, 0.0181795f, 0.0977523f, -0.0352873f, -0.0396386f, + 2.3241f, 0.0632874f, -0.11804f, -6.32521f, 0.0224659f, + -0.00188896f, 0.267992f, 0.272337f, 0.00936963f, 0.659969f, + -2.25707f, -0.0278229f, -0.0185089f, -1.14466f, 0.104827f, + 0.0435885f, 0.558586f, -0.00697004f, 0.0312611f, 0.540574f, + -0.568625f, 0.218608f, 0.378911f, -0.0289192f, -0.0734742f, + -1.08782f, -2.42069f, -0.0127239f, 0.0493651f, -1.15837f, + 0.261831f, 0.401824f, -1.04545f, 0.284173f, 0.784972f, + -0.511243f, -0.982599f, -0.106134f, -0.325964f, -1.44107f, + -1.42434f, -1.02402f, -1.52034f, 0.0737116f, 0.0462242f, + 0.628722f, -1.0405f, -0.113718f, 2.20573f, -4.33951f, + -0.0192695f, -0.0229314f, -1.89156f, 0.645942f, 0.375708f, + -1.97447f, -0.267014f, 0.0989443f, -0.450534f, -1.01737f, + -0.642416f, -0.0897288f, -2.08724f, -0.190965f, -0.279135f, + -0.830178f, 0.808754f, -0.139091f, 1.11004f, -0.454439f, + -0.479238f, -1.44001f, 0.0888059f, 0.885689f, -0.642505f, + -0.00773651f, 
-0.0265721f, -0.906346f, 1.68504f, 0.084257f, + -0.951101f, -8.06495f, 0.19231f, 0.16389f, -0.193678f, + 0.729837f, -1.98392f, -5.98513f, 3.32638f, -0.0658378f, + -0.0910426f, -0.666567f, -0.315339f, 0.123124f, -2.66375f, + -0.714852f, -0.136176f, -0.460166f, -0.567551f, -1.06193f, + -1.21389f, -0.83865f, 0.00280695f, -0.199519f, -0.534704f, + 0.419311f, -0.149008f, -3.68707f, 0.00285113f, -0.0718198f, + -1.41026f, -1.34155f, -0.538687f, -0.623666f, -2.56462f, + -0.0183333f, -0.323532f, -1.27141f, -0.0212039f, 0.198633f, + 0.459554f, -4.65103f, -1.01293f, -1.39512f, -0.289026f, + 0.208724f, -0.665226f, 1.13369f, -1.96734f, -1.45442f, + -3.46172f, 0.810681f, -0.603973f, 0.842764f, -3.90371f, + -0.394561f, -3.61363f, -2.88085f, 0.031645f, -0.23125f, + -2.63898f, -1.35314f, -0.46726f, 1.33145f, 1.20269f, + 1.38682f, -0.331637f, 0.069021f, 0.149523f, -1.24957f, + -0.878857f, -0.200368f, 0.465744f, 1.01365f, -0.0122221f, + -0.550586f, -1.12581f, -0.422132f, -0.0744868f, -2.4804f, + -1.07072f, -0.479006f, 0.101817f, -0.118947f, 0.341576f, + -1.0538f, -0.812346f, -1.13727f, -0.00939806f, 10.1571f, + -0.0441302f, 0.00280407f, -21.5044f, 0.0181152f, -0.0143246f, + 3.23462f, -1.38624f, -1.80416f, 4.89763f, -2.67364f, + 2.31771e-05f, 0.000393989f, 0.352204f, -0.193455f, 0.531455f, + 0.488757f, -0.442555f, -0.518528f, 0.431482f, -2.67727f, + -2.00626f, -0.39729f, -0.221494f, -0.0188888f, -0.0377649f, + -1.80169f, 0.0810332f, -0.0408335f, -1.28675f, -0.0353824f, + -0.666723f, -1.07281f, 0.252912f, -1.24547f, -1.7831f, + -1.14354f, -0.137662f, 0.00230182f, 0.736862f, 0.175872f, + -0.187556f, 0.43963f, -0.796524f, 0.056219f, -0.387874f, + 0.0710224f, -0.16548f, -0.100993f, 0.931481f, -3.20738f, + -0.0197576f, 0.266148f, -0.173909f, -0.337795f, -0.0682381f, + 0.176844f, 0.140286f, 1.12033f, 0.429064f, -2.24192f, + -1.54682f, 2.23646f, -0.0371138f, -0.0475339f, -3.21766f, + 0.0412858f, 0.387811f, 6.6711f, 0.140649f, 0.0559547f, + -0.802839f, 0.599977f, 0.64552f, -2.08103f, 
-0.503401f, + -0.0407036f, -0.0299199f, 0.0849445f, -0.111657f, -1.63462f, + 3.33762f, 0.0441394f, 0.0466889f, -0.951806f, 0.0723954f, + 0.00348661f, -1.36903f, 2.24625f, -0.0348915f, -0.0508893f, + -0.240891f, -0.120143f, -0.17991f, -2.09137f, 0.0150871f, + 0.0480333f, 1.72012f, 0.0309551f, -0.0370507f, -0.377075f, + 0.103916f, -0.0169255f, -0.0145395f, -4.02144f, 0.83193f, + -0.316502f, 6.3832f, -1.70038f, -1.97215f, -1.94501f, + 1.45479f, 0.711725f, -0.348496f, -0.279056f, -1.13396f, + -1.51744f, -0.853307f, 1.53131f, -0.0032358f, 1.41808f, + -1.32989f, -0.245221f, -0.161614f, -0.500845f, -0.449252f, + 0.0724151f, -0.116333f, -0.0946182f, -2.0945f, 0.0564572f, + 0.393261f, -1.06861f, -0.111458f, -0.839943f, -0.0880348f, + 0.0365742f, 0.415339f, -1.57494f, -0.713697f, 1.02349f, + -0.221371f, -0.0446281f, 1.89223f, -0.0811754f, -0.402773f, + -0.930987f, 0.0243194f, 0.0678332f, -0.0233014f, 0.165372f, + -0.44083f, -1.2404f, 0.35675f, -0.040916f, -0.0512548f, + -2.9071f, 0.861174f, -0.778133f, 2.14436f, -0.688427f, + -0.480371f, -1.69032f, 0.706687f, -0.281982f, -2.30451f, + 1.61541f, -0.0213638f, -0.740509f, -0.266677f, 0.0268434f, + -0.0116908f, -3.17595f, 0.0114825f, 0.0196997f, -0.144005f, + 0.0550181f, -0.851459f, -0.000285073f, -0.538441f, -0.0254868f, + -0.0104454f, -0.0661998f, -0.196469f, -0.346372f, -5.52892f, + -0.643683f, -0.622224f, -0.31463f, -0.555956f, -0.520132f, + -0.843166f, -2.59479f, -0.750195f, 0.00635995f, -0.338615f, + -0.216676f, -0.391544f, -1.62185f, -0.718471f, -0.475406f, + -0.782041f, -0.608824f, -1.09633f, -1.27308f, -0.560719f, + -0.207539f, -0.0196445f, -1.05519f, -0.575249f, -1.0642f, + 1.01615f, -0.873633f, -0.417953f, -0.428051f, 0.350259f, + -2.53833f, -2.72203f, 0.672846f, -0.503094f, -1.1374f, + 0.214291f, 0.013305f, 0.0112064f, 1.10532f, 0.030455f, + 0.0239614f, 0.628072f, 0.0539135f, -0.472441f, -0.688439f, + -0.32044f, -0.0234867f, -0.0158436f, -0.949314f, -0.0453161f, + -1.18306f, 0.626845f, -0.426925f, -0.688371f, 
0.415062f, + 0.0640985f, -0.638387f, -2.01399f, -0.209744f, -0.762892f, + -0.0753296f, -0.879315f, -0.520433f, -0.111375f, 0.389742f, + -0.398862f, -0.643227f, -0.246396f, 0.0317051f, 1.06973f, + 0.413617f, 0.180506f, -0.0507897f, -0.00650435f, 0.620892f, + 0.046312f, 0.475032f, 0.906993f, -0.0388061f, -0.256271f, + -1.03323f, 0.0125266f, -0.31116f, -0.377611f, -0.0386407f, + -0.0232745f, -0.353644f, -2.27289f, 0.0571779f, -0.00865006f, + 1.65101f, 0.0175711f, 0.0184585f, 0.558458f, 0.2213f, + -0.285089f, 0.433445f, -0.427177f, -0.0103682f, -0.0101273f, + 0.214085f, -0.0459885f, 0.00761981f, 0.836381f, 0.0175293f, + 0.02508f, -1.51778f, 0.0143956f, -0.162589f, 0.595418f, + 0.21445f, -0.0335848f, -0.0136684f, -0.16686f, -0.14612f, + 0.0816238f, 0.499636f, 0.12458f, -2.41673f, -0.261721f, + -0.676805f, -1.88366f, 0.730462f, 0.69196f, -0.0288489f, + -2.38272f, 0.329876f, 0.014517f, -0.115145f, -3.48151f, + -0.00209072f, -0.0732377f, 0.820443f, -0.0118701f, 0.112145f, + 0.272315f, 0.137531f, -0.0200997f, -0.0397883f, -2.19458f, + 0.183554f, -0.639716f, 0.481605f, -0.621639f, -0.0980299f, + -0.710534f, -0.143105f, -6.77626f, -1.65139f, -2.37718f, + -0.533127f, -1.12574f, 3.34182f, -0.0758663f, 0.0334238f, + -9.48647f, 0.0674974f, 0.0507665f, 0.523007f, -0.0668f, + 0.5736f, -0.589761f, -1.1692f, -0.0236497f, -0.00828928f, + -0.265823f, 1.15284f, 0.307927f, -0.695308f, 0.13725f, + -0.20394f, -0.363965f, -0.331159f, -1.50927f, -1.20051f, + -0.0205825f, -0.0381859f, -0.0579876f, -1.6913f, -1.94626f, + 3.4214f, 3.3922f, -2.13798f, -0.679848f, -0.890735f, + 0.235017f, -0.253202f, -1.0571f, 1.40354f, 0.00719052f, + -1.54365f, -0.7289f, -1.05492f, 0.0238169f, -0.00543592f, + -0.0510353f, -0.175386f, -0.724207f, -0.788936f, 0.039976f, + 1.36966f, 0.869475f, -0.0302774f, -0.0537556f +}; + +static const NN_CONFIG av1_max_part_pred_nn_config = { + NUM_FEATURES, + NUM_LOGITS, + NUM_HIDDEN_LAYERS, + { + NUM_LAYER_0_UNITS, + }, + { + av1_max_part_pred_layer_0_kernel, + 
av1_max_part_pred_logits_kernel, + }, + { + av1_max_part_pred_layer_0_bias, + av1_max_part_pred_logits_bias, + }, +}; + +#undef NUM_HIDDEN_LAYERS +#undef NUM_FEATURES +#undef NUM_LAYER_0_UNITS +#undef NUM_LOGITS + +// Early termination in second pass +static const float av1_simple_motion_search_term_none_mean_128[28] = { + 12.661922f, 12.638062f, 10.896497f, 10.865719f, 10.978963f, 10.940105f, + 11.012235f, 10.972760f, 11.069924f, 11.018533f, 11.773865f, 11.747426f, + 11.891315f, 11.858107f, 11.793916f, 11.766356f, 11.874997f, 11.840164f, + 5.940535f, 0.770746f, 4.292692f, 4.309581f, 0.848423f, 4.292334f, + 4.298179f, 8.514713f, 14.911736f, 19.825352f, +}; + +static const float av1_simple_motion_search_term_none_std_128[28] = { + 1.796731f, 1.797056f, 1.898383f, 1.900753f, 1.846624f, 1.846953f, 1.906632f, + 1.908089f, 1.836533f, 1.835967f, 1.840262f, 1.840671f, 1.816836f, 1.817103f, + 1.879846f, 1.881333f, 1.803102f, 1.802654f, 2.263402f, 0.420354f, 1.117165f, + 1.083779f, 0.358611f, 1.101183f, 1.084938f, 2.462638f, 1.577009f, 1.574711f, +}; + +static const float av1_simple_motion_search_term_none_mean_64[28] = { + 10.904455f, 10.853546f, 9.247903f, 9.184479f, 9.251985f, 9.186686f, + 9.253490f, 9.190190f, 9.270079f, 9.204357f, 10.086511f, 10.031060f, + 10.100875f, 10.045429f, 10.069688f, 10.013173f, 10.082980f, 10.024640f, + 4.888378f, 0.878113f, 3.598450f, 3.628491f, 0.925833f, 3.560971f, + 3.573322f, 8.807137f, 13.348477f, 18.269117f, +}; + +static const float av1_simple_motion_search_term_none_std_64[28] = { + 1.789300f, 1.787061f, 1.823519f, 1.820226f, 1.794643f, 1.788620f, 1.797194f, + 1.795135f, 1.777795f, 1.773634f, 1.794000f, 1.790377f, 1.772197f, 1.769692f, + 1.819050f, 1.817139f, 1.793577f, 1.789333f, 1.998251f, 0.327156f, 0.885748f, + 0.853767f, 0.262043f, 0.902435f, 0.860033f, 1.224865f, 1.603411f, 1.589296f, +}; + +static const float av1_simple_motion_search_term_none_mean_32[28] = { + 9.818970f, 9.751199f, 8.015079f, 7.927318f, 8.029113f, 7.938330f, 
8.012570f, + 7.923719f, 8.033508f, 7.941911f, 8.933057f, 8.857422f, 8.935639f, 8.859187f, + 8.905495f, 8.829741f, 8.929428f, 8.851351f, 4.114069f, 0.954752f, 2.645082f, + 2.709703f, 0.964678f, 2.652077f, 2.673393f, 9.430499f, 11.922798f, 16.942251f, +}; + +static const float av1_simple_motion_search_term_none_std_32[28] = { + 1.737107f, 1.734327f, 1.727923f, 1.720244f, 1.721570f, 1.712775f, 1.718028f, + 1.710370f, 1.711612f, 1.702596f, 1.754856f, 1.748855f, 1.741871f, 1.736304f, + 1.722428f, 1.717380f, 1.713563f, 1.707582f, 1.761170f, 0.207847f, 0.900058f, + 0.862356f, 0.184593f, 0.903822f, 0.856120f, 1.529199f, 1.412085f, 1.453153f, +}; + +static const float av1_simple_motion_search_term_none_mean_16[28] = { + 8.998877f, 8.912468f, 7.085255f, 6.953476f, 7.086386f, 6.954091f, 7.088727f, + 6.955747f, 7.093955f, 6.960635f, 8.065050f, 7.961432f, 8.071631f, 7.967233f, + 8.041699f, 7.937715f, 8.046791f, 7.942183f, 3.833521f, 0.978421f, 1.901347f, + 1.950124f, 0.979418f, 1.928000f, 1.936727f, 9.773951f, 10.735227f, 15.949769f, +}; + +static const float av1_simple_motion_search_term_none_std_16[28] = { + 1.641193f, 1.640172f, 1.614794f, 1.608906f, 1.609571f, 1.603580f, 1.606928f, + 1.601246f, 1.599230f, 1.593529f, 1.633747f, 1.630219f, 1.625695f, 1.622547f, + 1.633827f, 1.630182f, 1.626607f, 1.622777f, 1.548838f, 0.145303f, 0.744550f, + 0.736552f, 0.141980f, 0.742979f, 0.736977f, 1.366255f, 1.258794f, 1.294309f, +}; + +static const float av1_simple_motion_search_term_none_model_128[] = { + -0.6106842357f, -1.0402954455f, 0.6054417656f, -0.2116623578f, + 0.2447714930f, 0.3782256209f, 0.5095592479f, -0.3275620904f, + 0.3886188013f, 0.2629499420f, -0.1979599415f, -0.5389565605f, + 0.1209207902f, -0.4913347466f, 0.3798542731f, -0.2812861709f, + -0.1049824167f, -0.1088672020f, 0.4059596517f, -0.1347896613f, + 0.2276868621f, 0.0506386970f, 0.0071088411f, 0.0467952100f, + 0.2091247458f, -0.7371964736f, 0.1368935545f, 0.3175247786f, + -0.5493146094f, +}; + +static const float 
av1_simple_motion_search_term_none_model_64[] = { + -0.4150046575f, -0.3954358561f, 0.1997997444f, 0.3395826831f, + 0.2827215753f, 0.3395683652f, 0.2483140395f, 0.2722216476f, + 0.2610308009f, 0.3724974359f, -0.0551479654f, -0.1721616359f, + -0.3459358629f, -0.0952524186f, -0.1428993840f, -0.0415654914f, + -0.3169539902f, -0.0269429900f, 0.9891530919f, -0.0125084982f, + 0.0972182377f, 0.0008889801f, 0.0205418050f, 0.0057237854f, + 0.1005222691f, -0.2851321920f, -1.5150336445f, 0.1893942436f, + -0.4337360901f, +}; + +static const float av1_simple_motion_search_term_none_model_32[] = { + -0.4667392852f, -0.3893302767f, 0.1603498635f, 0.2304974726f, + 0.1404975592f, 0.2505516225f, 0.1423053884f, 0.2189318406f, + 0.1379765409f, 0.2638241296f, -0.1342865463f, -0.0549054345f, + -0.1925223436f, -0.1142702769f, 0.0127811659f, 0.0868639997f, + -0.0643197251f, 0.0279496470f, 0.9904395769f, -0.0095178685f, + 0.1179410649f, -0.0013411972f, 0.0095060660f, 0.0195730400f, + 0.0779717771f, -0.2498860763f, -0.8168817125f, -0.4798397348f, + -0.6609679881f, +}; + +static const float av1_simple_motion_search_term_none_model_16[] = { + -0.3021081992f, -0.4620153673f, 0.0448577479f, 0.1738455035f, + 0.0663209177f, 0.1629614573f, 0.0555168744f, 0.1631870212f, + 0.0425805150f, 0.1688564954f, 0.0434083772f, -0.0046603915f, + -0.0271580056f, -0.0183879127f, 0.1073730471f, 0.0314201476f, + 0.0576891756f, 0.0119723753f, 0.9084332022f, -0.0188429077f, + 0.0755089811f, -0.0172550234f, 0.0037663075f, 0.0022094472f, + 0.0500247894f, -0.2944572004f, -0.8908521199f, -0.2555515792f, + -0.5396254205f, +}; + +// Early termination in firstpass +static const float av1_fp_simple_motion_search_term_none_mean_32[20] = { + 10.216787f, 10.167575f, 8.405353f, 8.340786f, 8.436503f, + 8.373259f, 8.444113f, 8.379074f, 8.448215f, 8.384669f, + 4.107491f, 0.923902f, 2.702687f, 2.712742f, 0.953166f, + 2.703244f, 2.707070f, 9.549801f, 12.013671f, 17.059454f, +}; + +static const float 
av1_fp_simple_motion_search_term_none_std_32[20] = { + 1.886182f, 1.886638f, 1.884324f, 1.883410f, 1.851800f, 1.851652f, 1.847129f, + 1.848014f, 1.832187f, 1.832360f, 1.758185f, 0.265155f, 0.939592f, 0.932395f, + 0.211284f, 0.950024f, 0.945295f, 1.846744f, 1.453674f, 1.505994f, +}; + +static const float av1_fp_simple_motion_search_term_none_mean_16[20] = { + 9.131485f, 9.065489f, 7.254479f, 7.158092f, 7.274240f, 7.178158f, 7.278780f, + 7.182110f, 7.278793f, 7.182714f, 3.981902f, 0.964040f, 2.080875f, 2.087185f, + 0.973397f, 2.088189f, 2.090166f, 9.386505f, 10.826546f, 15.985614f, +}; + +static const float av1_fp_simple_motion_search_term_none_std_16[20] = { + 1.681172f, 1.688587f, 1.710854f, 1.717533f, 1.684010f, 1.691476f, 1.683537f, + 1.691523f, 1.674699f, 1.682130f, 1.639731f, 0.186191f, 0.796448f, 0.795075f, + 0.160921f, 0.791005f, 0.790048f, 1.430960f, 1.337976f, 1.370498f, +}; + +static const float av1_fp_simple_motion_search_term_none_mean_8[20] = { + 7.821461f, 7.714526f, 5.799360f, 5.606948f, 5.805885f, 5.614357f, 5.794252f, + 5.599669f, 5.798780f, 5.605399f, 4.069016f, 0.977720f, 1.577513f, 1.581266f, + 0.983371f, 1.524603f, 1.524952f, 9.221803f, 9.508886f, 14.972815f, +}; + +static const float av1_fp_simple_motion_search_term_none_std_8[20] = { + 1.618036f, 1.634415f, 1.652861f, 1.672006f, 1.646337f, 1.664935f, 1.650876f, + 1.670476f, 1.645141f, 1.664301f, 1.502258f, 0.147592f, 0.760353f, 0.762547f, + 0.127879f, 0.741096f, 0.742186f, 1.042003f, 1.292524f, 1.250398f, +}; + +#define NUM_HIDDEN_LAYERS_32 1 +#define NUM_FEATURES_32 20 +#define NUM_LAYER_0_UNITS_32 20 +#define NUM_LOGITS_32 1 + +static const float + av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_32[] = { + -0.293987f, 0.796773f, -0.0888487f, -0.00796495f, -0.343768f, + 0.0783252f, 0.0596814f, -0.235432f, -0.0780005f, -0.409017f, + -0.256821f, -0.281654f, 1.00889f, 0.701893f, -0.0181661f, + 0.119718f, 0.0956582f, 0.76792f, 0.235693f, 0.351628f, + -1.28111f, -1.45847f, 0.387732f, 
0.476054f, 0.384561f, + 0.427465f, 0.11875f, -0.0176598f, -0.0528453f, 0.395589f, + -0.331994f, 0.0442108f, 0.195171f, -0.0377402f, -0.0736457f, + -0.0490903f, 0.116165f, -0.549512f, 0.12968f, 0.641055f, + -1.03066f, -0.601979f, 0.351981f, -0.122019f, 0.00869275f, + 0.399222f, -0.343995f, -0.444257f, -0.160805f, -0.537537f, + 0.261478f, -0.163785f, 0.218916f, 0.106506f, -0.103819f, + 0.0121841f, 0.284757f, -0.362989f, 1.10793f, 0.477236f, + -0.424117f, -0.884156f, -0.468291f, -0.510531f, 0.791441f, + 0.75243f, 0.839871f, 0.604127f, -0.182956f, -0.246703f, + -1.25861f, 0.0546303f, 0.0811323f, 0.00655988f, 0.0286305f, + -0.00938366f, -0.0291418f, -0.231632f, -0.331077f, 1.12479f, + -0.635514f, -0.146066f, 0.853122f, 0.923699f, 0.180011f, + -0.252973f, 0.1474f, -0.454344f, 0.354736f, 0.576872f, + -1.43275f, 0.0327868f, 0.140849f, -0.102523f, 0.0524867f, + 0.007091f, -0.00232578f, -0.536116f, -0.700144f, 0.166646f, + 0.0636548f, 0.44645f, -0.346062f, -0.685779f, -1.0792f, + -0.999219f, 0.442744f, 0.371198f, 0.777914f, 0.719409f, + -0.417984f, 0.0602868f, 0.0225539f, 0.0457407f, 0.0249501f, + 0.0126021f, 0.00450792f, 0.0485095f, 0.203485f, 0.584116f, + -0.599426f, -0.244633f, 0.168231f, -0.00134934f, -0.106987f, + -0.0490239f, -0.22029f, 0.138017f, 0.373674f, 0.00638684f, + -2.08003f, 0.106453f, 0.124456f, -0.0286108f, 0.0422698f, + 0.013734f, 0.0780971f, -0.40173f, 0.473453f, 1.16836f, + -0.251035f, 0.0119074f, 0.319241f, 0.0422023f, -0.730454f, + -0.745948f, 0.796709f, 0.277634f, 0.09711f, -0.212224f, + 0.825348f, 0.0208521f, -0.0238098f, 0.00929265f, 0.0516351f, + -0.02329f, 0.0983163f, -0.180721f, 0.0122096f, -0.246159f, + 0.61468f, 0.923765f, 0.240435f, -0.294845f, -0.495317f, + -0.0563837f, -0.417936f, 0.154874f, -0.604407f, -0.0681337f, + -0.65738f, -0.0270073f, 0.0920023f, -0.0742724f, 0.820862f, + -0.602758f, -1.20617f, -0.201707f, 0.869499f, -0.0539076f, + 0.403097f, 0.429168f, -0.938227f, -0.830894f, -0.362462f, + -0.0658648f, 0.471469f, -0.264827f, 
0.610275f, 0.367995f, + 0.735662f, -0.0473157f, -0.0380545f, -0.0848067f, -0.146108f, + -0.125875f, -0.0576117f, -0.296198f, -0.100443f, -0.212971f, + 0.593524f, 1.23111f, -0.810009f, -0.604572f, 0.203021f, + 0.256285f, -1.17049f, -1.19156f, 0.24365f, 0.727876f, + -0.466826f, 0.0298762f, -0.0331735f, -0.0109056f, 0.0114862f, + 0.00396703f, 0.0385985f, -0.0587946f, 0.821079f, 0.0582033f, + 0.349156f, 1.03529f, -0.407036f, 0.200308f, -0.265649f, + -0.104567f, 0.161149f, -0.0717528f, -0.0112724f, 0.0681578f, + 0.103809f, -0.0807997f, 0.0316814f, -0.332323f, 0.112254f, + -0.163981f, 0.118988f, -0.777055f, -1.34047f, -0.910482f, + 0.74599f, -0.59633f, 0.165649f, -0.594998f, 0.0845802f, + 0.00440975f, 0.122606f, -0.463991f, 0.418502f, -0.339126f, + 1.41847f, -0.109594f, -0.411879f, -0.444865f, -0.0404821f, + -0.0607352f, -0.663753f, -0.724327f, -0.138642f, 0.834144f, + -0.811695f, -0.930264f, 0.150993f, -0.325565f, 0.0615853f, + -0.473993f, 0.0966587f, 0.315197f, 1.0345f, 0.35441f, + 0.703234f, -0.335715f, 0.783153f, 0.467976f, -0.0234736f, + 0.549724f, 0.539107f, -0.510182f, -0.154442f, 0.0126656f, + 1.66711f, 0.884555f, 0.118675f, -0.341705f, 0.195316f, + -0.0366564f, -0.619244f, -0.634092f, -0.559951f, 0.0564255f, + 0.765917f, 0.0510238f, 0.0667615f, 0.0699302f, -0.0351751f, + -0.0484402f, -0.000792665f, -0.10775f, -0.337121f, -0.983947f, + 0.517793f, 1.34977f, -0.567602f, 0.129921f, -0.443722f, + -0.276277f, -0.501404f, -0.183234f, -0.553055f, -0.447434f, + -0.35529f, -0.0444689f, 0.0192031f, 0.0372702f, -0.195202f, + -0.020753f, -0.0247035f, 0.420298f, 1.39373f, 0.203699f, + -0.218818f, 0.250734f, -0.0282348f, 0.411986f, -0.262946f, + 0.526339f, 0.242769f, -0.159857f, -0.546788f, -0.0410147f, + 0.954238f, -0.0252765f, 0.639488f, -0.491367f, -0.0572638f, + 0.285763f, -0.45764f, 0.121657f, -1.24374f, -0.372479f, + -0.111521f, 0.194134f, -0.271364f, 0.179678f, 0.121237f, + -0.14305f, -0.205662f, 0.216891f, 0.344568f, -0.523745f, + -1.00908f, 0.180965f, 0.0263031f, 
-0.0556144f, 0.0831083f, + -0.0623274f, 0.112748f, 0.597137f, -0.502616f, -1.10624f, + -0.0487462f, -1.10744f, -0.125653f, 0.277049f, -0.141329f, + -0.00457003f, -0.161038f, 0.588462f, 0.323317f, 0.49762f, + 0.477561f, 0.901705f, -0.264511f, 0.256557f, 0.076023f, + -0.0460696f, 0.0830666f, -0.0651269f, -0.881245f, -0.285999f, + 0.53127f, 0.914533f, 0.0505795f, -0.3054f, -0.0988696f, + -0.0658403f, 0.15979f, -0.453316f, -0.824834f, -0.280222f, + -0.686952f, -0.0768344f, -1.12235f, -0.815408f, 0.0202134f, + -0.111892f, 0.0847659f, -0.18763f, 0.597782f, 0.364016f + }; + +static const float + av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_32[] = { + -1.541f, -0.00935641f, -1.50754f, -0.638648f, -0.679403f, + -0.0387804f, -0.714791f, -1.69522f, 0.435677f, -1.5846f, + 0.108788f, 0.614982f, 0.111048f, -0.465826f, -0.611358f, + 0.637197f, 0.929621f, -1.20889f, 0.954558f, 0.716529f + }; + +static const float av1_fp_simple_motion_search_term_none_logits_kernel_32[] = { + 0.396195f, -0.791364f, -0.881893f, 1.0542069f, 0.772562f, + 0.60815647f, 1.117405f, -1.272638f, 0.483183f, -0.917147f, + 0.690799f, -0.601466f, -0.545536f, -0.416353f, -0.927874f, + 0.972198f, -0.3770457f, 0.542694f, -0.591889f, 0.464565f +}; + +static const float av1_fp_simple_motion_search_term_none_logits_bias_32[] = { + -0.590318f +}; + +static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_32 = { + NUM_FEATURES_32, + NUM_LOGITS_32, + NUM_HIDDEN_LAYERS_32, + { + NUM_LAYER_0_UNITS_32, + }, + { + av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_32, + av1_fp_simple_motion_search_term_none_logits_kernel_32, + }, + { + av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_32, + av1_fp_simple_motion_search_term_none_logits_bias_32, + }, +}; + +#undef NUM_HIDDEN_LAYERS_32 +#undef NUM_FEATURES_32 +#undef NUM_LAYER_0_UNITS_32 +#undef NUM_LOGITS_32 + +#define NUM_HIDDEN_LAYERS_16 1 +#define NUM_FEATURES_16 20 +#define NUM_LAYER_0_UNITS_16 24 +#define NUM_LOGITS_16 1 + 
+static const float + av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_16[] = { + -0.315922f, 0.74455f, -0.0196939f, 0.238336f, 0.288554f, + 0.0845902f, -0.0121831f, 0.455303f, 0.0235902f, 0.218997f, + -0.0445164f, 0.0752211f, 0.0539915f, -0.0439682f, -0.397139f, + -0.0030004f, -0.106365f, 0.845384f, 0.684638f, -0.965702f, + 0.307643f, -0.0433377f, -0.0644826f, -0.214946f, -0.44467f, + 0.142967f, 0.0109982f, -0.344458f, -0.42947f, 0.269175f, + -0.88534f, -0.28077f, -1.36018f, -0.33725f, -0.0885953f, + -0.123887f, 0.218107f, -0.0759977f, 0.739124f, 0.684048f, + 0.577964f, -0.328481f, -0.247837f, 0.00546713f, 0.191895f, + -0.145274f, 0.320121f, -0.482379f, 0.534585f, -0.1582f, + 0.944784f, 0.944665f, 0.0494451f, -0.0399724f, -0.170375f, + -0.0869746f, 0.106216f, -0.120556f, -1.57849f, -0.752895f, + 0.424454f, -0.0269515f, 0.00398589f, 0.214165f, -0.142986f, + 0.199223f, 0.049624f, -0.116783f, -0.648119f, -0.311599f, + 0.122629f, -0.0338422f, 0.345092f, -0.408254f, 0.601037f, + -0.00146985f, 0.00133926f, 0.0392668f, -0.931156f, 0.31429f, + -0.150243f, 0.0755763f, -0.32177f, 0.258521f, -0.104078f, + -0.144506f, 0.0199566f, -0.454723f, -0.292959f, -0.0953681f, + -1.24843f, 0.446814f, -0.311363f, 0.0590878f, -0.0568717f, + -0.421585f, 0.179852f, 0.668763f, 0.48914f, 0.290584f, + -1.14053f, -1.37576f, 0.420112f, -0.158582f, 0.268231f, + 0.252999f, 0.276423f, 0.529033f, 0.141127f, 0.702762f, + 0.181407f, -0.0279289f, -0.0194757f, 0.0752152f, -0.136963f, + 0.00902489f, 0.125334f, 0.0680212f, -0.370449f, 0.438003f, + -0.600869f, 0.154209f, -0.36306f, -0.484209f, 0.140093f, + 0.0743079f, -0.143317f, 0.0442872f, 0.272089f, 0.601531f, + 1.20687f, -0.280695f, 0.222235f, -0.0106747f, -0.017026f, + 0.204008f, -0.0316111f, -0.64679f, -0.866749f, -0.774231f, + 0.306231f, -0.0940114f, -0.56555f, -0.34399f, 0.425142f, + 0.424064f, -0.50189f, -0.146558f, 0.544899f, 0.141728f, + 1.14592f, -0.0124826f, 0.111613f, -0.0862228f, 0.0211737f, + 0.0614017f, 0.0245077f, -0.454523f, 
-0.0766391f, -0.436808f, + 0.251409f, -0.13354f, -0.242447f, -0.311807f, -0.844505f, + -0.671486f, 0.0946297f, 0.241702f, 0.856521f, 0.529763f, + -0.869772f, -0.0016341f, 0.14511f, 0.0136254f, -0.0359721f, + -0.0454713f, 0.00664495f, 0.0373555f, 0.653991f, -0.075867f, + -0.102728f, -0.947685f, -0.119479f, -0.145413f, 0.148364f, + 0.310885f, -0.266837f, 0.354087f, 0.299469f, 0.603911f, + 0.257161f, 0.0190527f, 0.152862f, -0.0987196f, -0.293369f, + 0.139026f, -0.128421f, 0.0505933f, -0.703803f, 1.08628f, + -0.562294f, -0.818943f, 0.102178f, 0.727399f, -0.228433f, + 0.484057f, 0.0595919f, -0.0559087f, -0.549447f, 0.176168f, + 1.41744f, -0.126284f, 0.0987251f, -0.00123073f, 0.00510827f, + 0.105209f, 0.0671775f, -0.438525f, 0.211028f, -0.782459f, + 0.286411f, -0.459887f, 0.0633669f, 0.329958f, -0.0736945f, + 0.45188f, -0.2447f, 0.676601f, 0.600321f, -0.0336198f, + 0.108531f, 0.0452834f, -0.0848577f, 0.0731281f, 1.32381f, + -0.118349f, 0.129497f, -0.840938f, -1.45444f, -0.559047f, + -0.248109f, -0.491559f, -0.139812f, 0.175964f, 0.168687f, + 0.123031f, 0.201625f, 0.422849f, 0.34436f, 0.0426694f, + 0.558045f, -0.246772f, 0.679483f, -0.0959578f, -0.102879f, + 0.391029f, 0.280906f, 0.0867408f, -1.10932f, 0.402526f, + -0.227285f, 0.336087f, -0.237765f, 0.185619f, -0.309732f, + 0.0781132f, -0.0234955f, 0.0828806f, 0.19966f, -0.241288f, + -0.224634f, 0.0638918f, -0.143521f, -0.0206692f, -0.27131f, + 0.973051f, 1.12031f, 0.262846f, 0.471585f, 0.105231f, + -0.386434f, -0.355846f, 0.7359f, 0.567308f, 0.130768f, + 0.242369f, -0.0272523f, -0.118436f, 0.374145f, 0.24802f, + -1.00186f, -0.0241195f, 0.0140446f, 0.0202831f, 0.163197f, + 0.0399298f, -0.00912791f, -0.280572f, -0.309893f, -0.644495f, + 0.243838f, 0.731391f, 0.0725078f, 0.350308f, -0.136691f, + 0.208814f, 0.0218567f, -0.0805393f, -0.18681f, -0.214638f, + 0.273354f, -0.355047f, 0.242748f, 0.472951f, -0.202705f, + 0.405247f, 0.161622f, -0.284883f, -1.31181f, -0.661056f, + -0.248219f, -0.827307f, 0.289221f, 0.660529f, 
0.48563f, + 0.407366f, 0.0327303f, -0.0610309f, -0.647064f, 0.0899991f, + 0.376267f, 1.27555f, 0.0264175f, 0.153931f, 1.07345f, + 0.0715052f, 0.174473f, 0.01322f, -0.715723f, 0.113909f, + 0.100968f, -0.457287f, -0.672022f, -0.20532f, 0.895176f, + 0.357034f, 0.5413f, 0.918393f, -0.455f, -0.499617f, + -1.21799f, 0.0634338f, 0.144944f, -0.106715f, 0.0227713f, + -0.0203213f, 0.030851f, -0.0726756f, 0.589192f, -0.060841f, + -0.198521f, 0.497179f, -0.0591156f, -0.135466f, -0.132638f, + -0.181333f, -0.332358f, 0.0349959f, 0.212885f, -0.536206f, + -0.425009f, -0.035525f, 0.0384449f, 0.0360549f, -0.0383953f, + -0.0263281f, -0.0228435f, 1.11771f, 0.928061f, -0.163923f, + -0.327868f, -0.894518f, 0.00448907f, 0.0805977f, 0.329559f, + 0.157429f, 0.292729f, 0.497688f, 0.188659f, 0.203724f, + -1.26001f, -0.0392533f, -0.0566088f, 0.000859925f, 0.125254f, + 0.054261f, 0.0357295f, -0.393813f, -0.275944f, 0.299657f, + -0.211421f, 0.038172f, -0.439829f, -0.913949f, 0.35642f, + 0.865473f, -0.472033f, -0.752376f, 0.995255f, 0.417965f, + -0.680645f, 0.0622027f, 0.128878f, -0.0357859f, 0.0793577f, + 0.203629f, -0.0600867f, 0.0512268f, 0.528584f, 0.23889f, + 0.38255f, -0.216407f, -0.0338828f, 0.0328103f, -0.885678f, + -0.716634f, 0.438663f, 0.320841f, -0.119656f, 0.626092f, + 0.8526f, -0.0325005f, -0.0275416f, -0.171131f, 0.0260563f, + -0.0162027f, 0.0879367f, -0.340473f, 0.0220265f, -0.1731f, + 0.512539f, 0.587822f, -0.175619f, 0.177215f, -0.35458f, + -0.159059f, -0.423754f, 0.0198413f, -0.336208f, -0.359052f, + -1.50819f, 0.0628184f, 0.054506f, 0.0048834f, 0.361657f, + 0.00986886f, -0.0721521f, -0.256765f, 1.41173f, 0.376196f, + -0.0783331f, 0.174803f, -0.00240091f, -0.306571f, -0.304654f, + -0.0348377f, 0.115569f, -0.20359f, -0.162341f, -0.0443526f, + -0.848317f, -0.228167f, 0.699534f, 0.482092f, -0.0921484f, + -0.172425f, -0.0610094f, -0.188327f, 0.836209f, 0.541725f + }; + +static const float + av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_16[] = { + -0.388147f, 
-0.0868767f, 0.702129f, 0.376659f, -0.709988f, 0.496603f, + -0.238442f, -1.35761f, -0.391887f, 0.235468f, -0.327982f, 0.731842f, + 1.0949f, -0.789218f, -0.881452f, 0.514341f, 0.727894f, -0.494498f, + -1.32304f, -1.22643f, -0.294287f, -1.3974f, -0.128148f, -0.0956137f + }; + +static const float av1_fp_simple_motion_search_term_none_logits_kernel_16[] = { + 0.456147f, 0.248707f, -0.5205241f, -0.1506567f, 0.388359f, -0.6074409f, + -0.4719775f, -0.733864f, 0.5588447f, -0.4021345f, -1.140733f, -0.73399f, + -0.4299591f, 0.450688f, 0.817564f, -0.265486f, -0.3525806f, 0.55188314f, + 1.365457f, 1.180764f, 0.587772f, -0.870683f, 0.818839f, 0.318488f +}; + +static const float av1_fp_simple_motion_search_term_none_logits_bias_16[] = { + -0.1046478f +}; + +static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_16 = { + NUM_FEATURES_16, + NUM_LOGITS_16, + NUM_HIDDEN_LAYERS_16, + { + NUM_LAYER_0_UNITS_16, + }, + { + av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_16, + av1_fp_simple_motion_search_term_none_logits_kernel_16, + }, + { + av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_16, + av1_fp_simple_motion_search_term_none_logits_bias_16, + }, +}; + +#undef NUM_HIDDEN_LAYERS_16 +#undef NUM_FEATURES_16 +#undef NUM_LAYER_0_UNITS_16 +#undef NUM_LOGITS_16 + +#define NUM_HIDDEN_LAYERS_8 1 +#define NUM_FEATURES_8 20 +#define NUM_LAYER_0_UNITS_8 16 +#define NUM_LOGITS_8 1 + +static const float + av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_8[] = { + -1.11024f, -0.530449f, -0.164768f, 0.675431f, 0.456155f, + 0.711099f, -0.248095f, 0.112132f, -0.131481f, 0.234457f, + 0.128073f, 0.306214f, 0.175471f, 0.220189f, -0.270533f, + 0.293534f, -0.0795547f, 0.234901f, -0.191754f, 0.101171f, + -0.108621f, 0.395477f, -0.529459f, -0.354854f, -0.941334f, + -0.237689f, 0.39357f, 0.527129f, 0.174333f, -0.00520422f, + 1.22219f, -0.21815f, 0.0866816f, -0.29591f, -0.212968f, + 0.00431436f, -0.295382f, -0.582317f, -0.284654f, 0.486427f, + -0.202448f, 
-0.0421883f, -0.116346f, -0.345832f, -0.0471637f, + -0.149954f, -0.0969526f, -0.59491f, 0.594364f, 0.298285f, + -1.33301f, 0.149562f, 0.097433f, 0.157641f, -0.231132f, + -0.0191656f, 0.149396f, 0.811553f, 1.07336f, 0.140674f, + 1.02134f, 0.455909f, -0.0548795f, 0.0459996f, -0.0589837f, + -0.116328f, -0.607502f, -0.232595f, -0.517977f, -0.325901f, + 1.35047f, -0.148698f, 0.0313182f, 0.181634f, 0.06539f, + 0.00820322f, 0.0522113f, -1.06071f, -0.817999f, -0.527422f, + -1.39175f, -0.110088f, 0.0858626f, -0.247541f, 0.29043f, + 1.13767f, 0.185834f, 0.390613f, -0.501175f, -0.214176f, + -0.256376f, 0.496687f, 0.240471f, 0.218852f, 0.513543f, + 0.400559f, -0.249168f, -0.752987f, 0.430491f, -0.72299f, + 0.339754f, 0.396623f, -0.0638322f, 0.353122f, 0.355662f, + -0.0704821f, 0.195448f, 0.179396f, 0.486533f, 0.0815535f, + -0.503726f, -0.000321223f, 0.501591f, -0.117849f, 0.217667f, + -0.123391f, -0.4026f, 0.149756f, -0.0359276f, -0.0990213f, + -0.215278f, -0.293649f, 0.301629f, -0.11081f, -0.206725f, + -0.00147108f, 0.363644f, -0.430092f, 0.169524f, 0.116091f, + -0.583605f, -0.0974948f, 0.253256f, 0.22648f, 0.136902f, + -0.882541f, -0.75078f, -0.0629343f, 0.411035f, 0.265742f, + -0.360904f, -0.899324f, 0.605871f, 0.0318372f, 0.0735312f, + -0.00960722f, 0.691249f, 0.127449f, -0.133021f, -0.0793589f, + 0.665591f, -0.0682262f, -0.0437626f, 0.0783621f, 2.25727f, + 0.126529f, -0.0320763f, -0.261759f, -1.19987f, 0.216295f, + -0.253886f, -0.642908f, 0.1865f, 0.00299179f, 0.0246782f, + -0.00750628f, 0.566367f, 0.99916f, -0.0209625f, 0.273254f, + 1.09724f, 0.30026f, 0.21585f, -0.0276715f, 0.338996f, + 0.129884f, -0.00628438f, 0.0461783f, -1.36378f, -0.394756f, + -0.395261f, 0.215928f, 0.252803f, -0.207108f, -0.0506214f, + -0.0138889f, 0.124197f, -0.0522996f, 0.533803f, -0.25729f, + -0.463514f, 0.128322f, -1.04751f, -0.605498f, -0.107235f, + -0.00813289f, 0.539742f, -0.0524178f, 0.272101f, 0.151935f, + 0.607511f, -0.0608427f, 0.36342f, 0.0999134f, 0.69712f, + -0.152471f, 0.364244f, 
0.410644f, 0.312606f, 0.405679f, + -0.371656f, -0.0492209f, -0.148911f, 0.214996f, -0.274749f, + -0.0372888f, 0.079023f, -0.429136f, -1.30393f, -0.833824f, + -1.31373f, -0.445343f, 0.526917f, 1.30569f, -0.0626746f, + 0.282353f, -0.28552f, 0.28084f, -0.234934f, 0.227076f, + 1.09919f, 0.33248f, -0.114933f, 0.40629f, 0.331031f, + 0.245334f, -0.0318782f, 0.00735305f, -1.58715f, 0.126443f, + -0.09472f, -0.182152f, 0.311673f, -0.186136f, 0.817743f, + 0.928961f, 0.117334f, -0.373644f, -0.0797864f, 0.205565f, + 0.0789797f, 0.0757131f, -0.152409f, 0.30301f, -0.0170824f, + -0.194496f, 0.485547f, 0.370124f, -0.802044f, -0.789671f, + 0.669258f, 0.55082f, -0.438853f, 0.0597597f, -0.0148101f, + -0.41603f, 0.0486339f, -0.464523f, -0.413725f, 0.00907629f, + 0.70351f, -0.136422f, -0.145957f, -0.0626726f, -0.115773f, + -0.333937f, 0.135474f, -0.379598f, -0.134422f, 0.227595f, + 0.908927f, 0.759504f, -0.0088258f, -0.349333f, 0.122667f, + -0.682175f, 0.2201f, -0.332003f, -0.44433f, -0.620308f, + -1.36716f, -0.0167907f, -0.538969f, 0.256824f, -0.0706724f, + -0.0392471f, -0.156312f, 0.153699f, 1.41967f, 0.0434739f, + 0.428178f, -0.0714879f, 0.0912104f, 0.00687985f, 0.341789f, + 0.217381f, 0.128288f, 0.0286751f, 0.527344f, -0.428139f, + 0.60908f, 1.02074f, -0.0977894f, 0.158067f, 0.28958f, + -0.065152f, 0.120616f, -0.882976f, -1.10413f, -1.37497f + }; + +static const float + av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_8[] = { + 1.37086f, -1.61858f, -1.32395f, 0.276031f, -0.124696f, -1.71489f, + -1.68429f, 1.79103f, -0.335306f, -1.81523f, 0.841083f, -0.542628f, + -1.82168f, 0.459829f, 0.0949306f, 0.918486f + }; + +static const float av1_fp_simple_motion_search_term_none_logits_kernel_8[] = { + -0.283418f, -0.444453f, 0.4977782f, -0.4138758f, 0.41890771f, 0.22149438f, + 0.545079f, -0.729164f, 0.619389f, 0.5169534f, -0.4236282f, 0.7304213f, + 0.531938f, -0.14828f, 0.75119f, -0.464074f +}; + +static const float av1_fp_simple_motion_search_term_none_logits_bias_8[] = { + 
-2.22338f +}; + +static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_8 = { + NUM_FEATURES_8, + NUM_LOGITS_8, + NUM_HIDDEN_LAYERS_8, + { + NUM_LAYER_0_UNITS_8, + }, + { + av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_8, + av1_fp_simple_motion_search_term_none_logits_kernel_8, + }, + { + av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_8, + av1_fp_simple_motion_search_term_none_logits_bias_8, + }, +}; #undef NUM_HIDDEN_LAYERS_8 #undef NUM_FEATURES_8 #undef NUM_LAYER_0_UNITS_8 #undef NUM_LOGITS_8 -#endif + +static const float av1_fp_simple_motion_search_term_none_thresh_32 = + -2.2884985045792563f; +static const float av1_fp_simple_motion_search_term_none_thresh_16 = + -1.6656874577527165f; +static const float av1_fp_simple_motion_search_term_none_thresh_8 = + -3.608804354309157f; #ifdef __cplusplus } // extern "C" diff --git a/libaom/av1/encoder/partition_strategy.c b/libaom/av1/encoder/partition_strategy.c new file mode 100644 index 0000000..e8270b3 --- /dev/null +++ b/libaom/av1/encoder/partition_strategy.c @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <float.h> + +#include "aom_ports/system_state.h" + +#include "av1/common/enums.h" +#include "av1/common/reconinter.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/partition_model_weights.h" +#include "av1/encoder/partition_strategy.h" +#include "av1/encoder/rdopt.h" + +// Performs a simple_motion_search with a single reference frame and extract +// the variance of residues. Here features is assumed to be a length 6 array. +// After this function is called, we will store the following in to features: +// features[0] = log(1 + dc_q**2/256) +// features[1] = log(1 + variance_of_residue) +// for i in [2, 3, 4, 5]: +// features[i] = log(1 + variance_of_residue_in_block[i]/variance_of_residue) +static void get_res_var_features(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, + int mi_col, BLOCK_SIZE bsize, + float *features) { + // TODO(chiyotsai@google.com): The data this model trained on did not also use + // SIMPLE_TRANSLATION to build the inter_predictor. Retraining and tuning the + // model with the correct data should give better performance. 
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]); + + MACROBLOCKD *xd = &x->e_mbd; + + // Perform a single motion search in Y_PLANE to make a prediction + const int use_subpixel = 0; + + // Start getting the features + int f_idx = 0; + + // Q_INDEX + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + aom_clear_system_state(); + features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f); + + // VARIANCE + unsigned int sse = 0; + unsigned int var = 0; + const MV ref_mv_full = { .row = 0, .col = 0 }; + av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full, + use_subpixel, &sse, &var); + aom_clear_system_state(); + features[f_idx++] = logf(1.0f + (float)var); + + // Regional + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const uint8_t *dst = xd->plane[0].dst.buf; + const int dst_stride = xd->plane[0].dst.stride; + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + int r_idx = 0; + for (r_idx = 0; r_idx < 4; r_idx++) { + const int x_idx = (r_idx & 1) * bw / 2; + const int y_idx = (r_idx >> 1) * bh / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int dst_offset = y_idx * dst_stride + x_idx; + const unsigned int sub_var = cpi->fn_ptr[subsize].vf( + src + src_offset, src_stride, dst + dst_offset, dst_stride, &sse); + aom_clear_system_state(); + const float var_ratio = (1.0f + (float)sub_var) / (4.0f + (float)var); + features[f_idx++] = var_ratio; + } +} + +void av1_simple_motion_search_based_split( + AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *partition_none_allowed, int *partition_horz_allowed, + int *partition_vert_allowed, int *do_rectangular_split, + int *do_square_split) { + const NN_CONFIG *nn_config = NULL; + float split_only_thresh = 0.0f; + if (bsize == BLOCK_128X128) { + nn_config = 
&av1_simple_motion_search_based_split_nn_config_128; + split_only_thresh = av1_simple_motion_search_based_split_thresh_128; + } else if (bsize == BLOCK_64X64) { + nn_config = &av1_simple_motion_search_based_split_nn_config_64; + split_only_thresh = av1_simple_motion_search_based_split_thresh_64; + } else if (bsize == BLOCK_32X32) { + nn_config = &av1_simple_motion_search_based_split_nn_config_32; + split_only_thresh = av1_simple_motion_search_based_split_thresh_32; + } else if (bsize == BLOCK_16X16) { + nn_config = &av1_simple_motion_search_based_split_nn_config_16; + split_only_thresh = av1_simple_motion_search_based_split_thresh_16; + } else if (bsize == BLOCK_8X8) { + // Disable BLOCK_8X8 for now +#if !CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 + nn_config = &av1_simple_motion_search_based_split_nn_config_8; + split_only_thresh = av1_simple_motion_search_based_split_thresh_8; +#endif + } else { + assert(0 && "Unexpected block size in simple_motion_based_split"); + } + if (nn_config) { + float features[6] = { 0 }; + float score = 0; + get_res_var_features(cpi, x, mi_row, mi_col, bsize, features); + av1_nn_predict(features, nn_config, &score); + + if (score > split_only_thresh) { + *partition_none_allowed = 0; + *partition_horz_allowed = 0; + *partition_vert_allowed = 0; + *do_rectangular_split = 0; + } + if (cpi->sf.simple_motion_search_split_only >= 2) { + if (score < -split_only_thresh) *do_square_split = 0; + // For larger scores (>split_only_thresh), none and rectangular partitions + // are skipped. As score reduces, possibility of split decreases. Hence + // for near larger scores (.875 * split_only_thresh to split_only_thresh) + // none partition is disabled, but rectangular partitions are evaluated + // additionally. + if (score > (split_only_thresh * 0.875)) *partition_none_allowed = 0; + } + } +} + +// Given a list of ref frames in refs, performs simple_motion_search on each of +// the refs and returns the ref with the smallest sse. 
Returns -1 if none of the +// ref in the list is available. Also stores the best sse and var in best_sse, +// best_var, respectively. If save_mv_code is -1, don't update mv_ref_fulls in +// pc_tree. If save_mv_code is between 0 and 3, update mv_ref_fulls under +// pc_tree->split[i]. If save_mv_code is 4, update mv_ref_fulls under pc_tree. +static int simple_motion_search_get_best_ref( + AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row, + int mi_col, BLOCK_SIZE bsize, const int *const refs, int num_refs, + int use_subpixel, int save_mv_code, unsigned int *best_sse, + unsigned int *best_var) { + // TODO(chiyotsai@google.com): The calculation of variance currently uses + // bsize, so we might take area outside of the image into account. We need to + // modify the SIMD functions to fix this later. + const AV1_COMMON *const cm = &cpi->common; + int best_ref = -1; + + if (mi_col >= cm->mi_cols || mi_row >= cm->mi_rows) { + // If the whole block is outside of the image, set the var and sse to 0. 
+ *best_var = 0; + *best_sse = 0; + + return best_ref; + } + + // Otherwise do loop through the reference frames and find the one with the + // minimum SSE + const MACROBLOCKD *xd = &x->e_mbd; + const MV *mv_ref_fulls = pc_tree->mv_ref_fulls; + + const int num_planes = 1; + + *best_sse = INT_MAX; + + for (int ref_idx = 0; ref_idx < num_refs; ref_idx++) { + const int ref = refs[ref_idx]; + + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) { + unsigned int curr_sse = 0, curr_var = 0; + av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, + mv_ref_fulls[ref], num_planes, use_subpixel); + curr_var = cpi->fn_ptr[bsize].vf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, + xd->plane[0].dst.stride, &curr_sse); + if (curr_sse < *best_sse) { + *best_sse = curr_sse; + *best_var = curr_var; + best_ref = ref; + } + + const int new_mv_row = x->best_mv.as_mv.row / 8; + const int new_mv_col = x->best_mv.as_mv.col / 8; + if (save_mv_code == 4) { + pc_tree->mv_ref_fulls[ref].row = new_mv_row; + pc_tree->mv_ref_fulls[ref].col = new_mv_col; + } else if (save_mv_code >= 0 && save_mv_code < 4) { + // Propagate the new motion vectors to a lower level + pc_tree->split[save_mv_code]->mv_ref_fulls[ref].row = new_mv_row; + pc_tree->split[save_mv_code]->mv_ref_fulls[ref].col = new_mv_col; + } else { + assert(save_mv_code == -1 && + "Unknown code in simple_motion_search_get_best_ref."); + } + } + } + + return best_ref; +} + +// Performs fullpixel simple_motion_search with LAST_FRAME and ALTREF_FRAME on +// each subblock and extract the variance and sse of residues. Then store the +// var and sse from each partition subblock to features. The DC qindex is also +// stored in features. +// Here features is assumed to be a length 19 array. 
+// After this function is called, we will store the following to features: +// features[0:17] = var and sse from subblocks +// features[18] = DC q_index +static void simple_motion_search_prune_part_features( + AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row, + int mi_col, BLOCK_SIZE bsize, float *features) { + // TODO(chiyotsai@google.com): Cache the result of the motion search from the + // larger bsize. + const int w_mi = mi_size_wide[bsize]; + const int h_mi = mi_size_high[bsize]; + int f_idx = 0; + assert(mi_size_wide[bsize] == mi_size_high[bsize]); + assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] || + cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]); + + // Setting up motion search + const int ref_list[] = { LAST_FRAME, ALTREF_FRAME }; + const int num_refs = 2; + const int use_subpixel = 1; + + unsigned int int_features[FEATURE_SIZE_SMS_PRUNE_PART - 1]; + + // Doing whole block first to update the mv + simple_motion_search_get_best_ref( + cpi, x, pc_tree, mi_row, mi_col, bsize, ref_list, num_refs, use_subpixel, + 4, &int_features[f_idx], &int_features[f_idx + 1]); + f_idx += 2; + + // Split subblocks + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + int r_idx = 0; + for (r_idx = 0; r_idx < 4; r_idx++) { + const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2; + const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2; + + simple_motion_search_get_best_ref( + cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, + use_subpixel, r_idx, &int_features[f_idx], &int_features[f_idx + 1]); + f_idx += 2; + } + + // Horz subblocks + subsize = get_partition_subsize(bsize, PARTITION_HORZ); + for (r_idx = 0; r_idx < 2; r_idx++) { + const int sub_mi_col = mi_col + 0; + const int sub_mi_row = mi_row + r_idx * h_mi / 2; + + simple_motion_search_get_best_ref( + cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, + use_subpixel, -1, &int_features[f_idx], &int_features[f_idx 
+ 1]); + + f_idx += 2; + } + + // Vert subblock + subsize = get_partition_subsize(bsize, PARTITION_VERT); + for (r_idx = 0; r_idx < 2; r_idx++) { + const int sub_mi_col = mi_col + r_idx * w_mi / 2; + const int sub_mi_row = mi_row + 0; + + simple_motion_search_get_best_ref( + cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, + use_subpixel, -1, &int_features[f_idx], &int_features[f_idx + 1]); + + f_idx += 2; + } + + aom_clear_system_state(); + for (int idx = 0; idx < f_idx; idx++) { + features[idx] = logf(1.0f + (float)int_features[idx]); + } + + const MACROBLOCKD *xd = &x->e_mbd; + set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize); + + // Q_INDEX + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f); + + // Neighbor stuff + const int has_above = !!xd->above_mbmi; + const int has_left = !!xd->left_mbmi; + const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->sb_type : bsize; + const BLOCK_SIZE left_bsize = has_left ? 
xd->left_mbmi->sb_type : bsize; + features[f_idx++] = (float)has_above; + features[f_idx++] = (float)mi_size_wide_log2[above_bsize]; + features[f_idx++] = (float)mi_size_high_log2[above_bsize]; + features[f_idx++] = (float)has_left; + features[f_idx++] = (float)mi_size_wide_log2[left_bsize]; + features[f_idx++] = (float)mi_size_high_log2[left_bsize]; + + assert(f_idx == FEATURE_SIZE_SMS_PRUNE_PART); +} + +void av1_simple_motion_search_prune_part( + AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed, + int *partition_horz_allowed, int *partition_vert_allowed, + int *do_square_split, int *do_rectangular_split, int *prune_horz, + int *prune_vert, float *features, int *valid) { + const AV1_COMMON *const cm = &cpi->common; + // Get model parameters + const NN_CONFIG *nn_config = NULL; + const float *prune_thresh = NULL, *only_thresh = NULL; + const float *ml_mean = NULL, *ml_std = NULL; + float normalized_features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f }; + + if (bsize == BLOCK_128X128) { + nn_config = &av1_simple_motion_search_prune_part_nn_config_128; + ml_mean = av1_simple_motion_search_prune_part_mean_128; + ml_std = av1_simple_motion_search_prune_part_std_128; + prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_128; + only_thresh = av1_simple_motion_search_prune_part_only_thresh_128; + } else if (bsize == BLOCK_64X64) { + nn_config = &av1_simple_motion_search_prune_part_nn_config_64; + ml_mean = av1_simple_motion_search_prune_part_mean_64; + ml_std = av1_simple_motion_search_prune_part_std_64; + prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_64; + only_thresh = av1_simple_motion_search_prune_part_only_thresh_64; + } else if (bsize == BLOCK_32X32) { + nn_config = &av1_simple_motion_search_prune_part_nn_config_32; + ml_mean = av1_simple_motion_search_prune_part_mean_32; + ml_std = av1_simple_motion_search_prune_part_std_32; + prune_thresh = 
av1_simple_motion_search_prune_part_prune_thresh_32; + only_thresh = av1_simple_motion_search_prune_part_only_thresh_32; + } else if (bsize == BLOCK_16X16) { + nn_config = &av1_simple_motion_search_prune_part_nn_config_16; + ml_mean = av1_simple_motion_search_prune_part_mean_16; + ml_std = av1_simple_motion_search_prune_part_std_16; + prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_16; + only_thresh = av1_simple_motion_search_prune_part_only_thresh_16; + } else if (bsize == BLOCK_8X8) { + nn_config = &av1_simple_motion_search_prune_part_nn_config_8; + ml_mean = av1_simple_motion_search_prune_part_mean_8; + ml_std = av1_simple_motion_search_prune_part_std_8; + prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_8; + only_thresh = av1_simple_motion_search_prune_part_only_thresh_8; + } else { + assert(0 && "Unexpected block size in simple_motion_prune_part"); + } + + // If there is no valid threshold, return immediately. + if (!nn_config || (prune_thresh[PARTITION_HORZ] == 0.0f && + prune_thresh[PARTITION_VERT] == 0.0f)) { + return; + } + if (bsize < BLOCK_8X8) { + return; + } + + // Get features + simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col, + bsize, features); + *valid = 1; + for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) { + normalized_features[f_idx] = + (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; + } + + // Get probabilities + float scores[EXT_PARTITION_TYPES] = { 0.0f }, + probs[EXT_PARTITION_TYPES] = { 0.0f }; + const int num_classes = (bsize == BLOCK_128X128 || bsize == BLOCK_8X8) + ? PARTITION_TYPES + : EXT_PARTITION_TYPES; + + av1_nn_predict(normalized_features, nn_config, scores); + aom_clear_system_state(); + + av1_nn_softmax(scores, probs, num_classes); + + // Determine if we should prune rectangular partitions. 
+ if (cpi->sf.simple_motion_search_prune_rect && !frame_is_intra_only(cm) && + (*partition_horz_allowed || *partition_vert_allowed) && + bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) { + *prune_horz = probs[PARTITION_HORZ] <= prune_thresh[PARTITION_HORZ]; + *prune_vert = probs[PARTITION_VERT] <= prune_thresh[PARTITION_VERT]; + } + + // Silence compiler warnings + (void)only_thresh; + (void)partition_none_allowed; + (void)do_square_split; + (void)do_rectangular_split; +} + +// Early terminates PARTITION_NONE using simple_motion_search features and the +// rate, distortion, and rdcost of PARTITION_NONE. This is only called when: +// - The frame is a show frame +// - The frame is not intra only +// - The current bsize is > BLOCK_8X8 +// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols +void av1_simple_motion_search_early_term_none( + AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row, + int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc, + int *early_terminate, float *simple_motion_features, + int *simple_motion_features_are_valid) { + // TODO(chiyotsai@google.com): There are other features we can extract from + // PARTITION_NONE. Play with this later. 
+ int f_idx = 0; + if (!*simple_motion_features_are_valid) { + simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col, + bsize, simple_motion_features); + *simple_motion_features_are_valid = 1; + } + f_idx = 25; + + simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->rate); + simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->dist); + simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->rdcost); + + assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE); + + const float *ml_mean = NULL; + const float *ml_std = NULL; + const float *ml_model = NULL; + + if (bsize == BLOCK_128X128) { + ml_mean = av1_simple_motion_search_term_none_mean_128; + ml_std = av1_simple_motion_search_term_none_std_128; + ml_model = av1_simple_motion_search_term_none_model_128; + } else if (bsize == BLOCK_64X64) { + ml_mean = av1_simple_motion_search_term_none_mean_64; + ml_std = av1_simple_motion_search_term_none_std_64; + ml_model = av1_simple_motion_search_term_none_model_64; + } else if (bsize == BLOCK_32X32) { + ml_mean = av1_simple_motion_search_term_none_mean_32; + ml_std = av1_simple_motion_search_term_none_std_32; + ml_model = av1_simple_motion_search_term_none_model_32; + } else if (bsize == BLOCK_16X16) { + ml_mean = av1_simple_motion_search_term_none_mean_16; + ml_std = av1_simple_motion_search_term_none_std_16; + ml_model = av1_simple_motion_search_term_none_model_16; + } else { + assert(0 && "Unexpected block size in simple_motion_term_none"); + } + + if (ml_model) { + float score = 0.0f; + for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) { + score += ml_model[f_idx] * + (simple_motion_features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; + } + score += ml_model[FEATURE_SIZE_SMS_TERM_NONE]; + + if (score >= 0.0f) { + *early_terminate = 1; + } + } +} + +static void firstpass_simple_motion_search_features( + AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row, + int mi_col, BLOCK_SIZE bsize, float *features) { + 
assert(mi_size_wide[bsize] == mi_size_high[bsize]); + assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] || + cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]); + + // Setting up motion search + const int ref_list[] = { LAST_FRAME, ALTREF_FRAME }; + const int num_refs = 2; + const int use_subpixel = 0; + + unsigned int int_features[10] = { 0 }; + + int f_idx = 0; + // Doing whole block first to update the mv + simple_motion_search_get_best_ref( + cpi, x, pc_tree, mi_row, mi_col, bsize, ref_list, num_refs, use_subpixel, + 4, &int_features[f_idx], &int_features[f_idx + 1]); + f_idx += 2; + + // Split subblocks + const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT); + const int w_mi = mi_size_wide[bsize]; + const int h_mi = mi_size_high[bsize]; + for (int r_idx = 0; r_idx < 4; r_idx++) { + const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2; + const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2; + + simple_motion_search_get_best_ref( + cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs, + use_subpixel, r_idx, &int_features[f_idx], &int_features[f_idx + 1]); + f_idx += 2; + } + + aom_clear_system_state(); + for (int idx = 0; idx < f_idx; idx++) { + features[idx] = logf(1.0f + (float)int_features[idx]); + } + + const MACROBLOCKD *xd = &x->e_mbd; + set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize); + + // Q_INDEX + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f); + + // Neighbor stuff + const int has_above = !!xd->above_mbmi; + const int has_left = !!xd->left_mbmi; + const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->sb_type : bsize; + const BLOCK_SIZE left_bsize = has_left ? 
xd->left_mbmi->sb_type : bsize; + features[f_idx++] = (float)has_above; + features[f_idx++] = (float)mi_size_wide_log2[above_bsize]; + features[f_idx++] = (float)mi_size_high_log2[above_bsize]; + features[f_idx++] = (float)has_left; + features[f_idx++] = (float)mi_size_wide_log2[left_bsize]; + features[f_idx++] = (float)mi_size_high_log2[left_bsize]; +} + +void av1_firstpass_simple_motion_search_early_term(AV1_COMP *const cpi, + MACROBLOCK *x, + PC_TREE *pc_tree, int mi_row, + int mi_col, BLOCK_SIZE bsize, + const RD_STATS *none_rdc, + int *do_square_split) { + const NN_CONFIG *nn_config = NULL; + float thresh = 0.0f; + const float *ml_mean = NULL, *ml_std = NULL; + if (bsize == BLOCK_32X32) { + nn_config = &av1_fp_simple_motion_search_term_none_nn_config_32; + ml_mean = av1_fp_simple_motion_search_term_none_mean_32; + ml_std = av1_fp_simple_motion_search_term_none_std_32; + thresh = av1_fp_simple_motion_search_term_none_thresh_32; + } else if (bsize == BLOCK_16X16) { + nn_config = &av1_fp_simple_motion_search_term_none_nn_config_16; + ml_mean = av1_fp_simple_motion_search_term_none_mean_16; + ml_std = av1_fp_simple_motion_search_term_none_std_16; + thresh = av1_fp_simple_motion_search_term_none_thresh_16; + } else if (bsize == BLOCK_8X8) { + nn_config = &av1_fp_simple_motion_search_term_none_nn_config_8; + ml_mean = av1_fp_simple_motion_search_term_none_mean_8; + ml_std = av1_fp_simple_motion_search_term_none_std_8; + thresh = av1_fp_simple_motion_search_term_none_thresh_8; + } else { + assert(0 && + "Unexpected bsize in firstpass_simple_motion_search_early_term"); + return; + } + + float ml_features[FEATURE_SIZE_FP_SMS_TERM_NONE] = { 0.0f }; + + firstpass_simple_motion_search_features(cpi, x, pc_tree, mi_row, mi_col, + bsize, ml_features); + int f_idx = 17; + + ml_features[f_idx++] = logf(1.0f + (float)none_rdc->rate); + ml_features[f_idx++] = logf(1.0f + (float)none_rdc->dist); + ml_features[f_idx++] = logf(1.0f + (float)none_rdc->rdcost); + + for (f_idx = 0; 
f_idx < 20; f_idx++) { + ml_features[f_idx] = (ml_features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx]; + } + + // Get probabilities + float score = 0.0f; + + av1_nn_predict(ml_features, nn_config, &score); + aom_clear_system_state(); + + // Determine if we should prune square partitions. + if (score < thresh) { + *do_square_split = 0; + } +} + +void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, + int mi_row, int mi_col, + float *features) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + const BLOCK_SIZE sb_size = cm->seq_params.sb_size; + + assert(sb_size == BLOCK_128X128); + + int f_idx = 0; + + const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8); + aom_clear_system_state(); + const float log_q_sq = logf(1.0f + (float)(dc_q * dc_q) / 256.0f); + + // Perform full-pixel single motion search in Y plane of 16x16 mbs in the sb + float sum_mv_row_sq = 0; + float sum_mv_row = 0; + float min_abs_mv_row = FLT_MAX; + float max_abs_mv_row = 0; + + float sum_mv_col_sq = 0; + float sum_mv_col = 0; + float min_abs_mv_col = FLT_MAX; + float max_abs_mv_col = 0; + + float sum_log_sse_sq = 0; + float sum_log_sse = 0; + float min_log_sse = FLT_MAX; + float max_log_sse = 0; + + const BLOCK_SIZE mb_size = BLOCK_16X16; + const int mb_rows = block_size_high[sb_size] / block_size_high[mb_size]; + const int mb_cols = block_size_wide[sb_size] / block_size_wide[mb_size]; + const int mb_in_mi_size_high_log2 = mi_size_high_log2[mb_size]; + const int mb_in_mi_size_wide_log2 = mi_size_wide_log2[mb_size]; + + for (int mb_row = 0; mb_row < mb_rows; mb_row++) + for (int mb_col = 0; mb_col < mb_cols; mb_col++) { + const int this_mi_row = mi_row + (mb_row << mb_in_mi_size_high_log2); + const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2); + unsigned int sse = 0; + unsigned int var = 0; + const MV ref_mv_full = { .row = 0, .col = 0 }; + + av1_simple_motion_sse_var(cpi, x, this_mi_row, this_mi_col, mb_size, + 
ref_mv_full, 0, &sse, &var); + + aom_clear_system_state(); + const float mv_row = (float)(x->best_mv.as_mv.row / 8); + const float mv_col = (float)(x->best_mv.as_mv.col / 8); + const float log_sse = logf(1.0f + (float)sse); + const float abs_mv_row = fabsf(mv_row); + const float abs_mv_col = fabsf(mv_col); + + sum_mv_row_sq += mv_row * mv_row; + sum_mv_row += mv_row; + sum_mv_col_sq += mv_col * mv_col; + sum_mv_col += mv_col; + + if (abs_mv_row < min_abs_mv_row) min_abs_mv_row = abs_mv_row; + if (abs_mv_row > max_abs_mv_row) max_abs_mv_row = abs_mv_row; + if (abs_mv_col < min_abs_mv_col) min_abs_mv_col = abs_mv_col; + if (abs_mv_col > max_abs_mv_col) max_abs_mv_col = abs_mv_col; + + sum_log_sse_sq += log_sse * log_sse; + sum_log_sse += log_sse; + if (log_sse < min_log_sse) min_log_sse = log_sse; + if (log_sse > max_log_sse) max_log_sse = log_sse; + } + aom_clear_system_state(); + const float avg_mv_row = sum_mv_row / 64.0f; + const float var_mv_row = sum_mv_row_sq / 64.0f - avg_mv_row * avg_mv_row; + + const float avg_mv_col = sum_mv_col / 64.0f; + const float var_mv_col = sum_mv_col_sq / 64.0f - avg_mv_col * avg_mv_col; + + const float avg_log_sse = sum_log_sse / 64.0f; + const float var_log_sse = sum_log_sse_sq / 64.0f - avg_log_sse * avg_log_sse; + + features[f_idx++] = avg_log_sse; + features[f_idx++] = avg_mv_col; + features[f_idx++] = avg_mv_row; + features[f_idx++] = log_q_sq; + features[f_idx++] = max_abs_mv_col; + features[f_idx++] = max_abs_mv_row; + features[f_idx++] = max_log_sse; + features[f_idx++] = min_abs_mv_col; + features[f_idx++] = min_abs_mv_row; + features[f_idx++] = min_log_sse; + features[f_idx++] = var_log_sse; + features[f_idx++] = var_mv_col; + features[f_idx++] = var_mv_row; + + assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED); +} + +BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x, + const float *features) { + float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }, + 
probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f }; + const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config; + + assert(cpi->sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE); + + aom_clear_system_state(); + av1_nn_predict(features, nn_config, scores); + av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED); + + int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; + if (cpi->sf.auto_max_partition_based_on_simple_motion == DIRECT_PRED) { + result = 0; + float max_prob = probs[0]; + for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) { + if (probs[i] > max_prob) { + max_prob = probs[i]; + result = i; + } + } + } else if (cpi->sf.auto_max_partition_based_on_simple_motion == + RELAXED_PRED) { + for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; + --result) { + if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { + probs[result] += probs[result + 1]; + } + if (probs[result] > 0.2) break; + } + } else if (cpi->sf.auto_max_partition_based_on_simple_motion == ADAPT_PRED) { + const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size; + MACROBLOCKD *const xd = &x->e_mbd; + // TODO(debargha): x->source_variance is unavailable at this point, + // so compute. The redundant recomputation later can be removed. + const unsigned int source_variance = + is_cur_buf_hbd(xd) + ? av1_high_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size, + xd->bd) + : av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size); + if (source_variance > 16) { + const double thresh = source_variance < 128 ? 
0.05 : 0.1; + for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0; + --result) { + if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) { + probs[result] += probs[result + 1]; + } + if (probs[result] > thresh) break; + } + } + } + + return (BLOCK_SIZE)((result + 2) * 3); +} diff --git a/libaom/av1/encoder/partition_strategy.h b/libaom/av1/encoder/partition_strategy.h new file mode 100644 index 0000000..36b1e95 --- /dev/null +++ b/libaom/av1/encoder/partition_strategy.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ +#define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ + +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/encodemb.h" +#include "av1/encoder/encoder.h" + +#define FEATURE_SIZE_SMS_PRUNE_PART 25 +#define FEATURE_SIZE_SMS_TERM_NONE 28 +#define FEATURE_SIZE_FP_SMS_TERM_NONE 20 +#define FEATURE_SIZE_MAX_MIN_PART_PRED 13 +#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4 + +// Performs a simple_motion_search with a single reference frame and extract +// the variance of residues. 
Then use the features to determine whether we want +// to go straight to splitting without trying PARTITION_NONE +void av1_simple_motion_search_based_split( + AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col, + BLOCK_SIZE bsize, int *partition_none_allowed, int *partition_horz_allowed, + int *partition_vert_allowed, int *do_rectangular_split, + int *do_square_split); + +// Performs a simple_motion_search with two reference frames and extract +// the variance of residues. Then use the features to determine whether we want +// to prune some partitions. +void av1_simple_motion_search_prune_part( + AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed, + int *partition_horz_allowed, int *partition_vert_allowed, + int *do_square_split, int *do_rectangular_split, int *prune_horz, + int *prune_vert, float *features, int *valid); + +// Early terminates PARTITION_NONE using simple_motion_search features and the +// rate, distortion, and rdcost of PARTITION_NONE. This is only called when: +// - The frame is a show frame +// - The frame is not intra only +// - The current bsize is > BLOCK_8X8 +// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols +void av1_simple_motion_search_early_term_none( + AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row, + int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc, + int *early_terminate, float *simple_motion_features, + int *simple_motion_features_are_valid); + +// Early terminates after PARTITION_NONE in firstpass of two pass partition +// search. +void av1_firstpass_simple_motion_search_early_term(AV1_COMP *const cpi, + MACROBLOCK *x, + PC_TREE *pc_tree, int mi_row, + int mi_col, BLOCK_SIZE bsize, + const RD_STATS *none_rdc, + int *do_square_split); + +// Get the features for selecting the max and min partition size. 
Currently this +// performs simple_motion_search on 16X16 subblocks of the current superblock, +// and then extracts the statistics of sse and motion vectors as features. +void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x, + int mi_row, int mi_col, + float *features); + +// Predict the maximum BLOCK_SIZE to be used to encode the current superblock. +BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x, + const float *features); + +// A simplified version of set_offsets meant to be used for +// simple_motion_search. +static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi, + MACROBLOCK *const x, + int mi_row, int mi_col, + BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + + set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); + + // Set up destination pointers. + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0, + num_planes); + + // Set up limit values for MV components. + // Mv beyond the range do not produce new/different prediction block. + x->mv_limits.row_min = + -(((mi_row + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND); + x->mv_limits.col_min = -(((mi_col + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND); + x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND; + x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND; + + set_plane_n4(xd, mi_width, mi_height, num_planes); + + // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1))); + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - mi_height - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - mi_width - mi_col) * MI_SIZE) * 8; + + // Set up source buffers. + av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize); + + // R/D setup. + x->rdmult = cpi->rd.RDMULT; +} + +static INLINE void init_simple_motion_search_mvs(PC_TREE *pc_tree) { + for (int idx = 0; idx < REF_FRAMES; idx++) { + pc_tree->mv_ref_fulls[idx].row = 0; + pc_tree->mv_ref_fulls[idx].col = 0; + } + if (pc_tree->block_size >= BLOCK_8X8) { + init_simple_motion_search_mvs(pc_tree->split[0]); + init_simple_motion_search_mvs(pc_tree->split[1]); + init_simple_motion_search_mvs(pc_tree->split[2]); + init_simple_motion_search_mvs(pc_tree->split[3]); + } +} + +static INLINE int is_full_sb(AV1_COMMON *const cm, int mi_row, int mi_col, + BLOCK_SIZE sb_size) { + const int sb_mi_wide = mi_size_wide[sb_size]; + const int sb_mi_high = mi_size_high[sb_size]; + + return (mi_row + sb_mi_high) <= cm->mi_rows && + (mi_col + sb_mi_wide) <= cm->mi_cols; +} + +static INLINE int use_auto_max_partition(AV1_COMP *const cpi, + BLOCK_SIZE sb_size, int mi_row, + int mi_col) { + AV1_COMMON *const cm = &cpi->common; + + return !frame_is_intra_only(cm) && + cpi->sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE && + sb_size == BLOCK_128X128 && is_full_sb(cm, mi_row, mi_col, sb_size) && + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index] != + OVERLAY_UPDATE && + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index] != + INTNL_OVERLAY_UPDATE; +} + +#endif // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_ diff --git a/libaom/av1/encoder/pass2_strategy.c b/libaom/av1/encoder/pass2_strategy.c new file mode 100644 index 0000000..ac22b68 --- /dev/null +++ b/libaom/av1/encoder/pass2_strategy.c @@ -0,0 +1,1787 
@@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/aom_scale_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom/aom_encoder.h" + +#include "aom_ports/system_state.h" + +#include "av1/common/onyxc_int.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/firstpass.h" +#include "av1/encoder/gop_structure.h" + +// Calculate an active area of the image that discounts formatting +// bars and partially discounts other 0 energy areas. +#define MIN_ACTIVE_AREA 0.5 +#define MAX_ACTIVE_AREA 1.0 +double calculate_active_area(const AV1_COMP *cpi, + const FIRSTPASS_STATS *this_frame) { + double active_pct; + + active_pct = + 1.0 - + ((this_frame->intra_skip_pct / 2) + + ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows)); + return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); +} + +// Calculate a modified Error used in distributing bits between easier and +// harder frames. 
+#define ACT_AREA_CORRECTION 0.5 +double calculate_modified_err(const AV1_COMP *cpi, const TWO_PASS *twopass, + const AV1EncoderConfig *oxcf, + const FIRSTPASS_STATS *this_frame) { + const FIRSTPASS_STATS *const stats = &twopass->total_stats; + const double av_weight = stats->weight / stats->count; + const double av_err = (stats->coded_error * av_weight) / stats->count; + double modified_error = + av_err * pow(this_frame->coded_error * this_frame->weight / + DOUBLE_DIVIDE_CHECK(av_err), + oxcf->two_pass_vbrbias / 100.0); + + // Correction for active area. Frames with a reduced active area + // (eg due to formatting bars) have a higher error per mb for the + // remaining active MBs. The correction here assumes that coding + // 0.5N blocks of complexity 2X is a little easier than coding N + // blocks of complexity X. + modified_error *= + pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION); + + return fclamp(modified_error, twopass->modified_error_min, + twopass->modified_error_max); +} + +// Resets the first pass file to the given position using a relative seek from +// the current position. +static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) { + p->stats_in = position; +} + +static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) { + if (p->stats_in >= p->stats_in_end) return EOF; + + *fps = *p->stats_in; + ++p->stats_in; + return 1; +} + +// Read frame stats at an offset from the current position. 
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) { + if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) || + (offset < 0 && p->stats_in + offset < p->stats_in_start)) { + return NULL; + } + + return &p->stats_in[offset]; +} + +static void subtract_stats(FIRSTPASS_STATS *section, + const FIRSTPASS_STATS *frame) { + section->frame -= frame->frame; + section->weight -= frame->weight; + section->intra_error -= frame->intra_error; + section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy; + section->coded_error -= frame->coded_error; + section->sr_coded_error -= frame->sr_coded_error; + section->pcnt_inter -= frame->pcnt_inter; + section->pcnt_motion -= frame->pcnt_motion; + section->pcnt_second_ref -= frame->pcnt_second_ref; + section->pcnt_neutral -= frame->pcnt_neutral; + section->intra_skip_pct -= frame->intra_skip_pct; + section->inactive_zone_rows -= frame->inactive_zone_rows; + section->inactive_zone_cols -= frame->inactive_zone_cols; + section->MVr -= frame->MVr; + section->mvr_abs -= frame->mvr_abs; + section->MVc -= frame->MVc; + section->mvc_abs -= frame->mvc_abs; + section->MVrv -= frame->MVrv; + section->MVcv -= frame->MVcv; + section->mv_in_out_count -= frame->mv_in_out_count; + section->new_mv_count -= frame->new_mv_count; + section->count -= frame->count; + section->duration -= frame->duration; +} + +// Calculate the linear size relative to a baseline of 1080P +#define BASE_SIZE 2073600.0 // 1920x1080 +static double get_linear_size_factor(const AV1_COMP *cpi) { + const double this_area = cpi->initial_width * cpi->initial_height; + return pow(this_area / BASE_SIZE, 0.5); +} + +// This function returns the maximum target rate per frame. 
+static int frame_max_bits(const RATE_CONTROL *rc, + const AV1EncoderConfig *oxcf) { + int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth * + (int64_t)oxcf->two_pass_vbrmax_section) / + 100; + if (max_bits < 0) + max_bits = 0; + else if (max_bits > rc->max_frame_bandwidth) + max_bits = rc->max_frame_bandwidth; + + return (int)max_bits; +} + +static double calc_correction_factor(double err_per_mb, double err_divisor, + double pt_low, double pt_high, int q, + aom_bit_depth_t bit_depth) { + const double error_term = err_per_mb / err_divisor; + + // Adjustment based on actual quantizer to power term. + const double power_term = + AOMMIN(av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high); + + // Calculate correction factor. + if (power_term < 1.0) assert(error_term >= 0.0); + + return fclamp(pow(error_term, power_term), 0.05, 5.0); +} + +#define ERR_DIVISOR 100.0 +#define FACTOR_PT_LOW 0.70 +#define FACTOR_PT_HIGH 0.90 + +// Similar to find_qindex_by_rate() function in ratectrl.c, but includes +// calculation of a correction_factor. 
+static int find_qindex_by_rate_with_correction( + int desired_bits_per_mb, aom_bit_depth_t bit_depth, FRAME_TYPE frame_type, + double error_per_mb, double ediv_size_correction, + double group_weight_factor, int best_qindex, int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const double mid_factor = + calc_correction_factor(error_per_mb, ERR_DIVISOR - ediv_size_correction, + FACTOR_PT_LOW, FACTOR_PT_HIGH, mid, bit_depth); + const int mid_bits_per_mb = av1_rc_bits_per_mb( + frame_type, mid, mid_factor * group_weight_factor, bit_depth); + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } +#if CONFIG_DEBUG + assert(low == high); + const double low_factor = + calc_correction_factor(error_per_mb, ERR_DIVISOR - ediv_size_correction, + FACTOR_PT_LOW, FACTOR_PT_HIGH, low, bit_depth); + const int low_bits_per_mb = av1_rc_bits_per_mb( + frame_type, low, low_factor * group_weight_factor, bit_depth); + assert(low_bits_per_mb <= desired_bits_per_mb || low == worst_qindex); +#endif // CONFIG_DEBUG + return low; +} + +static int get_twopass_worst_quality(const AV1_COMP *cpi, + const double section_err, + double inactive_zone, + int section_target_bandwidth, + double group_weight_factor) { + const RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + + inactive_zone = fclamp(inactive_zone, 0.0, 1.0); + + if (section_target_bandwidth <= 0) { + return rc->worst_quality; // Highest value allowed + } else { + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) + ? 
cpi->initial_mbs + : cpi->common.MBs; + const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone)); + const double av_err_per_mb = section_err / active_mbs; + const int target_norm_bits_per_mb = + (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) / + active_mbs; + + // Larger image formats are expected to be a little harder to code + // relatively given the same prediction error score. This in part at + // least relates to the increased size and hence coding overheads of + // motion vectors. Some account of this is made through adjustment of + // the error divisor. + double ediv_size_correction = + AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi))); + if (ediv_size_correction < 1.0) + ediv_size_correction = -(1.0 / ediv_size_correction); + ediv_size_correction *= 4.0; + + // Try and pick a max Q that will be high enough to encode the + // content at the given rate. + int q = find_qindex_by_rate_with_correction( + target_norm_bits_per_mb, cpi->common.seq_params.bit_depth, INTER_FRAME, + av_err_per_mb, ediv_size_correction, group_weight_factor, + rc->best_quality, rc->worst_quality); + + // Restriction on active max q for constrained quality mode. + if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level); + return q; + } +} + +#define SR_DIFF_PART 0.0015 +#define MOTION_AMP_PART 0.003 +#define INTRA_PART 0.005 +#define DEFAULT_DECAY_LIMIT 0.75 +#define LOW_SR_DIFF_TRHESH 0.1 +#define SR_DIFF_MAX 128.0 +#define NCOUNT_FRAME_II_THRESH 5.0 + +static double get_sr_decay_rate(const AV1_COMP *cpi, + const FIRSTPASS_STATS *frame) { + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? 
cpi->initial_mbs + : cpi->common.MBs; + double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs; + double sr_decay = 1.0; + double modified_pct_inter; + double modified_pcnt_intra; + const double motion_amplitude_factor = + frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2); + + modified_pct_inter = frame->pcnt_inter; + if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) < + (double)NCOUNT_FRAME_II_THRESH) { + modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral; + } + modified_pcnt_intra = 100 * (1.0 - modified_pct_inter); + + if ((sr_diff > LOW_SR_DIFF_TRHESH)) { + sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX); + sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - + (MOTION_AMP_PART * motion_amplitude_factor) - + (INTRA_PART * modified_pcnt_intra); + } + return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); +} + +// This function gives an estimate of how badly we believe the prediction +// quality is decaying from frame to frame. +static double get_zero_motion_factor(const AV1_COMP *cpi, + const FIRSTPASS_STATS *frame) { + const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion; + double sr_decay = get_sr_decay_rate(cpi, frame); + return AOMMIN(sr_decay, zero_motion_pct); +} + +#define ZM_POWER_FACTOR 0.75 + +static double get_prediction_decay_rate(const AV1_COMP *cpi, + const FIRSTPASS_STATS *next_frame) { + const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame); + const double zero_motion_factor = + (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion), + ZM_POWER_FACTOR)); + + return AOMMAX(zero_motion_factor, + (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor))); +} + +// Function to test for a condition where a complex transition is followed +// by a static section. For example in slide shows where there is a fade +// between slides. This is to help with more optimal kf and gf positioning. 
+static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval, + int still_interval, + double loop_decay_rate, + double last_decay_rate) { + TWO_PASS *const twopass = &cpi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + + // Break clause to detect very still sections after motion + // For example a static image after a fade or other transition + // instead of a clean scene cut. + if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 && + last_decay_rate < 0.9) { + int j; + + // Look ahead a few frames to see if static condition persists... + for (j = 0; j < still_interval; ++j) { + const FIRSTPASS_STATS *stats = &twopass->stats_in[j]; + if (stats >= twopass->stats_in_end) break; + + if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break; + } + + // Only if it does do we signal a transition to still. + return j == still_interval; + } + + return 0; +} + +// This function detects a flash through the high relative pcnt_second_ref +// score in the frame following a flash frame. The offset passed in should +// reflect this. +static int detect_flash(const TWO_PASS *twopass, int offset) { + const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset); + + // What we are looking for here is a situation where there is a + // brief break in prediction (such as a flash) but subsequent frames + // are reasonably well predicted by an earlier (pre flash) frame. + // The recovery after a flash is indicated by a high pcnt_second_ref + // compared to pcnt_inter. + return next_frame != NULL && + next_frame->pcnt_second_ref > next_frame->pcnt_inter && + next_frame->pcnt_second_ref >= 0.5; +} + +// Update the motion related elements to the GF arf boost calculation. 
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, + double *mv_in_out, + double *mv_in_out_accumulator, + double *abs_mv_in_out_accumulator, + double *mv_ratio_accumulator) { + const double pct = stats->pcnt_motion; + + // Accumulate Motion In/Out of frame stats. + *mv_in_out = stats->mv_in_out_count * pct; + *mv_in_out_accumulator += *mv_in_out; + *abs_mv_in_out_accumulator += fabs(*mv_in_out); + + // Accumulate a measure of how uniform (or conversely how random) the motion + // field is (a ratio of abs(mv) / mv). + if (pct > 0.05) { + const double mvr_ratio = + fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr)); + const double mvc_ratio = + fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc)); + + *mv_ratio_accumulator += + pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs); + *mv_ratio_accumulator += + pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs); + } +} + +#define BASELINE_ERR_PER_MB 1000.0 +#define BOOST_FACTOR 12.5 + +static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame, + double this_frame_mv_in_out, double max_boost) { + double frame_boost; + const double lq = av1_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth); + const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5); + int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : cpi->common.MBs; + + // Correct for any inactive region in the image + num_mbs = (int)AOMMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); + + // Underlying boost factor is based on inter error ratio. + frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error); + frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction; + + // Increase boost for frames where new data coming into frame (e.g. zoom out). + // Slightly reduce boost if there is a net balance of motion out of the frame + // (zoom in). 
The range for this_frame_mv_in_out is -1.0 to +1.0. + if (this_frame_mv_in_out > 0.0) + frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); + // In the extreme case the boost is halved. + else + frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); + + return AOMMIN(frame_boost, max_boost * boost_q_correction); +} + +#define GF_MAX_BOOST 90.0 +#define MIN_ARF_GF_BOOST 240 +#define MIN_DECAY_FACTOR 0.01 + +static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames, + int *f_boost, int *b_boost) { + TWO_PASS *const twopass = &cpi->twopass; + int i; + double boost_score = 0.0; + double mv_ratio_accumulator = 0.0; + double decay_accumulator = 1.0; + double this_frame_mv_in_out = 0.0; + double mv_in_out_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; + int arf_boost; + int flash_detected = 0; + + // Search forward from the proposed arf/next gf position. + for (i = 0; i < f_frames; ++i) { + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + if (this_frame == NULL) break; + + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats( + this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + // We want to discount the flash frame itself and the recovery + // frame that follows as both will have poor scores. + flash_detected = detect_flash(twopass, i + offset) || + detect_flash(twopass, i + offset + 1); + + // Accumulate the effect of prediction quality decay. + if (!flash_detected) { + decay_accumulator *= get_prediction_decay_rate(cpi, this_frame); + decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR + ? MIN_DECAY_FACTOR + : decay_accumulator; + } + + boost_score += + decay_accumulator * + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + } + + *f_boost = (int)boost_score; + + // Reset for backward looking loop. 
+ boost_score = 0.0; + mv_ratio_accumulator = 0.0; + decay_accumulator = 1.0; + this_frame_mv_in_out = 0.0; + mv_in_out_accumulator = 0.0; + abs_mv_in_out_accumulator = 0.0; + + // Search backward towards last gf position. + for (i = -1; i >= -b_frames; --i) { + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + if (this_frame == NULL) break; + + // Update the motion related elements to the boost calculation. + accumulate_frame_motion_stats( + this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + + // We want to discount the the flash frame itself and the recovery + // frame that follows as both will have poor scores. + flash_detected = detect_flash(twopass, i + offset) || + detect_flash(twopass, i + offset + 1); + + // Cumulative effect of prediction quality decay. + if (!flash_detected) { + decay_accumulator *= get_prediction_decay_rate(cpi, this_frame); + decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR + ? MIN_DECAY_FACTOR + : decay_accumulator; + } + + boost_score += + decay_accumulator * + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + } + *b_boost = (int)boost_score; + + arf_boost = (*f_boost + *b_boost); + if (arf_boost < ((b_frames + f_frames) * 20)) + arf_boost = ((b_frames + f_frames) * 20); + arf_boost = AOMMAX(arf_boost, MIN_ARF_GF_BOOST); + + return arf_boost; +} + +// Calculate a section intra ratio used in setting max loop filter. +static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, + const FIRSTPASS_STATS *end, + int section_length) { + const FIRSTPASS_STATS *s = begin; + double intra_error = 0.0; + double coded_error = 0.0; + int i = 0; + + while (s < end && i < section_length) { + intra_error += s->intra_error; + coded_error += s->coded_error; + ++s; + ++i; + } + + return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error)); +} + +// Calculate the total bits to allocate in this GF/ARF group. 
+static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi, + double gf_group_err) { + const RATE_CONTROL *const rc = &cpi->rc; + const TWO_PASS *const twopass = &cpi->twopass; + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + int64_t total_group_bits; + + // Calculate the bits to be allocated to the group as a whole. + if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) { + total_group_bits = (int64_t)(twopass->kf_group_bits * + (gf_group_err / twopass->kf_group_error_left)); + } else { + total_group_bits = 0; + } + + // Clamp odd edge cases. + total_group_bits = (total_group_bits < 0) + ? 0 + : (total_group_bits > twopass->kf_group_bits) + ? twopass->kf_group_bits + : total_group_bits; + + // Clip based on user supplied data rate variability limit. + if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) + total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + + return total_group_bits; +} + +// Calculate the number bits extra to assign to boosted frames in a group. +static int calculate_boost_bits(int frame_count, int boost, + int64_t total_group_bits) { + int allocation_chunks; + + // return 0 for invalid inputs (could arise e.g. through rounding errors) + if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0; + + allocation_chunks = (frame_count * 100) + boost; + + // Prevent overflow. + if (boost > 1023) { + int divisor = boost >> 10; + boost /= divisor; + allocation_chunks /= divisor; + } + + // Calculate the number of extra bits for use in the boosted frame or frames. 
+ return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks), + 0); +} + +#define LEAF_REDUCTION_FACTOR 0.75 +static double lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = { + { 1.0, 0.0, 0.0 }, { 0.6, 0.4, 0 }, { 0.45, 0.35, 0.20 } +}; +static void allocate_gf_group_bits( + AV1_COMP *cpi, int64_t gf_group_bits, double group_error, int gf_arf_bits, + const EncodeFrameParams *const frame_params) { + RATE_CONTROL *const rc = &cpi->rc; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const int key_frame = (frame_params->frame_type == KEY_FRAME); + const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); + int64_t total_group_bits = gf_group_bits; + + // Check if GF group has any internal arfs. + int has_internal_arfs = 0; + for (int i = 0; i < gf_group->size; ++i) { + if (gf_group->update_type[i] == INTNL_ARF_UPDATE) { + has_internal_arfs = 1; + break; + } + } + + // For key frames the frame target rate is already set and it + // is also the golden frame. + // === [frame_index == 0] === + int frame_index = 0; + if (!key_frame) { + if (rc->source_alt_ref_active) + gf_group->bit_allocation[frame_index] = 0; + else + gf_group->bit_allocation[frame_index] = gf_arf_bits; + + // Step over the golden frame / overlay frame + FIRSTPASS_STATS frame_stats; + if (EOF == input_stats(twopass, &frame_stats)) return; + } + + // Deduct the boost bits for arf (or gf if it is not a key frame) + // from the group total. + if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits; + + frame_index++; + + // Store the bits to spend on the ARF if there is one. + // === [frame_index == 1] === + if (rc->source_alt_ref_pending) { + gf_group->bit_allocation[frame_index] = gf_arf_bits; + + ++frame_index; + + // Skip all the internal ARFs right after ARF at the starting segment of + // the current GF group. 
+ if (has_internal_arfs) { + while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) { + ++frame_index; + } + } + } + + // Save. + const int tmp_frame_index = frame_index; + int budget_reduced_from_leaf_level = 0; + + // Allocate bits to frames other than first frame, which is either a keyframe, + // overlay frame or golden frame. + const int normal_frames = rc->baseline_gf_interval - 1; + + for (int i = 0; i < normal_frames; ++i) { + FIRSTPASS_STATS frame_stats; + if (EOF == input_stats(twopass, &frame_stats)) break; + + const double modified_err = + calculate_modified_err(cpi, twopass, oxcf, &frame_stats); + const double err_fraction = + (group_error > 0) ? modified_err / DOUBLE_DIVIDE_CHECK(group_error) + : 0.0; + const int target_frame_size = + clamp((int)((double)total_group_bits * err_fraction), 0, + AOMMIN(max_bits, (int)total_group_bits)); + + if (gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) { + assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL && + "non-valid height for a pyramid structure"); + + const int arf_pos = gf_group->arf_pos_in_gf[frame_index]; + gf_group->bit_allocation[frame_index] = 0; + + gf_group->bit_allocation[arf_pos] = target_frame_size; + // Note: Boost, if needed, is added in the next loop. + } else { + assert(gf_group->update_type[frame_index] == LF_UPDATE); + gf_group->bit_allocation[frame_index] = target_frame_size; + if (has_internal_arfs) { + const int this_budget_reduction = + (int)(target_frame_size * LEAF_REDUCTION_FACTOR); + gf_group->bit_allocation[frame_index] -= this_budget_reduction; + budget_reduced_from_leaf_level += this_budget_reduction; + } + } + + ++frame_index; + + // Skip all the internal ARFs. + if (has_internal_arfs) { + while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) + ++frame_index; + } + } + + if (budget_reduced_from_leaf_level > 0) { + assert(has_internal_arfs); + // Restore. 
+ frame_index = tmp_frame_index; + + // Re-distribute this extra budget to overlay frames in the group. + for (int i = 0; i < normal_frames; ++i) { + if (gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) { + assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL && + "non-valid height for a pyramid structure"); + const int arf_pos = gf_group->arf_pos_in_gf[frame_index]; + const int this_lvl = gf_group->pyramid_level[arf_pos]; + const int dist2top = gf_group->pyramid_height - 1 - this_lvl; + const double lvl_boost_factor = + lvl_budget_factor[gf_group->pyramid_height - 2][dist2top]; + const int extra_size = + (int)(budget_reduced_from_leaf_level * lvl_boost_factor / + gf_group->pyramid_lvl_nodes[this_lvl]); + gf_group->bit_allocation[arf_pos] += extra_size; + } + ++frame_index; + + // Skip all the internal ARFs. + if (has_internal_arfs) { + while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) { + ++frame_index; + } + } + } + } +} + +// Given the maximum allowed height of the pyramid structure, return the fixed +// GF length to be used. +static INLINE int get_fixed_gf_length(int max_pyr_height) { + (void)max_pyr_height; + return MAX_GF_INTERVAL; +} + +// Returns true if KF group and GF group both are almost completely static. +static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) { + return (gf_zero_motion >= 0.995) && + (kf_zero_motion >= STATIC_KF_GROUP_THRESH); +} + +#define ARF_ABS_ZOOM_THRESH 4.4 +#define GROUP_ADAPTIVE_MAXQ 1 +#if GROUP_ADAPTIVE_MAXQ +#define RC_FACTOR_MIN 0.75 +#define RC_FACTOR_MAX 1.75 +#endif // GROUP_ADAPTIVE_MAXQ +#define MIN_FWD_KF_INTERVAL 8 + +// Analyse and define a gf/arf group. 
+static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame, + const EncodeFrameParams *const frame_params) { + AV1_COMMON *const cm = &cpi->common; + RATE_CONTROL *const rc = &cpi->rc; + AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS next_frame; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + int i; + + double boost_score = 0.0; + double gf_group_err = 0.0; +#if GROUP_ADAPTIVE_MAXQ + double gf_group_raw_error = 0.0; +#endif + double gf_group_skip_pct = 0.0; + double gf_group_inactive_zone_rows = 0.0; + double gf_first_frame_err = 0.0; + double mod_frame_err = 0.0; + + double mv_ratio_accumulator = 0.0; + double decay_accumulator = 1.0; + double zero_motion_accumulator = 1.0; + + double loop_decay_rate = 1.00; + double last_loop_decay_rate = 1.00; + + double this_frame_mv_in_out = 0.0; + double mv_in_out_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; + + unsigned int allow_alt_ref = is_altref_enabled(cpi); + + int f_boost = 0; + int b_boost = 0; + int flash_detected; + int64_t gf_group_bits; + double gf_group_error_left; + int gf_arf_bits; + const int is_intra_only = frame_params->frame_type == KEY_FRAME || + frame_params->frame_type == INTRA_ONLY_FRAME; + const int arf_active_or_kf = is_intra_only || rc->source_alt_ref_active; + + cpi->internal_altref_allowed = (oxcf->gf_max_pyr_height > 1); + + // Reset the GF group data structures unless this is a key + // frame in which case it will already have been done. + if (!is_intra_only) { + av1_zero(twopass->gf_group); + } + + aom_clear_system_state(); + av1_zero(next_frame); + + // Load stats for the current frame. + mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + + // Note the error of the frame at the start of the group. This will be + // the GF frame error if we code a normal gf. 
+ gf_first_frame_err = mod_frame_err; + + // If this is a key frame or the overlay from a previous arf then + // the error score / cost of this frame has already been accounted for. + if (arf_active_or_kf) { + gf_group_err -= gf_first_frame_err; +#if GROUP_ADAPTIVE_MAXQ + gf_group_raw_error -= this_frame->coded_error; +#endif + gf_group_skip_pct -= this_frame->intra_skip_pct; + gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows; + } + // Motion breakout threshold for loop below depends on image size. + const double mv_ratio_accumulator_thresh = + (cpi->initial_height + cpi->initial_width) / 4.0; + + // TODO(urvang): Try logic to vary min and max interval based on q. + const int active_min_gf_interval = rc->min_gf_interval; + const int active_max_gf_interval = + AOMMIN(rc->max_gf_interval, get_fixed_gf_length(oxcf->gf_max_pyr_height)); + + double avg_sr_coded_error = 0; + double avg_raw_err_stdev = 0; + int non_zero_stdev_count = 0; + + i = 0; + while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) { + ++i; + + // Accumulate error score of frames in this gf group. + mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + gf_group_err += mod_frame_err; +#if GROUP_ADAPTIVE_MAXQ + gf_group_raw_error += this_frame->coded_error; +#endif + gf_group_skip_pct += this_frame->intra_skip_pct; + gf_group_inactive_zone_rows += this_frame->inactive_zone_rows; + + if (EOF == input_stats(twopass, &next_frame)) break; + + // Test for the case where there is a brief flash but the prediction + // quality back to an earlier frame is then restored. + flash_detected = detect_flash(twopass, 0); + + // Update the motion related elements to the boost calculation. 
+ accumulate_frame_motion_stats( + &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, + &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + // sum up the metric values of current gf group + avg_sr_coded_error += next_frame.sr_coded_error; + if (fabs(next_frame.raw_error_stdev) > 0.000001) { + non_zero_stdev_count++; + avg_raw_err_stdev += next_frame.raw_error_stdev; + } + + // Accumulate the effect of prediction quality decay. + if (!flash_detected) { + last_loop_decay_rate = loop_decay_rate; + loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); + + decay_accumulator = decay_accumulator * loop_decay_rate; + + // Monitor for static sections. + if ((rc->frames_since_key + i - 1) > 1) { + zero_motion_accumulator = AOMMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + } + + // Break clause to detect very still sections after motion. For example, + // a static image after a fade or other transition. + if (detect_transition_to_still(cpi, i, 5, loop_decay_rate, + last_loop_decay_rate)) { + allow_alt_ref = 0; + break; + } + } + + // Calculate a boost number for this frame. + boost_score += + decay_accumulator * + calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST); + // If almost totally static, we will not use the the max GF length later, + // so we can continue for more frames. + if ((i >= active_max_gf_interval + 1) && + !is_almost_static(zero_motion_accumulator, + twopass->kf_zeromotion_pct)) { + break; + } + + // Some conditions to breakout after min interval. + if (i >= active_min_gf_interval && + // If possible don't break very close to a kf + (rc->frames_to_key - i >= rc->min_gf_interval) && (i & 0x01) && + !flash_detected && + (mv_ratio_accumulator > mv_ratio_accumulator_thresh || + abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) { + break; + } + *this_frame = next_frame; + } + + // Was the group length constrained by the requirement for a new KF? 
+ rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; + + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : cpi->common.MBs; + assert(num_mbs > 0); + if (i) avg_sr_coded_error /= i; + + if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count; + + // Disable internal ARFs for "still" gf groups. + // zero_motion_accumulator: minimum percentage of (0,0) motion; + // avg_sr_coded_error: average of the SSE per pixel of each frame; + // avg_raw_err_stdev: average of the standard deviation of (0,0) + // motion error per block of each frame. + if (zero_motion_accumulator > MIN_ZERO_MOTION && + avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR && + avg_raw_err_stdev < MAX_RAW_ERR_VAR) { + cpi->internal_altref_allowed = 0; + } + + const int use_alt_ref = + !is_almost_static(zero_motion_accumulator, twopass->kf_zeromotion_pct) && + allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && + (i >= rc->min_gf_interval) && + (cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL); + +#define REDUCE_GF_LENGTH_THRESH 4 +#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9 +#define REDUCE_GF_LENGTH_BY 1 + int alt_offset = 0; + // The length reduction strategy is tweaked for certain cases, and doesn't + // work well for certain other cases. 
+ const int allow_gf_length_reduction = + ((cpi->oxcf.rc_mode == AOM_Q && cpi->oxcf.cq_level <= 128) || + !cpi->internal_altref_allowed) && + !is_lossless_requested(&cpi->oxcf); + + if (allow_gf_length_reduction && use_alt_ref) { + // adjust length of this gf group if one of the following condition met + // 1: only one overlay frame left and this gf is too long + // 2: next gf group is too short to have arf compared to the current gf + + // maximum length of next gf group + const int next_gf_len = rc->frames_to_key - i; + const int single_overlay_left = + next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH; + // the next gf is probably going to have a ARF but it will be shorter than + // this gf + const int unbalanced_gf = + i > REDUCE_GF_LENGTH_TO_KEY_THRESH && + next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH && + next_gf_len + 1 >= rc->min_gf_interval; + + if (single_overlay_left || unbalanced_gf) { + const int roll_back = REDUCE_GF_LENGTH_BY; + // Reduce length only if active_min_gf_interval will be respected later. + if (i - roll_back >= active_min_gf_interval + 1) { + alt_offset = -roll_back; + i -= roll_back; + } + } + } + + // Should we use the alternate reference frame. + if (use_alt_ref) { + // Calculate the boost for alt ref. + rc->gfu_boost = + calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost); + rc->source_alt_ref_pending = 1; + + // do not replace ARFs with overlay frames, and keep it as GOLDEN_REF + cpi->preserve_arf_as_gld = 1; + } else { + rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST); + rc->source_alt_ref_pending = 0; + cpi->preserve_arf_as_gld = 0; + } + + // Set the interval until the next gf. + // If forward keyframes are enabled, ensure the final gf group obeys the + // MIN_FWD_KF_INTERVAL. 
+ if (cpi->oxcf.fwd_kf_enabled && + ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) { + if (i == rc->frames_to_key) { + rc->baseline_gf_interval = i; + // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL + } else if ((rc->frames_to_key - i < + AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) && + (rc->frames_to_key != i)) { + // if possible, merge the last two gf groups + if (rc->frames_to_key <= active_max_gf_interval) { + rc->baseline_gf_interval = rc->frames_to_key; + // if merging the last two gf groups creates a group that is too long, + // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL + } else { + rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL; + } + } else { + rc->baseline_gf_interval = i - rc->source_alt_ref_pending; + } + } else { + rc->baseline_gf_interval = i - rc->source_alt_ref_pending; + } + +#define LAST_ALR_BOOST_FACTOR 0.2f + rc->arf_boost_factor = 1.0; + if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) { + // Reduce the boost of altref in the last gf group + if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY || + rc->frames_to_key - i == 0) { + rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR; + } + } + + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + + // Reset the file position. + reset_fpf_position(twopass, start_pos); + + // Calculate the bits to be allocated to the gf/arf group as a whole + gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); + +#if GROUP_ADAPTIVE_MAXQ + // Calculate an estimate of the maxq needed for the group. + // We are more agressive about correcting for sections + // where there could be significant overshoot than for easier + // sections where we do not wish to risk creating an overshoot + // of the allocated bit budget. 
+ if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) { + const int vbr_group_bits_per_frame = + (int)(gf_group_bits / rc->baseline_gf_interval); + const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval; + const double group_av_skip_pct = + gf_group_skip_pct / rc->baseline_gf_interval; + const double group_av_inactive_zone = + ((gf_group_inactive_zone_rows * 2) / + (rc->baseline_gf_interval * (double)cm->mb_rows)); + + int tmp_q; + // rc factor is a weight factor that corrects for local rate control drift. + double rc_factor = 1.0; + if (rc->rate_error_estimate > 0) { + rc_factor = AOMMAX(RC_FACTOR_MIN, + (double)(100 - rc->rate_error_estimate) / 100.0); + } else { + rc_factor = AOMMIN(RC_FACTOR_MAX, + (double)(100 - rc->rate_error_estimate) / 100.0); + } + tmp_q = get_twopass_worst_quality( + cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), + vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor); + twopass->active_worst_quality = + AOMMAX(tmp_q, twopass->active_worst_quality >> 1); + } +#endif + + // Calculate the extra bits to be used for boosted frame(s) + gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost, + gf_group_bits); + + // Adjust KF group bits and error remaining. + twopass->kf_group_error_left -= (int64_t)gf_group_err; + + // If this is an arf update we want to remove the score for the overlay + // frame at the end which will usually be very cheap to code. + // The overlay frame has already, in effect, been coded so we want to spread + // the remaining bits among the other frames. + // For normal GFs remove the score for the GF itself unless this is + // also a key frame in which case it has already been accounted for. 
+ if (rc->source_alt_ref_pending) { + gf_group_error_left = gf_group_err - mod_frame_err; + } else if (!is_intra_only) { + gf_group_error_left = gf_group_err - gf_first_frame_err; + } else { + gf_group_error_left = gf_group_err; + } + + // Set up the structure of this Group-Of-Pictures (same as GF_GROUP) + av1_gop_setup_structure(cpi, frame_params); + + // Allocate bits to each of the frames in the GF group. + allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits, + frame_params); + + // Reset the file position. + reset_fpf_position(twopass, start_pos); + + // Calculate a section intra ratio used in setting max loop filter. + if (frame_params->frame_type != KEY_FRAME) { + twopass->section_intra_rating = calculate_section_intra_ratio( + start_pos, twopass->stats_in_end, rc->baseline_gf_interval); + } +} + +// Minimum % intra coding observed in first pass (1.0 = 100%) +#define MIN_INTRA_LEVEL 0.25 +// Minimum ratio between the % of intra coding and inter coding in the first +// pass after discounting neutral blocks (discounting neutral blocks in this +// way helps catch scene cuts in clips with very flat areas or letter box +// format clips with image padding. +#define INTRA_VS_INTER_THRESH 2.0 +// Hard threshold where the first pass chooses intra for almost all blocks. +// In such a case even if the frame is not a scene cut coding a key frame +// may be a good option. +#define VERY_LOW_INTER_THRESH 0.05 +// Maximum threshold for the relative ratio of intra error score vs best +// inter error score. +#define KF_II_ERR_THRESHOLD 2.5 +// In real scene cuts there is almost always a sharp change in the intra +// or inter error score. +#define ERR_CHANGE_THRESHOLD 0.4 +// For real scene cuts we expect an improvment in the intra inter error +// ratio in the next frame. +#define II_IMPROVEMENT_THRESHOLD 3.5 +#define KF_II_MAX 128.0 + +// Threshold for use of the lagging second reference frame. 
High second ref +// usage may point to a transient event like a flash or occlusion rather than +// a real scene cut. +// We adapt the threshold based on number of frames in this key-frame group so +// far. +static double get_second_ref_usage_thresh(int frame_count_so_far) { + const int adapt_upto = 32; + const double min_second_ref_usage_thresh = 0.085; + const double second_ref_usage_thresh_max_delta = 0.035; + if (frame_count_so_far >= adapt_upto) { + return min_second_ref_usage_thresh + second_ref_usage_thresh_max_delta; + } + return min_second_ref_usage_thresh + + ((double)frame_count_so_far / (adapt_upto - 1)) * + second_ref_usage_thresh_max_delta; +} + +static int test_candidate_kf(TWO_PASS *twopass, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *next_frame, + int frame_count_so_far) { + int is_viable_kf = 0; + double pcnt_intra = 1.0 - this_frame->pcnt_inter; + double modified_pcnt_inter = + this_frame->pcnt_inter - this_frame->pcnt_neutral; + const double second_ref_usage_thresh = + get_second_ref_usage_thresh(frame_count_so_far); + + // Does the frame satisfy the primary criteria of a key frame? + // See above for an explanation of the test criteria. + // If so, then examine how well it predicts subsequent frames. 
+ if ((this_frame->pcnt_second_ref < second_ref_usage_thresh) && + (next_frame->pcnt_second_ref < second_ref_usage_thresh) && + ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || + ((pcnt_intra > MIN_INTRA_LEVEL) && + (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && + ((this_frame->intra_error / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < + KF_II_ERR_THRESHOLD) && + ((fabs(last_frame->coded_error - this_frame->coded_error) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > + ERR_CHANGE_THRESHOLD) || + (fabs(last_frame->intra_error - this_frame->intra_error) / + DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > + ERR_CHANGE_THRESHOLD) || + ((next_frame->intra_error / + DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > + II_IMPROVEMENT_THRESHOLD))))) { + int i; + const FIRSTPASS_STATS *start_pos = twopass->stats_in; + FIRSTPASS_STATS local_next_frame = *next_frame; + double boost_score = 0.0; + double old_boost_score = 0.0; + double decay_accumulator = 1.0; + + // Examine how well the key frame predicts subsequent frames. + for (i = 0; i < 16; ++i) { + double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error / + DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)); + + if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX; + + // Cumulative effect of decay in prediction quality. + if (local_next_frame.pcnt_inter > 0.85) + decay_accumulator *= local_next_frame.pcnt_inter; + else + decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0; + + // Keep a running total. + boost_score += (decay_accumulator * next_iiratio); + + // Test various breakout clauses. 
+ if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) || + (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) < + 0.20) && + (next_iiratio < 3.0)) || + ((boost_score - old_boost_score) < 3.0) || + (local_next_frame.intra_error < 200)) { + break; + } + + old_boost_score = boost_score; + + // Get the next frame details + if (EOF == input_stats(twopass, &local_next_frame)) break; + } + + // If there is tolerable prediction for at least the next 3 frames then + // break out else discard this potential key frame and move on + if (boost_score > 30.0 && (i > 3)) { + is_viable_kf = 1; + } else { + // Reset the file position + reset_fpf_position(twopass, start_pos); + + is_viable_kf = 0; + } + } + + return is_viable_kf; +} + +#define FRAMES_TO_CHECK_DECAY 8 +#define KF_MIN_FRAME_BOOST 80.0 +#define KF_MAX_FRAME_BOOST 128.0 +#define MIN_KF_BOOST 300 // Minimum boost for non-static KF interval +#define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval + +static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) { + int i, j; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + const FIRSTPASS_STATS first_frame = *this_frame; + const FIRSTPASS_STATS *const start_position = twopass->stats_in; + FIRSTPASS_STATS next_frame; + FIRSTPASS_STATS last_frame; + int kf_bits = 0; + int loop_decay_counter = 0; + double decay_accumulator = 1.0; + double av_decay_accumulator = 0.0; + double zero_motion_accumulator = 1.0; + double boost_score = 0.0; + double kf_mod_err = 0.0; + double kf_group_err = 0.0; + double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; + + av1_zero(next_frame); + + rc->frames_since_key = 0; + + // Reset the GF group data structures. + av1_zero(*gf_group); + + // Is this a forced key frame by interval. 
+ rc->this_key_frame_forced = rc->next_key_frame_forced; + + // Clear the alt ref active flag and last group multi arf flags as they + // can never be set for a key frame. + rc->source_alt_ref_active = 0; + + // KF is always a GF so clear frames till next gf counter. + rc->frames_till_gf_update_due = 0; + + rc->frames_to_key = 1; + + twopass->kf_group_bits = 0; // Total bits available to kf group + twopass->kf_group_error_left = 0; // Group modified error score. + + kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame); + + // Initialize the decay rates for the recent frames to check + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; + + // Find the next keyframe. + i = 0; + while (twopass->stats_in < twopass->stats_in_end && + rc->frames_to_key < cpi->oxcf.key_freq) { + // Accumulate kf group error. + kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + + // Load the next frame's stats. + last_frame = *this_frame; + input_stats(twopass, this_frame); + + // Provided that we are not at the end of the file... + if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) { + double loop_decay_rate; + + // Check for a scene cut. + if (test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in, + rc->frames_to_key)) + break; + + // How fast is the prediction quality decaying? + loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in); + + // We want to know something about the recent past... rather than + // as used elsewhere where we are concerned with decay in prediction + // quality since the last GF or KF. + recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate; + decay_accumulator = 1.0; + for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) + decay_accumulator *= recent_loop_decay[j]; + + // Special check for transition or high motion followed by a + // static scene. 
+ if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i, + loop_decay_rate, decay_accumulator)) + break; + + // Step on to the next frame. + ++rc->frames_to_key; + + // If we don't have a real key frame within the next two + // key_freq intervals then break out of the loop. + if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break; + } else { + ++rc->frames_to_key; + } + ++i; + } + + // If there is a max kf interval set by the user we must obey it. + // We already breakout of the loop above at 2x max. + // This code centers the extra kf if the actual natural interval + // is between 1x and 2x. + if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) { + FIRSTPASS_STATS tmp_frame = first_frame; + + rc->frames_to_key /= 2; + + // Reset to the start of the group. + reset_fpf_position(twopass, start_position); + + kf_group_err = 0.0; + + // Rescan to get the correct error data for the forced kf group. + for (i = 0; i < rc->frames_to_key; ++i) { + kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame); + input_stats(twopass, &tmp_frame); + } + rc->next_key_frame_forced = 1; + } else if (twopass->stats_in == twopass->stats_in_end || + rc->frames_to_key >= cpi->oxcf.key_freq) { + rc->next_key_frame_forced = 1; + } else { + rc->next_key_frame_forced = 0; + } + + // Special case for the last key frame of the file. + if (twopass->stats_in >= twopass->stats_in_end) { + // Accumulate kf group error. + kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame); + } + + // Calculate the number of bits that should be assigned to the kf group. + if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { + // Maximum number of bits for a single normal frame (not key frame). + const int max_bits = frame_max_bits(rc, &cpi->oxcf); + + // Maximum number of bits allocated to the key frame group. + int64_t max_grp_bits; + + // Default allocation based on bits left and relative + // complexity of the section. 
+ twopass->kf_group_bits = (int64_t)( + twopass->bits_left * (kf_group_err / twopass->modified_error_left)); + + // Clip based on maximum per frame rate defined by the user. + max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; + if (twopass->kf_group_bits > max_grp_bits) + twopass->kf_group_bits = max_grp_bits; + } else { + twopass->kf_group_bits = 0; + } + twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits); + + // Reset the first pass file position. + reset_fpf_position(twopass, start_position); + + // Scan through the kf group collating various stats used to determine + // how many bits to spend on it. + decay_accumulator = 1.0; + boost_score = 0.0; + const double kf_max_boost = + cpi->oxcf.rc_mode == AOM_Q + ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST), + KF_MAX_FRAME_BOOST) + : KF_MAX_FRAME_BOOST; + for (i = 0; i < (rc->frames_to_key - 1); ++i) { + if (EOF == input_stats(twopass, &next_frame)) break; + + // Monitor for static sections. + // For the first frame in kf group, the second ref indicator is invalid. + if (i > 0) { + zero_motion_accumulator = AOMMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + } else { + zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion; + } + + // Not all frames in the group are necessarily used in calculating boost. + if ((i <= rc->max_gf_interval) || + ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) { + const double frame_boost = + calc_frame_boost(cpi, this_frame, 0, kf_max_boost); + + // How fast is prediction quality decaying. 
+ if (!detect_flash(twopass, 0)) { + const double loop_decay_rate = + get_prediction_decay_rate(cpi, &next_frame); + decay_accumulator *= loop_decay_rate; + decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR); + av_decay_accumulator += decay_accumulator; + ++loop_decay_counter; + } + boost_score += (decay_accumulator * frame_boost); + } + } + if (loop_decay_counter > 0) + av_decay_accumulator /= (double)loop_decay_counter; + + reset_fpf_position(twopass, start_position); + + // Store the zero motion percentage + twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); + + // Calculate a section intra ratio used in setting max loop filter. + twopass->section_intra_rating = calculate_section_intra_ratio( + start_position, twopass->stats_in_end, rc->frames_to_key); + + rc->kf_boost = (int)(av_decay_accumulator * boost_score); + + // Special case for static / slide show content but don't apply + // if the kf group is very short. + if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) && + (rc->frames_to_key > 8)) { + rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST); + } else { + // Apply various clamps for min and max boost + rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3)); + rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST); + } + + // Work out how many bits to allocate for the key frame itself. + kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, + twopass->kf_group_bits); + // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost, + // kf_bits, twopass->kf_zeromotion_pct); + + // Work out the fraction of the kf group bits reserved for the inter frames + // within the group after discounting the bits for the kf itself. 
+ if (twopass->kf_group_bits) { + twopass->kfgroup_inter_fraction = + (double)(twopass->kf_group_bits - kf_bits) / + (double)twopass->kf_group_bits; + } else { + twopass->kfgroup_inter_fraction = 1.0; + } + + twopass->kf_group_bits -= kf_bits; + + // Save the bits to spend on the key frame. + gf_group->bit_allocation[0] = kf_bits; + gf_group->update_type[0] = KF_UPDATE; + + // Note the total error score of the kf group minus the key frame itself. + twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); + + // Adjust the count of total modified error left. + // The count of bits left is adjusted elsewhere based on real coded frame + // sizes. + twopass->modified_error_left -= kf_group_err; +} + +static int is_skippable_frame(const AV1_COMP *cpi) { + // If the current frame does not have non-zero motion vector detected in the + // first pass, and so do its previous and forward frames, then this frame + // can be skipped for partition check, and the partition size is assigned + // according to the variance + const TWO_PASS *const twopass = &cpi->twopass; + + return (!frame_is_intra_only(&cpi->common) && + twopass->stats_in - 2 > twopass->stats_in_start && + twopass->stats_in < twopass->stats_in_end && + (twopass->stats_in - 1)->pcnt_inter - + (twopass->stats_in - 1)->pcnt_motion == + 1 && + (twopass->stats_in - 2)->pcnt_inter - + (twopass->stats_in - 2)->pcnt_motion == + 1 && + twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1); +} + +#define ARF_STATS_OUTPUT 0 +#if ARF_STATS_OUTPUT +unsigned int arf_count = 0; +#endif +#define DEFAULT_GRP_WEIGHT 1.0 + +void av1_get_second_pass_params(AV1_COMP *cpi, + EncodeFrameParams *const frame_params, + unsigned int frame_flags) { + AV1_COMMON *const cm = &cpi->common; + CurrentFrame *const current_frame = &cm->current_frame; + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + int frames_left; + FIRSTPASS_STATS this_frame; + 
+ int target_rate; + + frames_left = (int)(twopass->total_stats.count - current_frame->frame_number); + + if (!twopass->stats_in) return; + + // If this is an arf frame then we dont want to read the stats file or + // advance the input pointer as we already have what we need. + if (gf_group->update_type[gf_group->index] == ARF_UPDATE || + gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) { + target_rate = gf_group->bit_allocation[gf_group->index]; + target_rate = av1_rc_clamp_pframe_target_size( + cpi, target_rate, gf_group->update_type[gf_group->index]); + rc->base_frame_target = target_rate; + + if (cpi->no_show_kf) { + assert(gf_group->update_type[gf_group->index] == ARF_UPDATE); + frame_params->frame_type = KEY_FRAME; + } else { + frame_params->frame_type = INTER_FRAME; + } + + // Do the firstpass stats indicate that this frame is skippable for the + // partition search? + if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) { + cpi->partition_search_skippable_frame = is_skippable_frame(cpi); + } + + return; + } + + aom_clear_system_state(); + + if (cpi->oxcf.rc_mode == AOM_Q) { + twopass->active_worst_quality = cpi->oxcf.cq_level; + } else if (current_frame->frame_number == 0) { + // Special case code for first frame. 
+ const int section_target_bandwidth = + (int)(twopass->bits_left / frames_left); + const double section_length = twopass->total_left_stats.count; + const double section_error = + twopass->total_left_stats.coded_error / section_length; + const double section_intra_skip = + twopass->total_left_stats.intra_skip_pct / section_length; + const double section_inactive_zone = + (twopass->total_left_stats.inactive_zone_rows * 2) / + ((double)cm->mb_rows * section_length); + const int tmp_q = get_twopass_worst_quality( + cpi, section_error, section_intra_skip + section_inactive_zone, + section_target_bandwidth, DEFAULT_GRP_WEIGHT); + + twopass->active_worst_quality = tmp_q; + twopass->baseline_active_worst_quality = tmp_q; + rc->ni_av_qi = tmp_q; + rc->last_q[INTER_FRAME] = tmp_q; + rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth); + rc->avg_frame_qindex[INTER_FRAME] = tmp_q; + rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2; + rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME]; + } + + av1_zero(this_frame); + if (EOF == input_stats(twopass, &this_frame)) return; + + // Set the frame content type flag. + if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH) + twopass->fr_content_type = FC_GRAPHICS_ANIMATION; + else + twopass->fr_content_type = FC_NORMAL; + + // Keyframe and section processing. + if (rc->frames_to_key == 0 || (frame_flags & FRAMEFLAGS_KEY)) { + FIRSTPASS_STATS this_frame_copy; + this_frame_copy = this_frame; + frame_params->frame_type = KEY_FRAME; + // Define next KF group and assign bits to it. + find_next_key_frame(cpi, &this_frame); + this_frame = this_frame_copy; + } else { + frame_params->frame_type = INTER_FRAME; + } + + // Define a new GF/ARF group. (Should always enter here for key frames). 
+ if (rc->frames_till_gf_update_due == 0) { + define_gf_group(cpi, &this_frame, frame_params); + + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + +#if ARF_STATS_OUTPUT + { + FILE *fpfile; + fpfile = fopen("arf.stt", "a"); + ++arf_count; + fprintf(fpfile, "%10d %10d %10d %10d %10d\n", current_frame->frame_number, + rc->frames_till_gf_update_due, rc->kf_boost, arf_count, + rc->gfu_boost); + + fclose(fpfile); + } +#endif + } + + // Do the firstpass stats indicate that this frame is skippable for the + // partition search? + if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) { + cpi->partition_search_skippable_frame = is_skippable_frame(cpi); + } + + target_rate = gf_group->bit_allocation[gf_group->index]; + + if (frame_params->frame_type == KEY_FRAME) { + target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate); + } else { + target_rate = av1_rc_clamp_pframe_target_size( + cpi, target_rate, gf_group->update_type[gf_group->index]); + } + + rc->base_frame_target = target_rate; + + { + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) + ? cpi->initial_mbs + : cpi->common.MBs; + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0); + twopass->frame_avg_haar_energy = + log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0); + } + + // Update the total stats remaining structure. 
+ subtract_stats(&twopass->total_left_stats, &this_frame); +} + +void av1_init_second_pass(AV1_COMP *cpi) { + const AV1EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + double frame_rate; + FIRSTPASS_STATS *stats; + + av1_twopass_zero_stats(&twopass->total_stats); + av1_twopass_zero_stats(&twopass->total_left_stats); + + if (!twopass->stats_in_end) return; + + stats = &twopass->total_stats; + + *stats = *twopass->stats_in_end; + twopass->total_left_stats = *stats; + + frame_rate = 10000000.0 * stats->count / stats->duration; + // Each frame can have a different duration, as the frame rate in the source + // isn't guaranteed to be constant. The frame rate prior to the first frame + // encoded in the second pass is a guess. However, the sum duration is not. + // It is calculated based on the actual durations of all frames from the + // first pass. + av1_new_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); + + // This variable monitors how far behind the second ref update is lagging. + twopass->sr_update_lag = 1; + + // Scan the first pass file and calculate a modified total error based upon + // the bias/power function used to allocate bits. + { + const double avg_error = + stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count); + const FIRSTPASS_STATS *s = twopass->stats_in; + double modified_error_total = 0.0; + twopass->modified_error_min = + (avg_error * oxcf->two_pass_vbrmin_section) / 100; + twopass->modified_error_max = + (avg_error * oxcf->two_pass_vbrmax_section) / 100; + while (s < twopass->stats_in_end) { + modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s); + ++s; + } + twopass->modified_error_left = modified_error_total; + } + + // Reset the vbr bits off target counters + cpi->rc.vbr_bits_off_target = 0; + cpi->rc.vbr_bits_off_target_fast = 0; + + cpi->rc.rate_error_estimate = 0; + + // Static sequence monitor variables. 
+ twopass->kf_zeromotion_pct = 100; + twopass->last_kfgroup_zeromotion_pct = 100; +} + +#define MINQ_ADJ_LIMIT 48 +#define MINQ_ADJ_LIMIT_CQ 20 +#define HIGH_UNDERSHOOT_RATIO 2 +void av1_twopass_postencode_update(AV1_COMP *cpi) { + TWO_PASS *const twopass = &cpi->twopass; + RATE_CONTROL *const rc = &cpi->rc; + const int bits_used = rc->base_frame_target; + + // VBR correction is done through rc->vbr_bits_off_target. Based on the + // sign of this value, a limited % adjustment is made to the target rate + // of subsequent frames, to try and push it back towards 0. This method + // is designed to prevent extreme behaviour at the end of a clip + // or group of frames. + rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size; + twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0); + + // Calculate the pct rc error. + if (rc->total_actual_bits) { + rc->rate_error_estimate = + (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits); + rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100); + } else { + rc->rate_error_estimate = 0; + } + + if (cpi->common.current_frame.frame_type != KEY_FRAME) { + twopass->kf_group_bits -= bits_used; + twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct; + } + twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0); + + // If the rate control is drifting consider adjustment to min or maxq. + if ((cpi->oxcf.rc_mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) { + const int maxq_adj_limit = + rc->worst_quality - twopass->active_worst_quality; + const int minq_adj_limit = + (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT); + + // Undershoot. + if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) { + --twopass->extend_maxq; + if (rc->rolling_target_bits >= rc->rolling_actual_bits) + ++twopass->extend_minq; + // Overshoot. 
+ } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) { + --twopass->extend_minq; + if (rc->rolling_target_bits < rc->rolling_actual_bits) + ++twopass->extend_maxq; + } else { + // Adjustment for extreme local overshoot. + if (rc->projected_frame_size > (2 * rc->base_frame_target) && + rc->projected_frame_size > (2 * rc->avg_frame_bandwidth)) + ++twopass->extend_maxq; + + // Unwind undershoot or overshoot adjustment. + if (rc->rolling_target_bits < rc->rolling_actual_bits) + --twopass->extend_minq; + else if (rc->rolling_target_bits > rc->rolling_actual_bits) + --twopass->extend_maxq; + } + + twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit); + twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit); + + // If there is a big and unexpected undershoot then feed the extra + // bits back in quickly. One situation where this may happen is if a + // frame is unexpectedly almost perfectly predicted by the ARF or GF + // but not very well predicted by the previous frame. + if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) { + int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO; + if (rc->projected_frame_size < fast_extra_thresh) { + rc->vbr_bits_off_target_fast += + fast_extra_thresh - rc->projected_frame_size; + rc->vbr_bits_off_target_fast = + AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth)); + + // Fast adaptation of minQ if necessary to use up the extra bits.
+ if (rc->avg_frame_bandwidth) { + twopass->extend_minq_fast = + (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth); + } + twopass->extend_minq_fast = AOMMIN( + twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); + } else if (rc->vbr_bits_off_target_fast) { + twopass->extend_minq_fast = AOMMIN( + twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq); + } else { + twopass->extend_minq_fast = 0; + } + } + } +} diff --git a/libaom/av1/encoder/pass2_strategy.h b/libaom/av1/encoder/pass2_strategy.h new file mode 100644 index 0000000..bf37746 --- /dev/null +++ b/libaom/av1/encoder/pass2_strategy.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_PASS2_STRATEGY_H_ +#define AOM_AV1_ENCODER_PASS2_STRATEGY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct AV1_COMP; +struct EncodeFrameParams; + +void av1_init_second_pass(struct AV1_COMP *cpi); + +void av1_get_second_pass_params(struct AV1_COMP *cpi, + struct EncodeFrameParams *const frame_params, + unsigned int frame_flags); + +void av1_twopass_postencode_update(struct AV1_COMP *cpi); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_PASS2_STRATEGY_H_ diff --git a/libaom/av1/encoder/picklpf.c b/libaom/av1/encoder/picklpf.c index b6b84c8..aca089c 100644 --- a/libaom/av1/encoder/picklpf.c +++ b/libaom/av1/encoder/picklpf.c @@ -70,24 +70,24 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, // TODO(any): please enable multi-thread and remove the flag when loop // filter mask is compatible with multi-thread. if (cpi->num_workers > 1) - av1_loop_filter_frame_mt(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane, + av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane, plane + 1, partial_frame, #if LOOP_FILTER_BITMASK 0, #endif cpi->workers, cpi->num_workers, &cpi->lf_row_sync); else - av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, + av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, #if LOOP_FILTER_BITMASK 0, #endif plane, plane + 1, partial_frame); - filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane, + filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane, cm->seq_params.use_highbitdepth); // Re-instate the unfiltered frame - yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane); + yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane); return filt_err; } @@ -108,7 +108,17 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, // range. 
int lvl; switch (plane) { - case 0: lvl = last_frame_filter_level[dir]; break; + case 0: + switch (dir) { + case 2: + lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >> + 1; + break; + case 0: + case 1: lvl = last_frame_filter_level[dir]; break; + default: assert(dir >= 0 && dir <= 2); return 0; + } + break; case 1: lvl = last_frame_filter_level[2]; break; case 2: lvl = last_frame_filter_level[3]; break; default: assert(plane >= 0 && plane <= 2); return 0; @@ -120,7 +130,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, // Set each entry to -1 memset(ss_err, 0xFF, sizeof(ss_err)); - yv12_copy_plane(cm->frame_to_show, &cpi->last_frame_uf, plane); + yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane); best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir); filt_best = filt_mid; ss_err[filt_mid] = best_err; @@ -203,19 +213,25 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi, const int min_filter_level = 0; const int max_filter_level = av1_get_max_filter_level(cpi); const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth); + // based on test results for rtc test set + // 0.04590 boosted or 0.02295 non-boosted in 18-bit fixed point + const int strength_boost_q_treshold = 700; + const int inter_frame_multiplier = + q > strength_boost_q_treshold ? 12034 : 6017; // These values were determined by linear fitting the result of the // searched level for 8 bit depth: // Keyframes: filt_guess = q * 0.06699 - 1.60817 - // Other frames: filt_guess = q * 0.02295 + 2.48225 + // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225 // // And high bit depth separately: // filt_guess = q * 0.316206 + 3.87252 int filt_guess; switch (cm->seq_params.bit_depth) { case AOM_BITS_8: - filt_guess = (cm->current_frame.frame_type == KEY_FRAME) - ?
ROUND_POWER_OF_TWO(q * 17563 - 421574, 18) - : ROUND_POWER_OF_TWO(q * 6017 + 650707, 18); + filt_guess = + (cm->current_frame.frame_type == KEY_FRAME) + ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18) + : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18); break; case AOM_BITS_10: filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); diff --git a/libaom/av1/encoder/pickrst.c b/libaom/av1/encoder/pickrst.c index a7fab16..1b4f26c 100644 --- a/libaom/av1/encoder/pickrst.c +++ b/libaom/av1/encoder/pickrst.c @@ -140,7 +140,7 @@ static void init_rsc(const YV12_BUFFER_CONFIG *src, const AV1_COMMON *cm, rsc->rusi = rusi; rsc->sf = sf; - const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show; + const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf; const int is_uv = plane != AOM_PLANE_Y; rsc->plane_width = src->crop_widths[is_uv]; rsc->plane_height = src->crop_heights[is_uv]; @@ -165,7 +165,7 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc, const int bit_depth = cm->seq_params.bit_depth; const int highbd = cm->seq_params.use_highbitdepth; - const YV12_BUFFER_CONFIG *fts = cm->frame_to_show; + const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf; // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be // also used in encoder. 
const int optimized_lr = 0; @@ -200,7 +200,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u); const int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -216,7 +216,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, v += xq[0] * (flt0[j] - u); const int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -231,7 +231,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, v += xq[1] * (flt1[j] - u); const int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -241,7 +241,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height, for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int32_t e = (int32_t)(dat[j]) - src[j]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -276,7 +276,7 @@ int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, v += xq0 * v0; v += xq1 * v1; const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; flt0 += flt0_stride; @@ -304,7 +304,7 @@ int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, int32_t v = half; v += exq * (flt[j] - u); const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; flt += flt_stride; @@ -316,7 +316,7 @@ int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width, const int32_t d = dat[j]; const int32_t s = src[j]; const int32_t e = d - s; - err += e * e; + err += 
((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -1281,7 +1281,7 @@ static void search_norestore(const RestorationTileLimits *limits, const int highbd = rsc->cm->seq_params.use_highbitdepth; rusi->sse[RESTORE_NONE] = sse_restoration_unit( - limits, rsc->src, rsc->cm->frame_to_show, rsc->plane, highbd); + limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd); rsc->sse += rusi->sse[RESTORE_NONE]; } @@ -1413,20 +1413,22 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) { RestorationType best_rtype = RESTORE_NONE; const int highbd = rsc.cm->seq_params.use_highbitdepth; - extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height, - rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER, - highbd); + if (!cpi->sf.disable_loop_restoration_chroma || !plane) { + extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height, + rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER, + highbd); - for (RestorationType r = 0; r < num_rtypes; ++r) { - if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) && - (r != force_restore_type)) - continue; + for (RestorationType r = 0; r < num_rtypes; ++r) { + if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) && + (r != force_restore_type)) + continue; - double cost = search_rest_type(&rsc, r); + double cost = search_rest_type(&rsc, r); - if (r == 0 || cost < best_cost) { - best_cost = cost; - best_rtype = r; + if (r == 0 || cost < best_cost) { + best_cost = cost; + best_rtype = r; + } } } diff --git a/libaom/av1/encoder/ratectrl.c b/libaom/av1/encoder/ratectrl.c index 21632c0..861c737 100644 --- a/libaom/av1/encoder/ratectrl.c +++ b/libaom/av1/encoder/ratectrl.c @@ -29,6 +29,8 @@ #include "av1/common/seg_common.h" #include "av1/encoder/encodemv.h" +#include "av1/encoder/encode_strategy.h" +#include "av1/encoder/gop_structure.h" #include "av1/encoder/random.h" #include "av1/encoder/ratectrl.h" @@ -96,18 +98,13 @@ static double resize_rate_factor(const 
AV1_COMP *cpi, int width, int height) { // fit to the original data (after plotting real maxq vs minq (not q index)) static int get_minq_index(double maxq, double x3, double x2, double x1, aom_bit_depth_t bit_depth) { - int i; const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq); // Special case handling to deal with the step from q2.0 // down to lossless mode represented by q 1.0. if (minqtarget <= 2.0) return 0; - for (i = 0; i < QINDEX_RANGE; i++) { - if (minqtarget <= av1_convert_qindex_to_q(i, bit_depth)) return i; - } - - return QINDEX_RANGE - 1; + return av1_find_qindex(minqtarget, bit_depth, 0, QINDEX_RANGE - 1); } static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, @@ -174,13 +171,15 @@ int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); } -int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target) { +int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target, + FRAME_UPDATE_TYPE frame_update_type) { const RATE_CONTROL *rc = &cpi->rc; const AV1EncoderConfig *oxcf = &cpi->oxcf; const int min_frame_target = AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); // Clip the frame target to the minimum setup value. - if (cpi->rc.is_src_frame_alt_ref) { + if (frame_update_type == OVERLAY_UPDATE || + frame_update_type == INTNL_OVERLAY_UPDATE) { // If there is an active ARF at this location use the minimum // bits on this frame even if it is a constructed arf. // The active maximum quantizer insures that an appropriate @@ -219,9 +218,7 @@ static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) { RATE_CONTROL *const rc = &cpi->rc; // Non-viewable frames are a special case and are treated as pure overhead. - // TODO(zoeliu): To further explore whether we should treat BWDREF_FRAME - // differently, since it is a no-show frame. 
- if (!cm->show_frame && !rc->is_bwd_ref_frame) + if (!cm->show_frame) rc->bits_off_target -= encoded_frame_size; else rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; @@ -253,9 +250,7 @@ int av1_rc_get_default_min_gf_interval(int width, int height, int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) { int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75)); interval += (interval & 0x01); // Round to even value -#if CONFIG_FIX_GF_LENGTH - interval = AOMMAX(FIXED_GF_LENGTH, interval); -#endif + interval = AOMMAX(MAX_GF_INTERVAL, interval); return AOMMAX(interval, min_gf_interval); } @@ -352,6 +347,22 @@ int av1_rc_drop_frame(AV1_COMP *cpi) { } } +static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = { + KF_STD, // KF_UPDATE + INTER_NORMAL, // LF_UPDATE + GF_ARF_STD, // GF_UPDATE + GF_ARF_STD, // ARF_UPDATE + INTER_NORMAL, // OVERLAY_UPDATE + INTER_NORMAL, // INTNL_OVERLAY_UPDATE + GF_ARF_LOW, // INTNL_ARF_UPDATE +}; + +static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group) { + const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; + assert(update_type < FRAME_UPDATE_TYPES); + return rate_factor_levels[update_type]; +} + static double get_rate_correction_factor(const AV1_COMP *cpi, int width, int height) { const RATE_CONTROL *const rc = &cpi->rc; @@ -360,8 +371,8 @@ static double get_rate_correction_factor(const AV1_COMP *cpi, int width, if (cpi->common.current_frame.frame_type == KEY_FRAME) { rcf = rc->rate_correction_factors[KF_STD]; } else if (cpi->oxcf.pass == 2) { - RATE_FACTOR_LEVEL rf_lvl = - cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(&cpi->twopass.gf_group); rcf = rc->rate_correction_factors[rf_lvl]; } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && @@ -387,8 +398,8 @@ static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int 
width, if (cpi->common.current_frame.frame_type == KEY_FRAME) { rc->rate_correction_factors[KF_STD] = factor; } else if (cpi->oxcf.pass == 2) { - RATE_FACTOR_LEVEL rf_lvl = - cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(&cpi->twopass.gf_group); rc->rate_correction_factors[rf_lvl] = factor; } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && @@ -474,45 +485,82 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width, set_rate_correction_factor(cpi, rate_correction_factor, width, height); } +// Calculate rate for the given 'q'. +static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh, + double correction_factor, int q) { + const AV1_COMMON *const cm = &cpi->common; + return use_cyclic_refresh + ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor) + : av1_rc_bits_per_mb(cm->current_frame.frame_type, q, + correction_factor, cm->seq_params.bit_depth); +} + +// Similar to find_qindex_by_rate() function in ratectrl.c, but returns the q +// index with rate just above or below the desired rate, depending on which of +// the two rates is closer to the desired rate. +// Also, respects the selected aq_mode when computing the rate. +static int find_closest_qindex_by_rate(int desired_bits_per_mb, + const AV1_COMP *cpi, + double correction_factor, + int best_qindex, int worst_qindex) { + const int use_cyclic_refresh = + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled; + + // Find 'qindex' based on 'desired_bits_per_mb'. 
+ assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const int mid_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, mid); + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + + // Calculate rate difference of this q index from the desired rate. + const int curr_q = low; + const int curr_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, curr_q); + const int curr_bit_diff = (curr_bits_per_mb <= desired_bits_per_mb) + ? desired_bits_per_mb - curr_bits_per_mb + : INT_MAX; + assert((curr_bit_diff != INT_MAX && curr_bit_diff >= 0) || + curr_q == worst_qindex); + + // Calculate rate difference for previous q index too. + const int prev_q = curr_q - 1; + int prev_bit_diff; + if (curr_bit_diff == INT_MAX || curr_q == best_qindex) { + prev_bit_diff = INT_MAX; + } else { + const int prev_bits_per_mb = + get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, prev_q); + assert(prev_bits_per_mb > desired_bits_per_mb); + prev_bit_diff = prev_bits_per_mb - desired_bits_per_mb; + } + + // Pick one of the two q indices, depending on which one has rate closer to + // the desired rate. + return (curr_bit_diff <= prev_bit_diff) ? curr_q : prev_q; +} + int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame, int active_best_quality, int active_worst_quality, int width, int height) { - const AV1_COMMON *const cm = &cpi->common; - int q = active_worst_quality; - int last_error = INT_MAX; - int i, target_bits_per_mb, bits_per_mb_at_this_q; const int MBs = av1_get_MBs(width, height); const double correction_factor = get_rate_correction_factor(cpi, width, height); - - // Calculate required scaling factor based on target frame size and size of - // frame produced using previous Q. 
- target_bits_per_mb = + const int target_bits_per_mb = (int)((uint64_t)(target_bits_per_frame) << BPER_MB_NORMBITS) / MBs; - i = active_best_quality; - - do { - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) { - bits_per_mb_at_this_q = - (int)av1_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor); - } else { - bits_per_mb_at_this_q = - (int)av1_rc_bits_per_mb(cm->current_frame.frame_type, i, - correction_factor, cm->seq_params.bit_depth); - } - - if (bits_per_mb_at_this_q <= target_bits_per_mb) { - if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error) - q = i; - else - q = i - 1; - - break; - } else { - last_error = bits_per_mb_at_this_q - target_bits_per_mb; - } - } while (++i <= active_worst_quality); + int q = + find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor, + active_best_quality, active_worst_quality); // In CBR mode, this makes sure q is between oscillating Qs to prevent // resonance. @@ -560,13 +608,11 @@ static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, arfgf_low_motion_minq, arfgf_high_motion_minq); } -#if REDUCE_LAST_ALT_BOOST static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) { int *arfgf_high_motion_minq; ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); return arfgf_high_motion_minq[q]; } -#endif static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; @@ -758,10 +804,28 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width, return q; } +static int gf_group_pyramid_level(const AV1_COMP *cpi) { + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int this_height = gf_group->pyramid_level[gf_group->index]; + return this_height; +} + static int get_active_cq_level(const RATE_CONTROL *rc, - const AV1EncoderConfig *const oxcf) { + const AV1EncoderConfig *const oxcf, + int intra_only, int superres_denom) { static const double cq_adjust_threshold = 0.1; int active_cq_level = 
oxcf->cq_level; + (void)intra_only; + if (oxcf->rc_mode == AOM_CQ || oxcf->rc_mode == AOM_Q) { + // printf("Superres %d %d %d = %d\n", superres_denom, intra_only, + // rc->frames_to_key, !(intra_only && rc->frames_to_key <= 1)); + if (oxcf->superres_mode == SUPERRES_QTHRESH && + superres_denom != SCALE_NUMERATOR && + !(intra_only && rc->frames_to_key <= 1)) { + active_cq_level = + AOMMAX(active_cq_level - ((superres_denom - SCALE_NUMERATOR) * 4), 0); + } + } if (oxcf->rc_mode == AOM_CQ && rc->total_target_bits > 0) { const double x = (double)rc->total_actual_bits / rc->total_target_bits; if (x < cq_adjust_threshold) { @@ -778,7 +842,8 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width, const RATE_CONTROL *const rc = &cpi->rc; const CurrentFrame *const current_frame = &cm->current_frame; const AV1EncoderConfig *const oxcf = &cpi->oxcf; - const int cq_level = get_active_cq_level(rc, oxcf); + const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), + cm->superres_scale_denominator); int active_best_quality; int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi); int q; @@ -920,15 +985,20 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width, return q; } -int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) { - static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = { - INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME - }; - const AV1_COMMON *const cm = &cpi->common; - int qdelta = av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q, - rate_factor_deltas[rf_level], - cm->seq_params.bit_depth); - return qdelta; +static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { + 1.00, // INTER_NORMAL + 1.25, // GF_ARF_LOW + 2.00, // GF_ARF_STD + 2.00, // KF_STD +}; + +int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) { + const RATE_FACTOR_LEVEL rf_lvl = + get_rate_factor_level(&cpi->twopass.gf_group); + const FRAME_TYPE frame_type = 
(rf_lvl == KF_STD) ? KEY_FRAME : INTER_FRAME; + return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q, + rate_factor_deltas[rf_lvl], + cpi->common.seq_params.bit_depth); } #define STATIC_MOTION_THRESH 95 @@ -939,7 +1009,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, const RATE_CONTROL *const rc = &cpi->rc; const AV1EncoderConfig *const oxcf = &cpi->oxcf; const GF_GROUP *gf_group = &cpi->twopass.gf_group; - const int cq_level = get_active_cq_level(rc, oxcf); + const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm), + cm->superres_scale_denominator); int active_best_quality; int active_worst_quality = cpi->twopass.active_worst_quality; int q; @@ -947,12 +1018,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, const int bit_depth = cm->seq_params.bit_depth; ASSIGN_MINQ_TABLE(bit_depth, inter_minq); -#if CUSTOMIZED_GF const int is_intrl_arf_boost = gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE; -#else - const int is_intrl_arf_boost = cpi->refresh_alt2_ref_frame; -#endif // CUSTOMIZED_GF if (frame_is_intra_only(cm)) { if (rc->frames_to_key == 1 && oxcf->rc_mode == AOM_Q) { @@ -961,6 +1028,18 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, // as q. active_best_quality = cq_level; active_worst_quality = cq_level; + } else if (cm->current_frame.frame_type == KEY_FRAME && + cm->show_frame == 0) { + // Handle the special case for forward reference key frames. + // Increase the boost because this keyframe is used as a forward and + // backward reference. 
+ const int qindex = rc->last_boosted_qindex; + const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); + const int delta_qindex = av1_compute_qdelta( + rc, last_boosted_q, last_boosted_q * 0.25, bit_depth); + active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); + // Update the arf_q since the forward keyframe is replacing the ALTREF + *arf_q = active_best_quality; } else if (rc->this_key_frame_forced) { // Handle the special case for key frames forced when we have reached // the maximum key frame interval. Here force the Q to a range @@ -978,13 +1057,10 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, active_worst_quality = AOMMIN(qindex + delta_qindex, active_worst_quality); } else { - // Increase the boost if the forced keyframe is a forward reference. - // These numbers were derived empirically. - const double boost_factor = cpi->oxcf.fwd_kf_enabled ? 0.25 : 0.50; qindex = rc->last_boosted_qindex; last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth); - delta_qindex = av1_compute_qdelta( - rc, last_boosted_q, last_boosted_q * boost_factor, bit_depth); + delta_qindex = av1_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 0.50, bit_depth); active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality); } } else { @@ -1035,80 +1111,57 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, // Constrained quality use slightly lower active best. 
active_best_quality = active_best_quality * 15 / 16; -#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ - if (gf_group->update_type[gf_group->index] == ARF_UPDATE || - (is_intrl_arf_boost && !cpi->new_bwdref_update_rule)) { -#if REDUCE_LAST_ALT_BOOST - if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { - const int min_boost = get_gf_high_motion_quality(q, bit_depth); - const int boost = min_boost - active_best_quality; + if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { + const int min_boost = get_gf_high_motion_quality(q, bit_depth); + const int boost = min_boost - active_best_quality; - active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); - } -#endif // REDUCE_LAST_ALT_BOOST + active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); *arf_q = active_best_quality; - } else if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) { + } else if (is_intrl_arf_boost) { assert(rc->arf_q >= 0); // Ensure it is set to a valid value. active_best_quality = rc->arf_q; - int this_height = gf_group->pyramid_level[gf_group->index]; + int this_height = gf_group_pyramid_level(cpi); while (this_height < gf_group->pyramid_height) { active_best_quality = (active_best_quality + cq_level + 1) / 2; ++this_height; } } -#endif // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ } else if (oxcf->rc_mode == AOM_Q) { if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) { active_best_quality = cq_level; } else { if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { active_best_quality = get_gf_active_quality(rc, q, bit_depth); - *arf_q = active_best_quality; -#if REDUCE_LAST_ALT_BOOST const int min_boost = get_gf_high_motion_quality(q, bit_depth); const int boost = min_boost - active_best_quality; active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); -#endif + *arf_q = active_best_quality; } else { assert(rc->arf_q >= 0); // Ensure it is set to a valid value. 
+ assert(is_intrl_arf_boost); active_best_quality = rc->arf_q; - } -#if USE_SYMM_MULTI_LAYER - if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) { - int this_height = gf_group->pyramid_level[gf_group->index]; + int this_height = gf_group_pyramid_level(cpi); while (this_height < gf_group->pyramid_height) { active_best_quality = (active_best_quality + cq_level + 1) / 2; ++this_height; } - } else { -#endif - // Modify best quality for second level arfs. For mode AOM_Q this - // becomes the baseline frame q. - if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) - active_best_quality = (active_best_quality + cq_level + 1) / 2; -#if USE_SYMM_MULTI_LAYER } -#endif } } else { active_best_quality = get_gf_active_quality(rc, q, bit_depth); -#if REDUCE_LAST_ALT_BOOST const int min_boost = get_gf_high_motion_quality(q, bit_depth); const int boost = min_boost - active_best_quality; active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor); -#endif -#if USE_SYMM_MULTI_LAYER - if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) { - int this_height = gf_group->pyramid_level[gf_group->index]; + if (is_intrl_arf_boost) { + int this_height = gf_group_pyramid_level(cpi); while (this_height < gf_group->pyramid_height) { active_best_quality = (active_best_quality + active_worst_quality + 1) / 2; ++this_height; } } -#endif } } else { if (oxcf->rc_mode == AOM_Q) { @@ -1126,8 +1179,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, // Extension to max or min Q if undershoot or overshoot is outside // the permitted range. - if ((cpi->oxcf.rc_mode != AOM_Q) && - (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) { + if (cpi->oxcf.rc_mode != AOM_Q) { if (frame_is_intra_only(cm) || (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || is_intrl_arf_boost || @@ -1146,8 +1198,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, // Static forced key frames Q restrictions dealt with elsewhere. 
if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced || (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { - int qdelta = av1_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index], - active_worst_quality); + const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality); active_worst_quality = AOMMAX(active_worst_quality + qdelta, active_best_quality); } @@ -1167,7 +1218,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width, if (oxcf->rc_mode == AOM_Q || (frame_is_intra_only(cm) && !rc->this_key_frame_forced && - cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH)) { + cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH && + rc->frames_to_key > 1)) { q = active_best_quality; // Special case code to try and match quality with forced key frames. } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { @@ -1275,16 +1327,12 @@ static void update_alt_ref_frame_stats(AV1_COMP *cpi) { static void update_golden_frame_stats(AV1_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; -#if CUSTOMIZED_GF const TWO_PASS *const twopass = &cpi->twopass; const GF_GROUP *const gf_group = &twopass->gf_group; const int is_intrnl_arf = cpi->oxcf.pass == 2 ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE : cpi->refresh_alt2_ref_frame; -#else - const int is_intnl_arf = cpi->refresh_alt2_ref_frame; -#endif // Update the Golden frame usage counts. // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame, @@ -1292,9 +1340,10 @@ static void update_golden_frame_stats(AV1_COMP *cpi) { // updated and cpi->refresh_golden_frame will still be zero. 
if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) { // We will not use internal overlay frames to replace the golden frame - if (!rc->is_src_frame_ext_arf) + if (!rc->is_src_frame_internal_arf) { // this frame refreshes means next frames don't unless specified by user rc->frames_since_golden = 0; + } // If we are not using alt ref in the up and coming group clear the arf // active flag. In multi arf group case, if the index is not 0 then @@ -1310,165 +1359,16 @@ static void update_golden_frame_stats(AV1_COMP *cpi) { } } -// Define the reference buffers that will be updated post encode. -void av1_configure_buffer_updates(AV1_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; - - // NOTE(weitinglin): Should we define another function to take care of - // cpi->rc.is_$Source_Type to make this function as it is in the comment? - - cpi->rc.is_src_frame_alt_ref = 0; - cpi->rc.is_bwd_ref_frame = 0; - cpi->rc.is_last_bipred_frame = 0; - cpi->rc.is_bipred_frame = 0; - cpi->rc.is_src_frame_ext_arf = 0; - - switch (twopass->gf_group.update_type[twopass->gf_group.index]) { - case KF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_bwd_ref_frame = 1; - cpi->refresh_alt2_ref_frame = 1; - cpi->refresh_alt_ref_frame = 1; - break; - - case LF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - - case GF_UPDATE: - // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is - // needed. 
- cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - - case OVERLAY_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 1; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - - cpi->rc.is_src_frame_alt_ref = 1; - break; - - case ARF_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - // NOTE: BWDREF does not get updated along with ALTREF_FRAME. - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 1; - break; - - case BRF_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 1; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - - cpi->rc.is_bwd_ref_frame = 1; - break; - - case LAST_BIPRED_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - - cpi->rc.is_last_bipred_frame = 1; - break; - - case BIPRED_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - - cpi->rc.is_bipred_frame = 1; - break; - - case INTNL_OVERLAY_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 0; - cpi->refresh_alt_ref_frame = 0; - - cpi->rc.is_src_frame_alt_ref = 1; - cpi->rc.is_src_frame_ext_arf = 1; - break; - - case INTNL_ARF_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; -#if USE_SYMM_MULTI_LAYER - if (cpi->new_bwdref_update_rule == 1) { - cpi->refresh_bwd_ref_frame = 1; - cpi->refresh_alt2_ref_frame = 0; - } else { -#endif - cpi->refresh_bwd_ref_frame = 0; - cpi->refresh_alt2_ref_frame = 1; -#if 
USE_SYMM_MULTI_LAYER - } -#endif - cpi->refresh_alt_ref_frame = 0; - break; - - default: assert(0); break; - } -} - -void av1_estimate_qp_gop(AV1_COMP *cpi) { - AV1_COMMON *const cm = &cpi->common; - int gop_length = cpi->rc.baseline_gf_interval; - int bottom_index, top_index; - int idx; - const int gf_index = cpi->twopass.gf_group.index; - - for (idx = 1; idx <= gop_length + 1 && idx < MAX_LAG_BUFFERS; ++idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[idx]; - int target_rate = cpi->twopass.gf_group.bit_allocation[idx]; - int arf_q = 0; - - cpi->twopass.gf_group.index = idx; - rc_set_frame_target(cpi, target_rate, cm->width, cm->height); - av1_configure_buffer_updates(cpi); - tpl_frame->base_qindex = rc_pick_q_and_bounds_two_pass( - cpi, cm->width, cm->height, &bottom_index, &top_index, &arf_q); - tpl_frame->base_qindex = AOMMAX(tpl_frame->base_qindex, 1); - } - // Reset the actual index and frame update - cpi->twopass.gf_group.index = gf_index; - av1_configure_buffer_updates(cpi); -} - void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { const AV1_COMMON *const cm = &cpi->common; const CurrentFrame *const current_frame = &cm->current_frame; RATE_CONTROL *const rc = &cpi->rc; -#if CUSTOMIZED_GF const TWO_PASS *const twopass = &cpi->twopass; const GF_GROUP *const gf_group = &twopass->gf_group; const int is_intrnl_arf = cpi->oxcf.pass == 2 ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE : cpi->refresh_alt2_ref_frame; -#else - const int is_intrnl_arf = cpi->refresh_alt2_ref_frame; -#endif const int qindex = cm->base_qindex; @@ -1539,10 +1439,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) { // Actual bits spent rc->total_actual_bits += rc->projected_frame_size; - // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME - // differently here for rc->avg_frame_bandwidth. - rc->total_target_bits += - (cm->show_frame || rc->is_bwd_ref_frame) ? 
rc->avg_frame_bandwidth : 0; + rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0; rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; @@ -1575,22 +1472,24 @@ void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) { // Use this macro to turn on/off use of alt-refs in one-pass mode. #define USE_ALTREF_FOR_ONE_PASS 1 -static int calc_pframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) { +static int calc_pframe_target_size_one_pass_vbr( + const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) { static const int af_ratio = 10; const RATE_CONTROL *const rc = &cpi->rc; int target; #if USE_ALTREF_FOR_ONE_PASS - target = - (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) - ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) / - (rc->baseline_gf_interval + af_ratio - 1) - : (rc->avg_frame_bandwidth * rc->baseline_gf_interval) / - (rc->baseline_gf_interval + af_ratio - 1); + if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE || + frame_update_type == ARF_UPDATE) { + target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) / + (rc->baseline_gf_interval + af_ratio - 1); + } else { + target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval) / + (rc->baseline_gf_interval + af_ratio - 1); + } #else target = rc->avg_frame_bandwidth; #endif - return av1_rc_clamp_pframe_target_size(cpi, target); + return av1_rc_clamp_pframe_target_size(cpi, target, frame_update_type); } static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) { @@ -1600,7 +1499,10 @@ static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) { return av1_rc_clamp_iframe_target_size(cpi, target); } -void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) { +void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi, + FRAME_UPDATE_TYPE *const frame_update_type, + EncodeFrameParams *const frame_params, + unsigned int frame_flags) { AV1_COMMON 
*const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; CurrentFrame *const current_frame = &cm->current_frame; @@ -1610,48 +1512,45 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) { int sframe_mode = cpi->oxcf.sframe_mode; int sframe_enabled = cpi->oxcf.sframe_enabled; // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. - if (!cpi->refresh_alt_ref_frame && - (current_frame->frame_number == 0 || - (cpi->frame_flags & FRAMEFLAGS_KEY) || rc->frames_to_key == 0 || - (cpi->oxcf.auto_key && 0))) { - current_frame->frame_type = KEY_FRAME; + if (*frame_update_type != ARF_UPDATE && + (current_frame->frame_number == 0 || (frame_flags & FRAMEFLAGS_KEY) || + rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) { + frame_params->frame_type = KEY_FRAME; rc->this_key_frame_forced = current_frame->frame_number != 0 && rc->frames_to_key == 0; rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; rc->source_alt_ref_active = 0; } else { - current_frame->frame_type = INTER_FRAME; + frame_params->frame_type = INTER_FRAME; if (sframe_enabled) { if (altref_enabled) { if (sframe_mode == 1) { // sframe_mode == 1: insert sframe if it matches altref frame. 
if (current_frame->frame_number % sframe_dist == 0 && - current_frame->frame_type != KEY_FRAME && - current_frame->frame_number != 0 && cpi->refresh_alt_ref_frame) { - current_frame->frame_type = S_FRAME; + current_frame->frame_number != 0 && + *frame_update_type == ARF_UPDATE) { + frame_params->frame_type = S_FRAME; } } else { // sframe_mode != 1: if sframe will be inserted at the next available // altref frame if (current_frame->frame_number % sframe_dist == 0 && - current_frame->frame_type != KEY_FRAME && current_frame->frame_number != 0) { rc->sframe_due = 1; } - if (rc->sframe_due && cpi->refresh_alt_ref_frame) { - current_frame->frame_type = S_FRAME; + if (rc->sframe_due && *frame_update_type == ARF_UPDATE) { + frame_params->frame_type = S_FRAME; rc->sframe_due = 0; } } } else { if (current_frame->frame_number % sframe_dist == 0 && - current_frame->frame_type != KEY_FRAME && current_frame->frame_number != 0) { - current_frame->frame_type = S_FRAME; + frame_params->frame_type = S_FRAME; } } } @@ -1666,7 +1565,7 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) { } else { rc->constrained_gf_group = 0; } - cpi->refresh_golden_frame = 1; + if (*frame_update_type == LF_UPDATE) *frame_update_type = GF_UPDATE; rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; rc->gfu_boost = DEFAULT_GF_BOOST; } @@ -1674,14 +1573,15 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) av1_cyclic_refresh_update_parameters(cpi); - if (current_frame->frame_type == KEY_FRAME) + if (frame_params->frame_type == KEY_FRAME) target = calc_iframe_target_size_one_pass_vbr(cpi); else - target = calc_pframe_target_size_one_pass_vbr(cpi); + target = calc_pframe_target_size_one_pass_vbr(cpi, *frame_update_type); rc_set_frame_target(cpi, target, cm->width, cm->height); } -static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) { +static int calc_pframe_target_size_one_pass_cbr( + const AV1_COMP *cpi, FRAME_UPDATE_TYPE 
frame_update_type) { const AV1EncoderConfig *oxcf = &cpi->oxcf; const RATE_CONTROL *rc = &cpi->rc; const int64_t diff = rc->optimal_buffer_level - rc->buffer_level; @@ -1692,12 +1592,14 @@ static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) { if (oxcf->gf_cbr_boost_pct) { const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100; - target = cpi->refresh_golden_frame - ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * - af_ratio_pct) / - (rc->baseline_gf_interval * 100 + af_ratio_pct - 100) - : (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) / - (rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) { + target = + (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) / + (rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + } else { + target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) / + (rc->baseline_gf_interval * 100 + af_ratio_pct - 100); + } } else { target = rc->avg_frame_bandwidth; } @@ -1740,23 +1642,25 @@ static int calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) { return av1_rc_clamp_iframe_target_size(cpi, target); } -void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) { +void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi, + FRAME_UPDATE_TYPE *const frame_update_type, + EncodeFrameParams *const frame_params, + unsigned int frame_flags) { AV1_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; CurrentFrame *const current_frame = &cm->current_frame; int target; // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. 
- if ((current_frame->frame_number == 0 || - (cpi->frame_flags & FRAMEFLAGS_KEY) || rc->frames_to_key == 0 || - (cpi->oxcf.auto_key && 0))) { - current_frame->frame_type = KEY_FRAME; + if ((current_frame->frame_number == 0 || (frame_flags & FRAMEFLAGS_KEY) || + rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) { + frame_params->frame_type = KEY_FRAME; rc->this_key_frame_forced = current_frame->frame_number != 0 && rc->frames_to_key == 0; rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; rc->source_alt_ref_active = 0; } else { - current_frame->frame_type = INTER_FRAME; + frame_params->frame_type = INTER_FRAME; } if (rc->frames_till_gf_update_due == 0) { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) @@ -1768,7 +1672,7 @@ void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) { // NOTE: frames_till_gf_update_due must be <= frames_to_key. if (rc->frames_till_gf_update_due > rc->frames_to_key) rc->frames_till_gf_update_due = rc->frames_to_key; - cpi->refresh_golden_frame = 1; + if (*frame_update_type == LF_UPDATE) *frame_update_type = GF_UPDATE; rc->gfu_boost = DEFAULT_GF_BOOST; } @@ -1777,42 +1681,75 @@ void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) av1_cyclic_refresh_update_parameters(cpi); - if (current_frame->frame_type == KEY_FRAME) + if (frame_params->frame_type == KEY_FRAME) target = calc_iframe_target_size_one_pass_cbr(cpi); else - target = calc_pframe_target_size_one_pass_cbr(cpi); + target = calc_pframe_target_size_one_pass_cbr(cpi, *frame_update_type); rc_set_frame_target(cpi, target, cm->width, cm->height); // TODO(afergs): Decide whether to scale up, down, or not at all } +int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, + int best_qindex, int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const double mid_q = av1_convert_qindex_to_q(mid, bit_depth); + 
if (mid_q < desired_q) { + low = mid + 1; + } else { + high = mid; + } + } + assert(low == high); + assert(av1_convert_qindex_to_q(low, bit_depth) >= desired_q || + low == worst_qindex); + return low; +} + int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, aom_bit_depth_t bit_depth) { - int start_index = rc->worst_quality; - int target_index = rc->worst_quality; - int i; - - // Convert the average q value to an index. - for (i = rc->best_quality; i < rc->worst_quality; ++i) { - start_index = i; - if (av1_convert_qindex_to_q(i, bit_depth) >= qstart) break; - } + const int start_index = + av1_find_qindex(qstart, bit_depth, rc->best_quality, rc->worst_quality); + const int target_index = + av1_find_qindex(qtarget, bit_depth, rc->best_quality, rc->worst_quality); + return target_index - start_index; +} - // Convert the q target to an index - for (i = rc->best_quality; i < rc->worst_quality; ++i) { - target_index = i; - if (av1_convert_qindex_to_q(i, bit_depth) >= qtarget) break; +// Find q_index for the desired_bits_per_mb, within [best_qindex, worst_qindex], +// assuming 'correction_factor' is 1.0. +// To be precise, 'q_index' is the smallest integer, for which the corresponding +// bits per mb <= desired_bits_per_mb. +// If no such q index is found, returns 'worst_qindex'. 
+static int find_qindex_by_rate(int desired_bits_per_mb, + aom_bit_depth_t bit_depth, FRAME_TYPE frame_type, + int best_qindex, int worst_qindex) { + assert(best_qindex <= worst_qindex); + int low = best_qindex; + int high = worst_qindex; + while (low < high) { + const int mid = (low + high) >> 1; + const int mid_bits_per_mb = + av1_rc_bits_per_mb(frame_type, mid, 1.0, bit_depth); + if (mid_bits_per_mb > desired_bits_per_mb) { + low = mid + 1; + } else { + high = mid; + } } - - return target_index - start_index; + assert(low == high); + assert(av1_rc_bits_per_mb(frame_type, low, 1.0, bit_depth) <= + desired_bits_per_mb || + low == worst_qindex); + return low; } int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, int qindex, double rate_target_ratio, aom_bit_depth_t bit_depth) { - int target_index = rc->worst_quality; - int i; - // Look up the current projected bits per block for the base index const int base_bits_per_mb = av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth); @@ -1820,14 +1757,9 @@ int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, // Find the target bits per mb based on the base value and given ratio. 
const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb); - // Convert the q target to an index - for (i = rc->best_quality; i < rc->worst_quality; ++i) { - if (av1_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <= - target_bits_per_mb) { - target_index = i; - break; - } - } + const int target_index = + find_qindex_by_rate(target_bits_per_mb, bit_depth, frame_type, + rc->best_quality, rc->worst_quality); return target_index - qindex; } diff --git a/libaom/av1/encoder/ratectrl.h b/libaom/av1/encoder/ratectrl.h index ea8975d..1cd5994 100644 --- a/libaom/av1/encoder/ratectrl.h +++ b/libaom/av1/encoder/ratectrl.h @@ -15,6 +15,8 @@ #include "aom/aom_codec.h" #include "aom/aom_integer.h" +#include "aom_ports/mem.h" + #include "av1/common/blockd.h" #include "av1/common/onyxc_int.h" @@ -34,54 +36,29 @@ extern "C" { // The maximum duration of a GF group that is static (e.g. a slide show). #define MAX_STATIC_GF_GROUP_LENGTH 250 -#define CUSTOMIZED_GF 1 - -#if CONFIG_FIX_GF_LENGTH -#define FIXED_GF_LENGTH 16 +// Minimum and maximum height for the new pyramid structure. +// (Old structure supports height = 1, but does NOT support height = 4). 
+#define MIN_PYRAMID_LVL 0 #define MAX_PYRAMID_LVL 4 -// We allow a frame to have at most two left/right descendants before changing -// them into to a subtree, i.e., we allow the following structure: -/* OUT_OF_ORDER_FRAME - / / \ \ -(two left children) F F F F (two right children) */ -// Therefore the max gf size supported by 4 layer structure is -// 1 (KEY/OVERLAY) + 1 + 2 + 4 + 16 (two children on both side of their parent) -#define MAX_PYRAMID_SIZE 24 -#define USE_SYMM_MULTI_LAYER 1 -#define REDUCE_LAST_ALT_BOOST 1 -#define REDUCE_LAST_GF_LENGTH 1 -#define MULTI_LVL_BOOST_VBR_CQ 1 -#else -#define MAX_PYRAMID_SIZE 16 -#define USE_SYMM_MULTI_LAYER 0 -#define REDUCE_LAST_ALT_BOOST 0 -#define REDUCE_LAST_GF_LENGTH 0 -#define MULTI_LVL_BOOST_VBR_CQ 0 -#endif - -#if USE_SYMM_MULTI_LAYER -#define USE_MANUAL_GF4_STRUCT 0 -#endif #define MIN_GF_INTERVAL 4 #define MAX_GF_INTERVAL 16 #define FIXED_GF_INTERVAL 8 // Used in some testing modes only -static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = { - 1.00, // INTER_NORMAL - 0.80, // INTER_LOW - 1.50, // INTER_HIGH - 1.25, // GF_ARF_LOW - 2.00, // GF_ARF_STD - 2.00, // KF_STD -}; - typedef struct { int resize_width; int resize_height; uint8_t superres_denom; } size_params_type; +enum { + INTER_NORMAL, + GF_ARF_LOW, + GF_ARF_STD, + KF_STD, + RATE_FACTOR_LEVELS +} UENUM1BYTE(RATE_FACTOR_LEVEL); + typedef struct { // Rate targetting variables int base_frame_target; // A baseline frame target before adjustment @@ -94,7 +71,6 @@ typedef struct { int last_kf_qindex; // Q index of the last key frame coded. 
int gfu_boost; - int last_boost; int kf_boost; double rate_correction_factors[RATE_FACTOR_LEVELS]; @@ -113,18 +89,9 @@ typedef struct { int source_alt_ref_pending; int source_alt_ref_active; int is_src_frame_alt_ref; + int is_src_frame_internal_arf; int sframe_due; - // Length of the bi-predictive frame group interval - int bipred_group_interval; - - // NOTE: Different types of frames may have different bits allocated - // accordingly, aiming to achieve the overall optimal RD performance. - int is_bwd_ref_frame; - int is_last_bipred_frame; - int is_bipred_frame; - int is_src_frame_ext_arf; - int avg_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation used for any frame int max_frame_bandwidth; // Maximum burst rate allowed for a frame. @@ -172,8 +139,6 @@ typedef struct { int q_1_frame; int q_2_frame; - // Auto frame-scaling variables. - int rf_level_maxq[RATE_FACTOR_LEVELS]; float_t arf_boost_factor; // Q index used for ALT frame int arf_q; @@ -196,7 +161,7 @@ int av1_rc_get_default_min_gf_interval(int width, int height, double framerate); // Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to // be passed in to ensure that the max_gf_interval returned is at least as bis // as that. -int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate); +int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval); // Generally at the high level, the following flow is expected // to be enforced for rate control: @@ -221,8 +186,13 @@ int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate); // Functions to set parameters for encoding before the actual // encode_frame_to_data_rate() function. 
-void av1_rc_get_one_pass_vbr_params(struct AV1_COMP *cpi); -void av1_rc_get_one_pass_cbr_params(struct AV1_COMP *cpi); +struct EncodeFrameParams; +void av1_rc_get_one_pass_vbr_params( + struct AV1_COMP *cpi, uint8_t *const frame_update_type, + struct EncodeFrameParams *const frame_params, unsigned int frame_flags); +void av1_rc_get_one_pass_cbr_params( + struct AV1_COMP *cpi, uint8_t *const frame_update_type, + struct EncodeFrameParams *const frame_params, unsigned int frame_flags); // Post encode update of the rate control parameters based // on bytes used @@ -262,7 +232,14 @@ int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi, int target); int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi, - int target); + int target, uint8_t frame_update_type); + +// Find q_index corresponding to desired_q, within [best_qindex, worst_qindex]. +// To be precise, 'q_index' is the smallest integer, for which the corresponding +// q >= desired_q. +// If no such q index is found, returns 'worst_qindex'. 
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, + int best_qindex, int worst_qindex); // Computes a q delta (in "q index" terms) to get from a starting q value // to a target q value @@ -275,7 +252,7 @@ int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, int qindex, double rate_target_ratio, aom_bit_depth_t bit_depth); -int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int rf_level, int q); +int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q); void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height); @@ -286,10 +263,6 @@ void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height); int av1_resize_one_pass_cbr(struct AV1_COMP *cpi); -void av1_configure_buffer_updates(struct AV1_COMP *cpi); - -void av1_estimate_qp_gop(struct AV1_COMP *cpi); - #ifdef __cplusplus } // extern "C" #endif diff --git a/libaom/av1/encoder/rd.c b/libaom/av1/encoder/rd.c index 510bb3b..d78e269 100644 --- a/libaom/av1/encoder/rd.c +++ b/libaom/av1/encoder/rd.c @@ -344,13 +344,7 @@ void av1_init_me_luts(void) { static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, 8, 8, 4, 4, 2, 2, 1, 0 }; static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { - 128, 144, 128, 128, 144, - // TODO(zoeliu): To adjust further following factor values. 
- 128, 128, 128, - // TODO(weitinglin): We should investigate if the values should be the same - // as the value used by OVERLAY frame - 144, // INTNL_OVERLAY_UPDATE - 128 // INTNL_ARF_UPDATE + 128, 144, 128, 128, 144, 144, 128 }; int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) { @@ -508,6 +502,17 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, av1_cost_tokens_from_cdf(pcost->base_cost[ctx], fc->coeff_base_cdf[tx_size][plane][ctx], NULL); + for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) { + pcost->base_cost[ctx][4] = 0; + pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] + + av1_cost_literal(1) - + pcost->base_cost[ctx][0]; + pcost->base_cost[ctx][6] = + pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1]; + pcost->base_cost[ctx][7] = + pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2]; + } + for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx) av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx], fc->eob_extra_cdf[tx_size][plane][ctx], NULL); @@ -538,6 +543,14 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc, // printf("%5d ", pcost->lps_cost[ctx][i]); // printf("\n"); } + for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) { + pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] = + pcost->lps_cost[ctx][0]; + for (int i = 1; i <= COEFF_BASE_RANGE; ++i) { + pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] = + pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1]; + } + } } } } @@ -684,6 +697,7 @@ static double interp_cubic(const double *p, double x) { x * (3.0 * (p[1] - p[2]) + p[3] - p[0]))); } +/* static double interp_bicubic(const double *p, int p_stride, double x, double y) { double q[4]; @@ -693,441 +707,224 @@ static double interp_bicubic(const double *p, int p_stride, double x, q[3] = interp_cubic(p + 3 * p_stride, x); return interp_cubic(q, y); } +*/ -static const double interp_rgrid_surf[65 * 18] = { - 0.104019, 0.245714, 0.293686, 0.358635, 0.382167, 0.412446, - 0.419955, 0.421388, 0.426672, 0.427990, 0.428531, 
0.456868, - 0.569880, 0.638822, 1.016319, 2.143453, 3.565229, 4.720880, - 0.124618, 0.294211, 0.352023, 0.429991, 0.458206, 0.494510, - 0.503513, 0.505232, 0.511566, 0.513234, 0.519365, 0.570225, - 0.697373, 0.840624, 1.462198, 3.289054, 6.256517, 6.852788, - 0.118630, 0.269669, 0.346620, 0.430999, 0.459385, 0.495783, - 0.504808, 0.506532, 0.512884, 0.514988, 0.543437, 0.662772, - 0.795876, 1.313596, 2.403841, 4.163098, 7.440589, 8.616275, - 0.093329, 0.168205, 0.321320, 0.430607, 0.459385, 0.495783, - 0.504813, 0.506548, 0.512975, 0.520662, 0.571659, 0.701841, - 1.010727, 2.138851, 3.460626, 6.317955, 10.098127, 14.418553, - 0.087021, 0.142905, 0.315011, 0.430509, 0.459385, 0.495787, - 0.505075, 0.507599, 0.513584, 0.543182, 0.669941, 0.825620, - 1.362800, 2.572187, 4.205047, 7.498399, 12.303118, 16.641735, - 0.086923, 0.142513, 0.314913, 0.430508, 0.459385, 0.495803, - 0.506126, 0.511816, 0.514810, 0.549705, 0.725350, 1.127334, - 2.168597, 3.463686, 6.318605, 10.162284, 18.556041, 19.847042, - 0.086923, 0.142513, 0.314913, 0.430506, 0.459376, 0.495805, - 0.506388, 0.512954, 0.520772, 0.580215, 0.810474, 1.391548, - 2.579442, 4.205160, 7.498399, 12.381597, 21.703618, 24.015457, - 0.086923, 0.142513, 0.314911, 0.430353, 0.458765, 0.495652, - 0.506391, 0.513406, 0.544098, 0.702950, 1.121860, 2.168961, - 3.463798, 6.318607, 10.162284, 18.685361, 28.188192, 37.638872, - 0.086923, 0.142513, 0.314901, 0.429742, 0.456313, 0.495045, - 0.506484, 0.519195, 0.580104, 0.810126, 1.391462, 2.579441, - 4.205160, 7.498399, 12.381597, 21.848607, 33.367199, 42.623190, - 0.086923, 0.142513, 0.314899, 0.429589, 0.455706, 0.495155, - 0.507882, 0.542426, 0.702360, 1.119921, 2.168478, 3.463791, - 6.318607, 10.162284, 18.685361, 28.345760, 47.802028, 49.163533, - 0.086924, 0.142548, 0.315086, 0.429842, 0.455870, 0.496336, - 0.512412, 0.556953, 0.773373, 1.266396, 2.548277, 4.204676, - 7.498399, 12.381597, 21.848607, 33.548250, 54.301011, 56.262859, - 0.087067, 0.144957, 0.327436, 
0.446616, 0.466362, 0.505706, - 0.522077, 0.610747, 0.972543, 1.666916, 3.338812, 6.316669, - 10.162284, 18.685361, 28.345760, 48.065311, 66.145302, 78.396020, - 0.094295, 0.164235, 0.393722, 0.534219, 0.530922, 0.579308, - 0.603889, 0.760870, 1.229961, 2.423214, 4.173513, 7.497916, - 12.381597, 21.848607, 33.548250, 54.589585, 74.875848, 86.468182, - 0.124096, 0.213005, 0.497188, 0.665176, 0.685973, 0.800200, - 0.911394, 1.077971, 1.677290, 3.332129, 6.314960, 10.162257, - 18.685361, 28.345760, 48.065311, 66.453506, 98.275189, 96.862588, - 0.140999, 0.270140, 0.658212, 0.867661, 0.970183, 1.149516, - 1.480599, 1.664833, 2.421893, 3.857981, 7.418830, 12.380371, - 21.848607, 33.548250, 54.589585, 75.188867, 106.657971, 99.762997, - 0.178353, 0.398001, 0.988462, 1.241473, 1.340967, 1.713568, - 2.335030, 2.701432, 3.348532, 5.077158, 9.829903, 18.676528, - 28.345700, 48.065311, 66.453506, 98.588283, 117.057193, 101.130722, - 0.281079, 0.548300, 1.395825, 1.780770, 2.000508, 2.702964, - 3.638454, 4.573843, 5.051641, 7.079129, 11.293332, 21.594861, - 33.544335, 54.589585, 75.188867, 106.971065, 119.957601, 101.466632, - 0.476762, 0.842189, 2.019678, 2.723895, 3.188467, 4.011610, - 5.545111, 7.508984, 8.176339, 9.774504, 14.720782, 27.334416, - 48.049609, 66.453506, 98.588283, 117.370357, 121.329855, 101.509242, - 0.993999, 1.520111, 3.013605, 4.203530, 4.982992, 6.074944, - 8.583581, 11.818375, 14.192544, 14.937517, 21.258160, 33.305953, - 54.585735, 75.188867, 106.971135, 120.279824, 121.976055, 102.690130, - 1.776487, 2.613655, 4.356487, 6.161726, 7.622196, 9.464193, - 13.077233, 18.051656, 23.221051, 24.080068, 30.085038, 48.345269, - 66.457698, 98.588353, 117.379415, 121.976128, 124.356210, 107.713202, - 3.191085, 4.495201, 5.686033, 8.365566, 11.275339, 14.706437, - 20.300969, 28.152237, 35.688355, 39.341382, 41.030743, 55.752262, - 75.211764, 106.980285, 120.608403, 124.680746, 130.222528, 112.260098, - 6.136611, 7.305215, 7.272532, 10.646713, 15.630815, 
22.383168, - 31.349131, 42.419822, 52.301680, 58.983454, 58.915405, 69.161305, - 98.992460, 117.713855, 124.344836, 130.623638, 138.442401, 127.846670, - 11.707980, 13.490761, 11.640845, 14.176132, 22.131124, 33.776462, - 47.365711, 61.603834, 75.281056, 83.463985, 85.510533, 86.026513, - 108.787480, 123.031136, 130.607284, 138.954406, 160.867784, 158.958882, - 27.062874, 32.195139, 24.147297, 22.114632, 35.580506, 52.551674, - 71.652956, 88.606776, 102.107193, 110.703186, 114.398733, 111.118539, - 121.503578, 132.455924, 139.490806, 161.412674, 193.563210, 172.203945, - 35.625692, 47.953028, 42.639820, 42.276254, 58.815664, 84.977282, - 110.656412, 126.168446, 134.658126, 140.604482, 144.006012, 141.702382, - 140.125323, 153.122630, 164.748041, 194.156197, 206.854650, 174.013079, - 49.516447, 65.335381, 71.738306, 81.872819, 98.400740, 136.840488, - 163.775802, 169.440078, 172.747876, 171.222919, 171.679604, 172.173550, - 168.200129, 187.617133, 199.683394, 207.768200, 210.062520, 175.478356, - 60.341673, 92.487135, 119.907299, 136.068010, 144.778950, 189.443534, - 220.120077, 219.641635, 214.616503, 205.894657, 198.453924, 200.013069, - 195.938103, 206.118661, 210.447375, 212.061379, 216.078218, 181.162805, - 78.422159, 112.242899, 158.416312, 181.404320, 193.188690, 229.296967, - 270.461799, 275.168977, 256.511701, 244.706786, 231.344608, 226.065087, - 222.248618, 218.662324, 217.966722, 218.248574, 218.818588, 182.740573, - 88.713664, 123.594164, 172.928179, 213.781414, 245.800351, 252.063414, - 313.283141, 331.703831, 305.866639, 285.177142, 269.759635, 251.988739, - 245.998388, 232.688076, 230.588702, 230.882657, 230.319053, 192.120741, - 102.540561, 152.905927, 189.137131, 241.806756, 273.868497, 284.258017, - 339.689853, 373.561104, 362.657463, 326.291984, 311.922687, 290.460189, - 276.774381, 273.012072, 277.751792, 279.123748, 278.820447, 233.813798, - 132.983118, 176.307242, 197.415684, 243.307787, 280.893995, 332.922370, - 340.329043, 404.530166, 
419.475405, 375.775209, 351.300889, 340.042759, - 315.683832, 306.123530, 306.359319, 306.733063, 307.609556, 261.647847, - 149.579109, 185.925581, 207.937033, 245.159084, 301.890957, 350.040480, - 352.250771, 418.742329, 458.112686, 430.125208, 386.460441, 380.346839, - 354.679150, 337.305620, 334.504124, 335.889932, 341.060725, 286.898578, - 153.576812, 202.105624, 219.366967, 248.524506, 314.255692, 350.607526, - 390.567688, 408.629209, 488.000213, 480.563823, 432.461799, 410.412624, - 398.607371, 400.188740, 402.780916, 408.853470, 430.449735, 363.777088, - 161.353129, 214.848904, 231.549852, 258.536466, 313.163177, 368.140577, - 412.136393, 413.409032, 499.838438, 519.571063, 485.833867, 444.562715, - 435.738129, 442.358549, 450.166531, 453.208524, 458.424358, 385.823139, - 175.109034, 227.608058, 250.069563, 286.101747, 312.256740, 378.421485, - 413.344147, 435.058646, 476.960941, 542.448886, 530.189154, 495.408402, - 475.326752, 465.017144, 464.694045, 465.144689, 466.905382, 398.669138, - 184.750180, 240.766694, 283.240772, 305.480150, 322.409001, 374.526162, - 427.141326, 452.840323, 472.604139, 545.366105, 567.676694, 541.666203, - 509.591873, 492.044219, 492.778569, 493.765684, 493.235693, 413.684325, - 194.728357, 254.928927, 289.991157, 300.193195, 324.194589, 371.563147, - 439.226438, 468.295088, 495.654854, 533.506353, 587.476353, 578.298989, - 548.041942, 527.393885, 538.965146, 545.070442, 544.295454, 454.012211, - 205.195287, 283.135677, 297.921431, 319.295927, 355.621830, 392.466463, - 446.696167, 485.053519, 516.426615, 532.264584, 588.481600, 615.906737, - 589.319634, 555.754316, 558.389367, 569.094521, 569.779764, 475.384946, - 218.552054, 298.511016, 319.188338, 351.781666, 372.789510, 412.827434, - 464.569387, 506.270203, 533.049810, 553.347364, 580.644599, 632.759854, - 622.235843, 569.960552, 580.799340, 586.553714, 579.488366, 491.826482, - 244.803348, 299.790203, 324.187975, 363.280782, 403.710443, 441.724083, - 492.732682, 534.722691, 
552.193622, 575.112647, 586.097705, 635.224970, - 644.642944, 606.017786, 640.321218, 642.316989, 616.397020, 548.300111, - 256.957358, 318.638991, 355.063346, 389.889307, 433.607315, 468.209001, - 515.178157, 573.556591, 578.113115, 587.246475, 601.762801, 638.454644, - 656.574853, 641.184609, 676.908189, 684.198162, 678.387412, 574.805864, - 251.211502, 323.448532, 364.227424, 411.792704, 462.226488, 503.572288, - 549.299249, 599.124071, 601.227977, 597.118176, 613.247552, 633.278532, - 658.074755, 664.930719, 685.731531, 693.632845, 693.076350, 578.326477, - 267.695377, 354.273736, 389.976833, 438.518178, 493.332686, 544.343027, - 588.895829, 620.206193, 628.327410, 606.067827, 620.998532, 657.985256, - 683.936059, 691.345257, 693.894723, 695.175306, 693.618786, 578.517148, - 274.290725, 363.465288, 411.808596, 463.369805, 515.310226, 581.009306, - 613.070738, 636.638714, 647.333929, 629.867603, 644.646319, 687.796202, - 702.859596, 713.495479, 704.068069, 704.991807, 704.188594, 587.283658, - 302.538449, 389.174737, 438.518422, 493.398902, 547.662399, 601.981814, - 624.773046, 641.629484, 644.699451, 645.848784, 668.033340, 703.643523, - 707.422408, 717.329600, 726.298973, 744.127507, 745.365167, 617.954068, - 310.328188, 410.984766, 463.369805, 515.315010, 581.309832, 613.787792, - 634.988538, 654.145284, 662.632978, 668.413496, 706.494057, 750.545471, - 730.724808, 730.002100, 743.625262, 750.801609, 745.308457, 606.505800, - 329.948756, 437.600191, 493.398902, 547.661910, 601.917884, 622.557745, - 633.244395, 644.055898, 648.224221, 665.062911, 763.555733, 812.391078, - 769.063582, 744.865168, 727.579796, 724.950408, 722.179707, 598.564510, - 350.848328, 462.437458, 515.315010, 581.309823, 613.779123, 634.465309, - 652.056257, 662.179143, 671.466297, 726.881256, 819.824030, 880.232789, - 810.371672, 754.246481, 725.053473, 724.253390, 723.503395, 603.394909, - 373.704088, 492.408266, 547.661910, 601.917884, 622.557620, 633.236320, - 644.023513, 648.232514, 
666.381639, 785.498283, 929.441612, 999.772800, - 890.339033, 775.852504, 731.840181, 726.905100, 725.251844, 604.899901, - 394.473422, 514.261306, 581.309823, 613.779123, 634.465309, 652.056257, - 662.179143, 671.466557, 727.134512, 835.764144, 981.747089, 1018.462934, - 939.686967, 811.276731, 739.398459, 727.365647, 725.285425, 604.923525, - 419.976505, 546.538939, 601.917884, 622.557620, 633.236320, 644.023513, - 648.232514, 666.381639, 785.545191, 932.841398, 1036.609617, 1026.945092, - 963.822765, 840.827315, 755.532423, 730.241865, 725.366847, 604.924155, - 437.281359, 580.116337, 613.779123, 634.465309, 652.056257, 662.179143, - 671.466557, 727.134512, 835.764859, 981.996194, 1031.896881, 1002.544732, - 881.157178, 828.151494, 799.340975, 751.314325, 728.316587, 605.005504, - 464.713920, 600.649281, 622.557620, 633.236320, 644.023513, 648.232514, - 666.381639, 785.545191, 932.841398, 1036.735329, 1035.037004, 995.478339, - 858.093733, 823.471976, 819.881754, 798.749289, 749.440463, 607.955244, - 495.880237, 612.473139, 634.465309, 652.056257, 662.179143, 671.466557, - 727.134512, 835.764859, 981.996194, 1032.339788, 1031.105117, 995.303259, - 857.733663, 823.435877, 822.822791, 819.873050, 796.882480, 629.038445, - 510.391280, 621.158273, 633.236320, 644.023513, 648.232514, 666.381639, - 785.545191, 932.841398, 1036.735329, 1035.566013, 1029.599350, 994.926093, - 857.645648, 823.435143, 822.904139, 822.822791, 817.965681, 673.856962, - 514.588176, 632.947715, 652.056257, 662.179143, 671.466557, 727.134512, - 835.764859, 981.996194, 1032.339788, 1031.547475, 1023.835377, 972.158629, - 851.968626, 823.347128, 822.904770, 822.904139, 820.752301, 684.418900, - 520.013294, 631.668183, 644.023513, 648.232514, 666.381639, 785.545191, - 932.841398, 1036.735329, 1035.567378, 1029.776746, 1001.044108, 880.853721, - 829.201546, 822.994150, 822.904770, 822.904770, 820.792975, 684.582020, - 531.253628, 650.479606, 662.179143, 671.466557, 727.134512, 835.764859, - 
981.996194, 1032.339788, 1031.636855, 1029.601779, 995.366703, 858.086641, - 823.524524, 822.906135, 822.904770, 822.904770, 820.792975, 684.582020, - 528.531744, 642.424501, 648.232514, 666.381639, 785.545191, 932.841398, - 1036.735329, 1035.567378, 1030.219103, 1029.576226, 995.278687, 857.733663, - 823.436508, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, - 545.401164, 660.550678, 671.508859, 727.304161, 835.807162, 981.996850, - 1032.339788, 1031.636855, 1030.130788, 1029.487827, 994.925709, 857.645648, - 823.435143, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, - 537.684760, 646.650947, 669.110131, 796.487512, 935.569890, 1036.777631, - 1035.567378, 1030.219103, 1030.018584, 1023.810805, 972.158629, 851.968626, - 823.347128, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, - 552.408370, 670.001885, 738.246482, 879.690154, 992.939171, 1032.509436, - 1031.636855, 1030.132153, 1029.665223, 1001.043724, 880.853721, 829.201546, - 822.994150, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, - 539.835902, 667.496388, 799.216004, 946.512211, 1039.506123, 1035.609680, - 1030.219103, 1030.107964, 1029.577207, 995.366703, 858.086641, 823.524524, - 822.906135, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020, - 558.362529, 734.277451, 877.197218, 990.478243, 1029.908393, 1028.993978, - 1027.488620, 1027.464048, 1026.933674, 992.724534, 855.532488, 821.323349, - 820.792975, 820.792975, 820.792975, 820.792975, 818.686600, 682.825198, - 453.127195, 649.075095, 780.278390, 867.165890, 862.469711, 857.067460, - 856.956321, 856.955937, 856.513579, 827.981461, 713.556496, 685.024378, - 684.582020, 684.582020, 684.582020, 684.582020, 682.825198, 569.510056, +static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2 }; -static const double interp_dgrid_surf[65 * 18] = { - 10.650434, 12.204694, 12.040917, 11.843008, 11.845578, 12.051535, 
12.103583, - 12.136780, 12.266709, 12.299107, 12.299673, 12.303120, 12.316337, 12.293431, - 12.092165, 11.602421, 11.141559, 8.864495, 12.770003, 14.634889, 14.437149, - 14.199413, 14.202487, 14.449423, 14.511827, 14.551629, 14.707410, 14.746265, - 14.747610, 14.753705, 14.762194, 14.699395, 14.390525, 13.690970, 12.874168, - 10.367121, 12.832328, 14.790730, 14.503765, 14.236403, 14.239028, 14.486600, - 14.549164, 14.589069, 14.745250, 14.784258, 14.788320, 14.801930, 14.762798, - 14.499088, 14.021544, 13.469684, 12.661560, 10.108384, 12.950520, 15.264726, - 14.621957, 14.238236, 14.239028, 14.486601, 14.549264, 14.589469, 14.745361, - 14.784949, 14.791572, 14.798652, 14.660251, 14.119394, 13.651131, 12.935657, - 12.176082, 9.228999, 12.979992, 15.382918, 14.651428, 14.238693, 14.239028, - 14.486701, 14.555710, 14.615321, 14.751849, 14.787700, 14.797104, 14.743189, - 14.475057, 13.944406, 13.450468, 12.687876, 11.824993, 8.906683, 12.980449, - 15.384750, 14.651885, 14.238700, 14.239028, 14.487102, 14.581562, 14.718998, - 14.777721, 14.788445, 14.778661, 14.582790, 14.099785, 13.649637, 12.935359, - 12.201859, 10.891931, 8.482221, 12.980449, 15.384750, 14.651886, 14.238801, - 14.239434, 14.487303, 14.588010, 14.744860, 14.784773, 14.786094, 14.735647, - 14.455704, 13.939591, 13.450393, 12.687876, 11.849334, 10.476658, 8.043672, - 12.980449, 15.384750, 14.651987, 14.245320, 14.265579, 14.493824, 14.588211, - 14.745312, 14.787263, 14.775934, 14.582036, 14.099475, 13.649563, 12.935358, - 12.201859, 10.911285, 9.730570, 6.696921, 12.980449, 15.384750, 14.652393, - 14.271466, 14.370434, 14.520069, 14.589027, 14.746028, 14.785482, 14.735605, - 14.455693, 13.939590, 13.450393, 12.687876, 11.849334, 10.494514, 9.195398, - 6.215460, 12.980449, 15.384750, 14.652494, 14.277985, 14.396679, 14.533035, - 14.615021, 14.754825, 14.775610, 14.582796, 14.099664, 13.649565, 12.935358, - 12.201859, 10.911285, 9.747361, 7.779960, 5.617541, 12.980448, 15.384731, - 14.652415, 14.278078, 
14.397578, 14.559053, 14.718657, 14.776398, 14.747044, - 14.504690, 13.951810, 13.450583, 12.687876, 11.849334, 10.494514, 9.210817, - 7.210003, 5.164575, 12.980446, 15.383448, 14.647073, 14.277541, 14.403813, - 14.569546, 14.744956, 14.765103, 14.629073, 14.296161, 13.698573, 12.936118, - 12.201859, 10.911285, 9.747361, 7.790897, 6.322998, 3.931551, 12.981550, - 15.376916, 14.615597, 14.274820, 14.437479, 14.575942, 14.707492, 14.734111, - 14.515975, 14.000806, 13.462803, 12.688066, 11.849334, 10.494514, 9.210817, - 7.219566, 5.781392, 3.486081, 12.991899, 15.376201, 14.579444, 14.296898, - 14.473361, 14.522910, 14.491600, 14.543267, 14.288580, 13.700311, 12.936579, - 12.201867, 10.911285, 9.747361, 7.790897, 6.331506, 4.480348, 2.923138, - 13.019848, 15.383477, 14.582260, 14.385262, 14.452673, 14.436019, 14.238174, - 14.255993, 13.977481, 13.532342, 12.705591, 11.849605, 10.494514, 9.210817, - 7.219566, 5.789642, 4.018194, 2.766222, 13.028558, 15.315782, 14.439141, - 14.326286, 14.452429, 14.311731, 14.033235, 13.922587, 13.665868, 13.207897, - 12.274375, 10.912967, 9.747371, 7.790897, 6.331506, 4.488594, 3.454993, - 2.692682, 12.992752, 15.321471, 14.409573, 14.236340, 14.322969, 14.049072, - 13.764823, 13.479242, 13.250105, 12.759133, 12.019174, 10.532951, 9.211409, - 7.219566, 5.789642, 4.026440, 3.298077, 2.674624, 12.945493, 15.276596, - 14.315745, 14.026198, 14.085774, 13.844563, 13.447576, 12.964935, 12.735525, - 12.288592, 11.511693, 9.900227, 7.793270, 6.331506, 4.488594, 3.463236, - 3.224318, 2.672433, 12.757570, 15.056661, 14.095011, 13.722362, 13.812624, - 13.608480, 13.021206, 12.367627, 11.937931, 11.581049, 10.599552, 9.247860, - 7.220151, 5.789642, 4.026437, 3.305882, 3.191260, 2.615317, 12.581293, - 14.824658, 13.909074, 13.496158, 13.491402, 13.221550, 12.514140, 11.677229, - 10.936895, 10.619912, 9.634779, 7.763570, 6.331082, 4.488590, 3.462798, - 3.216460, 3.076315, 2.373499, 12.283499, 14.455760, 13.890593, 13.427587, - 13.183783, 12.763833, 
11.861006, 10.740618, 9.820756, 9.354945, 8.669862, - 7.123268, 5.787860, 4.025994, 3.290000, 3.084410, 2.810905, 2.222916, - 12.010893, 14.300919, 13.986624, 13.484026, 13.025385, 12.224281, 11.064265, - 9.631040, 8.594396, 8.003736, 7.561587, 6.274418, 4.466637, 3.446574, - 3.102467, 2.816989, 2.598688, 1.951541, 11.581477, 13.831132, 13.632027, - 13.380414, 12.807880, 11.665651, 10.218236, 8.562237, 7.222614, 6.611808, - 6.261676, 5.402793, 3.938544, 3.174375, 2.818166, 2.602758, 2.213911, - 1.434763, 11.050735, 12.893449, 12.363152, 12.712829, 12.012961, 10.887854, - 9.109699, 7.421701, 5.965603, 5.272129, 4.991435, 4.423000, 3.369988, - 2.800371, 2.593901, 2.217431, 1.670917, 1.215265, 10.641194, 11.766277, - 10.777082, 10.972917, 10.689298, 9.701545, 7.719947, 6.145654, 4.872442, - 4.099600, 3.880934, 3.514159, 2.786474, 2.368963, 2.162376, 1.673670, - 1.450770, 1.185424, 10.071964, 11.107701, 9.172361, 8.551313, 8.412080, - 7.641397, 6.174246, 4.853916, 3.904549, 3.246810, 2.959903, 2.785066, - 2.240001, 1.793166, 1.585520, 1.449824, 1.405368, 1.168856, 9.213182, - 9.173278, 7.219231, 6.242951, 5.626013, 5.768007, 4.908666, 3.809589, - 3.115109, 2.617899, 2.274793, 2.172960, 1.838597, 1.505915, 1.414333, - 1.392666, 1.338173, 1.105611, 7.365015, 7.471370, 5.622346, 4.520127, - 3.936272, 4.208822, 3.623024, 2.977794, 2.450003, 2.097261, 1.824090, - 1.643270, 1.473525, 1.351388, 1.327504, 1.323865, 1.307894, 1.088234, - 6.198210, 6.580712, 4.682511, 3.416952, 2.941929, 2.766637, 2.650686, - 2.315439, 1.925838, 1.659784, 1.464419, 1.252806, 1.162722, 1.197518, - 1.199875, 1.197365, 1.194040, 0.995797, 5.402507, 5.055466, 3.728724, - 2.624359, 2.165810, 1.943189, 1.918190, 1.738078, 1.516328, 1.290520, - 1.155793, 1.015962, 0.881900, 0.807203, 0.754242, 0.743378, 0.740288, - 0.614158, 3.937867, 3.862507, 2.884664, 2.088147, 1.648496, 1.473584, - 1.340123, 1.291769, 1.165381, 1.000224, 0.893316, 0.821333, 0.691363, - 0.610501, 0.586766, 0.583762, 0.577840, 
0.468733, 3.104660, 3.181078, - 2.420208, 1.747442, 1.297956, 1.109835, 0.970385, 0.943229, 0.876923, - 0.777584, 0.678183, 0.628623, 0.553745, 0.523430, 0.519490, 0.514394, - 0.492259, 0.403172, 2.593833, 2.533720, 2.010452, 1.480944, 1.060302, - 0.846383, 0.738703, 0.673144, 0.658010, 0.592449, 0.518236, 0.470335, - 0.425088, 0.393168, 0.378116, 0.355846, 0.275469, 0.213128, 2.176988, - 2.089575, 1.671284, 1.225008, 0.895382, 0.672008, 0.566241, 0.496746, - 0.488005, 0.449874, 0.400899, 0.354002, 0.318150, 0.281533, 0.238545, - 0.224159, 0.202399, 0.160681, 1.874679, 1.769165, 1.430124, 1.068727, - 0.780272, 0.557801, 0.441643, 0.377256, 0.352957, 0.338452, 0.304965, - 0.273172, 0.240052, 0.208724, 0.193431, 0.190845, 0.185025, 0.138166, - 1.590226, 1.502830, 1.193127, 0.917885, 0.670432, 0.474546, 0.355420, - 0.292305, 0.259035, 0.249937, 0.232079, 0.208943, 0.181936, 0.160038, - 0.152257, 0.151235, 0.149583, 0.120747, 1.331730, 1.255907, 1.012871, - 0.778422, 0.578977, 0.412432, 0.293155, 0.231824, 0.197187, 0.183921, - 0.174876, 0.157252, 0.140263, 0.127050, 0.110244, 0.105041, 0.104323, - 0.086944, 1.153994, 1.118771, 0.822355, 0.612321, 0.478249, 0.348222, - 0.247408, 0.186141, 0.152714, 0.135445, 0.129810, 0.119994, 0.115619, - 0.131626, 0.095612, 0.079343, 0.077502, 0.064550, 0.946317, 0.925894, - 0.677969, 0.499906, 0.397101, 0.297931, 0.214467, 0.152333, 0.120731, - 0.102686, 0.095062, 0.090361, 0.122319, 0.240194, 0.112687, 0.070690, - 0.070461, 0.054194, 0.824155, 0.787241, 0.581856, 0.419228, 0.313167, - 0.245582, 0.183500, 0.128101, 0.096577, 0.080267, 0.071022, 0.066851, - 0.085754, 0.154163, 0.075884, 0.052401, 0.054270, 0.026656, 0.716310, - 0.671378, 0.489580, 0.349569, 0.256155, 0.206343, 0.157853, 0.111950, - 0.079271, 0.062518, 0.053441, 0.049660, 0.051400, 0.063778, 0.039993, - 0.029133, 0.023382, 0.013725, 0.614125, 0.579096, 0.417126, 0.299465, - 0.217849, 0.165515, 0.129040, 0.093127, 0.065612, 0.049543, 0.041429, - 0.036850, 0.034416, 
0.033989, 0.024216, 0.017377, 0.014833, 0.011987, - 0.520407, 0.487239, 0.349473, 0.251741, 0.184897, 0.135813, 0.107098, - 0.073607, 0.053938, 0.040531, 0.032931, 0.028876, 0.025759, 0.022168, - 0.016739, 0.014638, 0.014333, 0.011947, 0.449954, 0.415124, 0.299452, - 0.216942, 0.158874, 0.115334, 0.088821, 0.060105, 0.042610, 0.032566, - 0.026903, 0.023123, 0.019913, 0.016835, 0.014306, 0.013625, 0.013535, - 0.011284, 0.377618, 0.347773, 0.251741, 0.184839, 0.132857, 0.095439, - 0.070462, 0.052244, 0.036078, 0.026025, 0.021518, 0.018487, 0.015361, - 0.012905, 0.011470, 0.010569, 0.010283, 0.008297, 0.319953, 0.297976, - 0.216942, 0.158842, 0.113280, 0.080426, 0.057367, 0.041987, 0.030135, - 0.022295, 0.017901, 0.015121, 0.012224, 0.010035, 0.009353, 0.009108, - 0.008695, 0.006139, 0.267864, 0.250502, 0.184839, 0.132851, 0.095039, - 0.068220, 0.049135, 0.035315, 0.025144, 0.018237, 0.013857, 0.012094, - 0.009715, 0.007743, 0.006937, 0.006446, 0.006243, 0.004929, 0.230449, - 0.215895, 0.158842, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959, - 0.021866, 0.015673, 0.012133, 0.010083, 0.007801, 0.006053, 0.005401, - 0.003834, 0.003429, 0.002851, 0.193984, 0.183963, 0.132851, 0.095039, - 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013175, 0.010422, - 0.008491, 0.006397, 0.004567, 0.003494, 0.002933, 0.002825, 0.002355, - 0.167298, 0.158088, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959, - 0.021866, 0.015669, 0.011955, 0.009257, 0.007051, 0.005543, 0.003905, - 0.002984, 0.002825, 0.002814, 0.002347, 0.143228, 0.132220, 0.095039, - 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, - 0.008403, 0.006661, 0.005378, 0.003545, 0.002876, 0.002818, 0.002814, - 0.002347, 0.122934, 0.112735, 0.080417, 0.057174, 0.041304, 0.029959, - 0.021866, 0.015669, 0.011955, 0.009258, 0.007182, 0.006012, 0.003762, - 0.002866, 0.002739, 0.002788, 0.002810, 0.002347, 0.101934, 0.094569, - 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, - 
0.008405, 0.006797, 0.005845, 0.003333, 0.002703, 0.002695, 0.002723, - 0.002781, 0.002343, 0.086702, 0.080014, 0.057174, 0.041304, 0.029959, - 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006533, 0.005839, - 0.003326, 0.002700, 0.002690, 0.002694, 0.002716, 0.002314, 0.073040, - 0.067886, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, - 0.008405, 0.006807, 0.006468, 0.005831, 0.003325, 0.002700, 0.002690, - 0.002690, 0.002687, 0.002253, 0.061685, 0.056890, 0.041304, 0.029959, - 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006542, 0.006360, - 0.005416, 0.003221, 0.002698, 0.002690, 0.002690, 0.002683, 0.002238, - 0.052465, 0.048894, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394, - 0.008405, 0.006807, 0.006472, 0.005943, 0.003748, 0.002805, 0.002692, - 0.002690, 0.002690, 0.002683, 0.002238, 0.043838, 0.041101, 0.029959, - 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006543, 0.006465, - 0.005839, 0.003333, 0.002702, 0.002690, 0.002690, 0.002690, 0.002683, - 0.002238, 0.037824, 0.035133, 0.025140, 0.018150, 0.013174, 0.010394, - 0.008405, 0.006807, 0.006480, 0.006464, 0.005838, 0.003326, 0.002700, - 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.031865, 0.029815, - 0.021866, 0.015668, 0.011955, 0.009258, 0.007190, 0.006543, 0.006475, - 0.006462, 0.005831, 0.003325, 0.002700, 0.002690, 0.002690, 0.002690, - 0.002683, 0.002238, 0.027150, 0.025016, 0.018128, 0.013083, 0.010371, - 0.008405, 0.006807, 0.006480, 0.006472, 0.006359, 0.005416, 0.003221, - 0.002698, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.023094, - 0.021760, 0.015577, 0.011590, 0.009167, 0.007188, 0.006543, 0.006475, - 0.006466, 0.005943, 0.003748, 0.002805, 0.002692, 0.002690, 0.002690, - 0.002690, 0.002683, 0.002238, 0.019269, 0.018038, 0.013060, 0.010280, - 0.008382, 0.006806, 0.006480, 0.006474, 0.006464, 0.005839, 0.003333, - 0.002702, 0.002690, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, - 0.016874, 0.015472, 0.011566, 0.009148, 0.007171, 
0.006527, 0.006458, - 0.006457, 0.006447, 0.005823, 0.003318, 0.002693, 0.002683, 0.002683, - 0.002683, 0.002683, 0.002676, 0.002232, 0.011968, 0.011056, 0.008762, - 0.007219, 0.005717, 0.005391, 0.005386, 0.005386, 0.005377, 0.004856, - 0.002767, 0.002246, 0.002238, 0.002238, 0.002238, 0.002238, 0.002232, - 0.001862, +static int sse_norm_curvfit_model_cat_lookup(double sse_norm) { + return (sse_norm > 16.0); +} + +// Models distortion by sse using a logistic function on +// l = log2(sse / q^2) as: +// dbysse = 16 / (1 + k exp(l + c)) +static double get_dbysse_logistic(double l, double c, double k) { + const double A = 16.0; + const double dbysse = A / (1 + k * exp(l + c)); + return dbysse; +} + +// Models rate using a clamped linear function on +// l = log2(sse / q^2) as: +// rate = max(0, a + b * l) +static double get_rate_clamplinear(double l, double a, double b) { + const double rate = a + b * l; + return (rate < 0 ? 0 : rate); +} + +static const uint8_t bsize_surffit_model_cat_lookup[BLOCK_SIZES_ALL] = { + 0, 0, 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 0, 0, 2, 2, 4, 4 }; -void av1_model_rd_surffit(double xm, double yl, double *rate_f, - double *dist_f) { - const double x_start = -0.5; - const double x_end = 16.5; - const double x_step = 1; - const double y_start = -15.5; - const double y_end = 16.5; - const double y_step = 0.5; - const double epsilon = 1e-6; - const int stride = (int)rint((x_end - x_start) / x_step) + 1; - (void)y_end; +static const double surffit_rate_params[9][4] = { + { + 638.390212, + 2.253108, + 166.585650, + -3.939401, + }, + { + 5.256905, + 81.997240, + -1.321771, + 17.694216, + }, + { + -74.193045, + 72.431868, + -19.033152, + 15.407276, + }, + { + 416.770113, + 14.794188, + 167.686830, + -6.997756, + }, + { + 378.511276, + 9.558376, + 154.658843, + -6.635663, + }, + { + 277.818787, + 4.413180, + 150.317637, + -9.893038, + }, + { + 142.212132, + 11.542038, + 94.393964, + -5.518517, + }, + { + 219.100256, + 4.007421, + 108.932852, + 
-6.981310, + }, + { + 222.261971, + 3.251049, + 95.972916, + -5.609789, + }, +}; + +static const double surffit_dist_params[7] = { + 1.475844, 4.328362, -5.680233, -0.500994, 0.554585, 4.839478, -0.695837 +}; - xm = AOMMAX(xm, x_start + x_step + epsilon); - xm = AOMMIN(xm, x_end - x_step - epsilon); - yl = AOMMAX(yl, y_start + y_step + epsilon); - yl = AOMMIN(yl, y_end - y_step - epsilon); +static void rate_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm, + double *rpar) { + const int cat = bsize_surffit_model_cat_lookup[bsize]; + rpar[0] = surffit_rate_params[cat][0] + surffit_rate_params[cat][1] * xm; + rpar[1] = surffit_rate_params[cat][2] + surffit_rate_params[cat][3] * xm; +} - const double y = (yl - y_start) / y_step; - const double x = (xm - x_start) / x_step; +static void dist_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm, + double *dpar) { + (void)bsize; + const double *params = surffit_dist_params; + dpar[0] = params[0] + params[1] / (1 + exp((xm + params[2]) * params[3])); + dpar[1] = params[4] + params[5] * exp(params[6] * xm); +} - const int yi = (int)floor(y); - const int xi = (int)floor(x); - assert(xi > 0); - assert(yi > 0); +void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm, + double yl, double *rate_f, double *distbysse_f) { + (void)sse_norm; + double rpar[2], dpar[2]; + rate_surffit_model_params_lookup(bsize, xm, rpar); + dist_surffit_model_params_lookup(bsize, xm, dpar); - const double yo = y - yi; - const double xo = x - xi; - const double *prate = &interp_rgrid_surf[(yi - 1) * stride + (xi - 1)]; - const double *pdist = &interp_dgrid_surf[(yi - 1) * stride + (xi - 1)]; - *rate_f = interp_bicubic(prate, stride, xo, yo); - *dist_f = interp_bicubic(pdist, stride, xo, yo); + *rate_f = get_rate_clamplinear(yl, rpar[0], rpar[1]); + *distbysse_f = get_dbysse_logistic(yl, dpar[0], dpar[1]); } -static const double interp_rgrid_curv[65] = { - 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, - 0.000000, 
0.000000, 0.000000, 0.000000, 0.000000, 0.000000, - 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 4.759876, - 8.132086, 13.651828, 21.908271, 33.522054, 48.782376, 71.530983, - 106.728649, 151.942795, 199.893011, 242.850965, 283.933923, 322.154203, - 360.684608, 394.801656, 426.879017, 460.234313, 484.103987, 508.261495, - 536.486763, 558.196737, 586.285894, 614.764511, 634.166333, 647.706472, - 658.211478, 681.360407, 701.052141, 727.007310, 768.663973, 804.407660, - 884.627751, 1065.658131, 1238.875214, 1440.185176, 1678.377931, 1962.243390, - 2300.571467, 2702.152072, 3175.775119, 3730.230519, 4374.308184, 5116.798028, - 5966.489961, 6932.173897, 8022.639747, 9246.677424, 10613.076839, +static const double interp_rgrid_curv[4][65] = { + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 23.801499, 28.387688, 33.388795, 42.298282, + 41.525408, 51.597692, 49.566271, 54.632979, 60.321507, + 67.730678, 75.766165, 85.324032, 96.600012, 120.839562, + 173.917577, 255.974908, 354.107573, 458.063476, 562.345966, + 668.568424, 772.072881, 878.598490, 982.202274, 1082.708946, + 1188.037853, 1287.702240, 1395.588773, 1490.825830, 1584.231230, + 1691.386090, 1766.822555, 1869.630904, 1926.743565, 2002.949495, + 2047.431137, 2138.486068, 2154.743767, 2209.242472, 2277.593051, + 2290.996432, 2307.452938, 2343.567091, 2397.654644, 2469.425868, + 2558.591037, 2664.860422, 2787.944296, 2927.552932, 3083.396602, + 3255.185579, 3442.630134, 3645.440541, 3863.327072, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 8.998436, 9.439592, 9.731837, 10.865931, + 11.561347, 12.578139, 14.205101, 16.770584, 19.094853, + 21.330863, 23.298907, 26.901921, 34.501017, 57.891733, + 112.234763, 194.853189, 288.302032, 380.499422, 472.625309, + 560.226809, 647.928463, 734.155122, 817.489721, 906.265783, + 999.260562, 1094.489206, 
1197.062998, 1293.296825, 1378.926484, + 1472.760990, 1552.663779, 1635.196884, 1692.451951, 1759.741063, + 1822.162720, 1916.515921, 1966.686071, 2031.647506, 2033.700134, + 2087.847688, 2161.688858, 2242.536028, 2334.023491, 2436.337802, + 2549.665519, 2674.193198, 2810.107395, 2957.594666, 3116.841567, + 3288.034655, 3471.360486, 3667.005616, 3875.156602, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 2.377584, 2.557185, 2.732445, 2.851114, + 3.281800, 3.765589, 4.342578, 5.145582, 5.611038, + 6.642238, 7.945977, 11.800522, 17.346624, 37.501413, + 87.216800, 165.860942, 253.865564, 332.039345, 408.518863, + 478.120452, 547.268590, 616.067676, 680.022540, 753.863541, + 834.529973, 919.489191, 1008.264989, 1092.230318, 1173.971886, + 1249.514122, 1330.510941, 1399.523249, 1466.923387, 1530.533471, + 1586.515722, 1695.197774, 1746.648696, 1837.136959, 1909.075485, + 1975.074651, 2060.159200, 2155.335095, 2259.762505, 2373.710437, + 2497.447898, 2631.243895, 2775.367434, 2930.087523, 3095.673170, + 3272.393380, 3460.517161, 3660.313520, 3872.051464, 4096.000000, + }, + { + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.296997, 0.342545, 0.403097, 0.472889, + 0.614483, 0.842937, 1.050824, 1.326663, 1.717750, + 2.530591, 3.582302, 6.995373, 9.973335, 24.042464, + 56.598240, 113.680735, 180.018689, 231.050567, 266.101082, + 294.957934, 323.326511, 349.434429, 380.443211, 408.171987, + 441.214916, 475.716772, 512.900000, 551.186939, 592.364455, + 624.527378, 661.940693, 679.185473, 724.800679, 764.781792, + 873.050019, 950.299001, 939.292954, 1052.406153, 1033.893184, + 1112.182406, 1219.174326, 1337.296681, 1471.648357, 1622.492809, + 1790.093491, 1974.713858, 2176.617364, 2396.067465, 2633.327614, + 2888.661266, 3162.331876, 3454.602899, 3765.737789, 4096.000000, + }, }; -static const double 
interp_dgrid_curv[65] = { - 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, - 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.555776, 14.533692, - 14.439920, 14.257791, 13.977230, 13.623229, 13.064884, 12.355411, 11.560773, - 10.728960, 9.861975, 8.643612, 6.916021, 5.154769, 3.734940, 2.680051, - 1.925506, 1.408410, 1.042223, 0.767641, 0.565392, 0.420116, 0.310427, - 0.231711, 0.172999, 0.128293, 0.094992, 0.072171, 0.052972, 0.039354, - 0.029555, 0.022857, 0.016832, 0.013297, 0.000000, 0.000000, 0.000000, - 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, - 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, - 0.000000, 0.000000, +static const double interp_dgrid_curv[2][65] = { + { + 16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770, + 15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870, + 15.525918, 15.483820, 15.373330, 15.126844, 14.637442, 14.184387, + 13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790, + 7.487633, 5.688649, 4.267515, 3.196300, 2.434201, 1.834064, + 1.369920, 1.035921, 0.775279, 0.574895, 0.427232, 0.314123, + 0.233236, 0.171440, 0.128188, 0.092762, 0.067569, 0.049324, + 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, + 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, + 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, + 0.000348, 0.000193, 0.000085, 0.000021, 0.000000, + }, + { + 16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501, + 15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967, + 15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212, + 13.073692, 12.222005, 11.237799, 9.985848, 8.898823, 7.423519, + 5.995325, 4.773152, 3.744032, 2.938217, 2.294526, 1.762412, + 1.327145, 1.020728, 0.765535, 0.570548, 0.425833, 0.313825, + 0.232959, 0.171324, 0.128174, 0.092750, 0.067558, 0.049319, + 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733, + 
0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848, + 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550, + 0.000348, 0.000193, 0.000085, 0.000021, -0.000000, + }, }; -void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) { +void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, + double *rate_f, double *distbysse_f) { const double x_start = -15.5; const double x_end = 16.5; const double x_step = 0.5; const double epsilon = 1e-6; + const int rcat = bsize_curvfit_model_cat_lookup[bsize]; + const int dcat = sse_norm_curvfit_model_cat_lookup(sse_norm); (void)x_end; xqr = AOMMAX(xqr, x_start + x_step + epsilon); @@ -1138,9 +935,9 @@ void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) { assert(xi > 0); - const double *prate = &interp_rgrid_curv[(xi - 1)]; - const double *pdist = &interp_dgrid_curv[(xi - 1)]; + const double *prate = &interp_rgrid_curv[rcat][(xi - 1)]; *rate_f = interp_cubic(prate, xo); + const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)]; *distbysse_f = interp_cubic(pdist, xo); } @@ -1257,13 +1054,12 @@ int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block, YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi, int ref_frame) { - const AV1_COMMON *const cm = &cpi->common; assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); - const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1]; - const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame); - return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) - ? &cm->buffer_pool->frame_bufs[scaled_idx].buf - : NULL; + RefCntBuffer *const scaled_buf = cpi->scaled_ref_buf[ref_frame - 1]; + const RefCntBuffer *const ref_buf = + get_ref_frame_buf(&cpi->common, ref_frame); + return (scaled_buf != ref_buf && scaled_buf != NULL) ? 
&scaled_buf->buf + : NULL; } int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x, @@ -1304,7 +1100,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { } else { rd->thresh_mult[THR_NEARESTMV] = 0; rd->thresh_mult[THR_NEARESTL2] = 0; - rd->thresh_mult[THR_NEARESTL3] = 0; + rd->thresh_mult[THR_NEARESTL3] = 100; rd->thresh_mult[THR_NEARESTB] = 0; rd->thresh_mult[THR_NEARESTA2] = 0; rd->thresh_mult[THR_NEARESTA] = 0; @@ -1315,7 +1111,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_NEWL2] += 1000; rd->thresh_mult[THR_NEWL3] += 1000; rd->thresh_mult[THR_NEWB] += 1000; - rd->thresh_mult[THR_NEWA2] = 1000; + rd->thresh_mult[THR_NEWA2] = 1100; rd->thresh_mult[THR_NEWA] += 1000; rd->thresh_mult[THR_NEWG] += 1000; @@ -1327,18 +1123,18 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_NEARA] += 1000; rd->thresh_mult[THR_NEARG] += 1000; - rd->thresh_mult[THR_GLOBALMV] += 2000; + rd->thresh_mult[THR_GLOBALMV] += 2200; rd->thresh_mult[THR_GLOBALL2] += 2000; rd->thresh_mult[THR_GLOBALL3] += 2000; - rd->thresh_mult[THR_GLOBALB] += 2000; + rd->thresh_mult[THR_GLOBALB] += 2400; rd->thresh_mult[THR_GLOBALA2] = 2000; rd->thresh_mult[THR_GLOBALG] += 2000; - rd->thresh_mult[THR_GLOBALA] += 2000; + rd->thresh_mult[THR_GLOBALA] += 2400; - rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1100; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000; - rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000; - rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 800; + rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 900; rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000; rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000; @@ -1356,17 +1152,17 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200; 
rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000; - rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2500; + rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1530; + rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1870; + rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2750; rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700; + rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1870; rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000; + rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 1800; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200; @@ -1375,23 +1171,23 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000; - rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 3000; - rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1320; rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700; + rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 2040; rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000; - rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2500; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2250; rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700; + 
rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1360; rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700; - rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000; - rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2500; + rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2400; + rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2250; rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500; @@ -1404,7 +1200,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200; rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500; - rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700; + rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1870; rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] += 2500; @@ -1418,7 +1214,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] += 2500; rd->thresh_mult[THR_COMP_NEAR_NEARLA2] += 1200; - rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1500; + rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1800; rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] += 1500; rd->thresh_mult[THR_COMP_NEAR_NEWLA2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARLA2] += 1700; @@ -1433,7 +1229,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEW_NEWL2A2] += 2000; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] += 2500; - rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1200; + rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1440; rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] += 1500; rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] += 1500; rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] += 1700; @@ -1447,29 +1243,29 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAR_NEWGA2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEARGA2] += 1700; rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000; - rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500; + 
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2750; rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600; rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000; rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000; - rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2200; + rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2640; rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 2200; rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2400; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] += 3200; rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1600; rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 2000; - rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 2000; + rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 1800; rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 2200; rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 2200; rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2400; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] += 3200; - rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1600; - rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2000; + rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1760; + rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2400; rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 2000; - rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 2200; - rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2200; + rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 1760; + rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2640; rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2400; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] += 3200; @@ -1477,34 +1273,25 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) { rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 2000; rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 2000; rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 2200; - rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200; - rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400; + rd->thresh_mult[THR_COMP_NEW_NEARBA] += 1980; + rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2640; rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200; rd->thresh_mult[THR_DC] += 1000; rd->thresh_mult[THR_PAETH] += 1000; - rd->thresh_mult[THR_SMOOTH] += 2000; + rd->thresh_mult[THR_SMOOTH] += 2200; 
rd->thresh_mult[THR_SMOOTH_V] += 2000; rd->thresh_mult[THR_SMOOTH_H] += 2000; rd->thresh_mult[THR_H_PRED] += 2000; - rd->thresh_mult[THR_V_PRED] += 2000; + rd->thresh_mult[THR_V_PRED] += 1800; rd->thresh_mult[THR_D135_PRED] += 2500; - rd->thresh_mult[THR_D203_PRED] += 2500; + rd->thresh_mult[THR_D203_PRED] += 2000; rd->thresh_mult[THR_D157_PRED] += 2500; - rd->thresh_mult[THR_D67_PRED] += 2500; + rd->thresh_mult[THR_D67_PRED] += 2000; rd->thresh_mult[THR_D113_PRED] += 2500; rd->thresh_mult[THR_D45_PRED] += 2500; } -void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) { - static const int thresh_mult[MAX_REFS] = { 2500, 2500, 2500, 2500, 2500, - 2500, 2500, 4500, 4500, 4500, - 4500, 4500, 4500, 4500, 4500, - 4500, 4500, 4500, 4500, 2500 }; - RD_OPT *const rd = &cpi->rd; - memcpy(rd->thresh_mult_sub8x8, thresh_mult, sizeof(thresh_mult)); -} - void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, int (*factor_buf)[MAX_MODES], int rd_thresh, int bsize, int best_mode_index) { diff --git a/libaom/av1/encoder/rd.h b/libaom/av1/encoder/rd.h index 2e2a30d..ff46083 100644 --- a/libaom/av1/encoder/rd.h +++ b/libaom/av1/encoder/rd.h @@ -48,7 +48,7 @@ extern "C" { // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code. -typedef enum { +enum { THR_NEARESTMV, THR_NEARESTL2, THR_NEARESTL3, @@ -246,9 +246,9 @@ typedef enum { MAX_SINGLE_REF_MODES = LAST_SINGLE_REF_MODES + 1, LAST_COMP_REF_MODES = THR_COMP_GLOBAL_GLOBALBA, MAX_COMP_REF_MODES = LAST_COMP_REF_MODES + 1 -} THR_MODES; +} UENUM1BYTE(THR_MODES); -typedef enum { +enum { THR_LAST, THR_LAST2, THR_LAST3, @@ -275,7 +275,7 @@ typedef enum { THR_INTRA, MAX_REFS -} THR_MODES_SUB8X8; +} UENUM1BYTE(THR_MODES_SUB8X8); typedef struct RD_OPT { // Thresh_mult is used to set a threshold for the rd score. 
A higher value @@ -283,7 +283,6 @@ typedef struct RD_OPT { // is used in combination with the current block size, and thresh_freq_fact // to pick a threshold. int thresh_mult[MAX_MODES]; - int thresh_mult_sub8x8[MAX_REFS]; int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES]; @@ -319,25 +318,6 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) { } } #endif -#if CONFIG_ONE_PASS_SVM - rd_stats->eob = 0; - rd_stats->eob_0 = 0; - rd_stats->eob_1 = 0; - rd_stats->eob_2 = 0; - rd_stats->eob_3 = 0; - - rd_stats->rd = 0; - rd_stats->rd_0 = 0; - rd_stats->rd_1 = 0; - rd_stats->rd_2 = 0; - rd_stats->rd_3 = 0; - - rd_stats->y_sse = 0; - rd_stats->sse_0 = 0; - rd_stats->sse_1 = 0; - rd_stats->sse_2 = 0; - rd_stats->sse_3 = 0; -#endif } static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { @@ -365,30 +345,6 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) { } } #endif -#if CONFIG_ONE_PASS_SVM - // TODO(chiyotsai@google.com): Change invalid values to INT_MAX and - // INT64_MAX. Currently there are some code paths where rd_stats's properties - // are set directly without calling av1_init_rd_stats, so changing it now will - // break this speed feature. Need to hunt down all places where rd_stats is - // used without initialized. 
- rd_stats->eob = 0; - rd_stats->eob_0 = 0; - rd_stats->eob_1 = 0; - rd_stats->eob_2 = 0; - rd_stats->eob_3 = 0; - - rd_stats->rd = 0; - rd_stats->rd_0 = 0; - rd_stats->rd_1 = 0; - rd_stats->rd_2 = 0; - rd_stats->rd_3 = 0; - - rd_stats->y_sse = 0; - rd_stats->sse_0 = 0; - rd_stats->sse_1 = 0; - rd_stats->sse_2 = 0; - rd_stats->sse_3 = 0; -#endif } static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, @@ -422,222 +378,8 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst, } } #endif -#if CONFIG_ONE_PASS_SVM - rd_stats_dst->eob += rd_stats_src->eob; - rd_stats_dst->eob_0 += rd_stats_src->eob_0; - rd_stats_dst->eob_1 += rd_stats_src->eob_1; - rd_stats_dst->eob_2 += rd_stats_src->eob_2; - rd_stats_dst->eob_3 += rd_stats_src->eob_3; - - rd_stats_dst->rd += rd_stats_src->rd; - rd_stats_dst->rd_0 += rd_stats_src->rd_0; - rd_stats_dst->rd_1 += rd_stats_src->rd_1; - rd_stats_dst->rd_2 += rd_stats_src->rd_2; - rd_stats_dst->rd_3 += rd_stats_src->rd_3; - - rd_stats_dst->y_sse += rd_stats_src->y_sse; - rd_stats_dst->sse_0 += rd_stats_src->sse_0; - rd_stats_dst->sse_1 += rd_stats_src->sse_1; - rd_stats_dst->sse_2 += rd_stats_src->sse_2; - rd_stats_dst->sse_3 += rd_stats_src->sse_3; -#endif -} - -#if CONFIG_ONE_PASS_SVM -static INLINE void av1_add_reg_stat(RD_STATS *rd_stats, int eob, int64_t rd, - int64_t sse, int blk_row, int blk_col, - BLOCK_SIZE bsize, BLOCK_SIZE crop_bsize) { - // NOTE: Currently the calculation of regional features works by assuming - // bsize is square so that each transform block of size crop_bsize either - // 1. locates completely within a quadrant or - // 2. is exactly half of bsize or - // 3. 
is the entire prediction block - // Size of TX block and SB - const int block_width_mi = mi_size_wide[bsize]; - const int block_height_mi = mi_size_high[bsize]; - const int crop_width_mi = mi_size_wide[crop_bsize]; - const int crop_height_mi = mi_size_high[crop_bsize]; - - // Increment the eob proportionally to how much the tx_block overlaps with - // each quadrant. We will scale it by MAX_MIB_SIZE * MAX_MIB_SIZE to avoid - // being truncated. - const int max_scaling_factor = MAX_MIB_SIZE * MAX_MIB_SIZE; - - // Update the stats - rd_stats->eob = eob; - rd_stats->rd = rd; - rd_stats->y_sse = sse; - - if (crop_width_mi <= block_width_mi / 2 && - crop_height_mi <= block_width_mi / 2) { - // The transform block lies completely in a quadrant. - const int scaling_factor = max_scaling_factor; - const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor, - r_sse = sse * scaling_factor; - - if (blk_row < block_height_mi / 2 && blk_col < block_width_mi / 2) { - rd_stats->eob_0 = r_eob; - rd_stats->rd_0 = r_rd; - rd_stats->sse_0 = r_sse; - } else if (blk_row < block_height_mi / 2 && blk_col >= block_width_mi / 2) { - rd_stats->eob_1 = r_eob; - rd_stats->rd_1 = r_rd; - rd_stats->sse_1 = r_sse; - } else if (blk_row >= block_height_mi / 2 && blk_col < block_width_mi / 2) { - rd_stats->eob_2 = r_eob; - rd_stats->rd_2 = r_rd; - rd_stats->sse_2 = r_sse; - } else { - rd_stats->eob_3 = r_eob; - rd_stats->rd_3 = r_rd; - rd_stats->sse_3 = r_sse; - } - } else if (crop_height_mi == block_height_mi && - crop_width_mi == block_width_mi) { - // The transform block is the whole prediction block - const int scaling_factor = max_scaling_factor; - const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor, - r_sse = sse * scaling_factor; - - rd_stats->eob_0 = r_eob; - rd_stats->rd_0 = r_rd; - rd_stats->sse_0 = r_sse; - - rd_stats->eob_1 = r_eob; - rd_stats->rd_1 = r_rd; - rd_stats->sse_1 = r_sse; - - rd_stats->eob_2 = r_eob; - rd_stats->rd_2 = r_rd; - rd_stats->sse_2 = r_sse; - 
- rd_stats->eob_3 = r_eob; - rd_stats->rd_3 = r_rd; - rd_stats->sse_3 = r_sse; - } else if (crop_height_mi == block_height_mi) { - // The tranform block is a vertical block - const int scaling_factor = max_scaling_factor / 2; - const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor, - r_sse = sse * scaling_factor; - - if (blk_col < block_width_mi / 2) { - rd_stats->eob_0 = r_eob; - rd_stats->rd_0 = r_rd; - rd_stats->sse_0 = r_sse; - - rd_stats->eob_2 = r_eob; - rd_stats->rd_2 = r_rd; - rd_stats->sse_2 = r_sse; - } else { - rd_stats->eob_1 = r_eob; - rd_stats->rd_1 = r_rd; - rd_stats->sse_1 = r_sse; - - rd_stats->eob_3 = r_eob; - rd_stats->rd_3 = r_rd; - rd_stats->sse_3 = r_sse; - } - } else if (crop_width_mi == block_width_mi) { - // The tranform block is a horizontal block half the size of predition block - const int scaling_factor = max_scaling_factor / 2; - const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor, - r_sse = sse * scaling_factor; - - if (blk_row < block_height_mi / 2) { - rd_stats->eob_0 = r_eob; - rd_stats->rd_0 = r_rd; - rd_stats->sse_0 = r_sse; - - rd_stats->eob_1 = r_eob; - rd_stats->rd_1 = r_rd; - rd_stats->sse_1 = r_sse; - } else { - rd_stats->eob_2 = r_eob; - rd_stats->rd_2 = r_rd; - rd_stats->sse_2 = r_sse; - - rd_stats->eob_3 = r_eob; - rd_stats->rd_3 = r_rd; - rd_stats->sse_3 = r_sse; - } - } else { - assert(0 && "Unexpected transform size"); - } } -static INLINE void av1_reg_stat_skipmode_update(RD_STATS *rd_stats, - int rdmult) { - // Update the stats - rd_stats->eob = 0; - rd_stats->eob_0 = 0; - rd_stats->eob_1 = 0; - rd_stats->eob_2 = 0; - rd_stats->eob_3 = 0; - - rd_stats->rd = RDCOST(rdmult, 0, rd_stats->sse); - rd_stats->rd_0 = RDCOST(rdmult, 0, rd_stats->sse_0); - rd_stats->rd_1 = RDCOST(rdmult, 0, rd_stats->sse_1); - rd_stats->rd_2 = RDCOST(rdmult, 0, rd_stats->sse_2); - rd_stats->rd_3 = RDCOST(rdmult, 0, rd_stats->sse_3); -} - -static INLINE void av1_copy_reg_stat(RD_STATS *rd_stats_dst, - RD_STATS 
*rd_stats_src) { - rd_stats_dst->eob = rd_stats_src->eob; - rd_stats_dst->eob_0 = rd_stats_src->eob_0; - rd_stats_dst->eob_1 = rd_stats_src->eob_1; - rd_stats_dst->eob_2 = rd_stats_src->eob_2; - rd_stats_dst->eob_3 = rd_stats_src->eob_3; - - rd_stats_dst->rd = rd_stats_src->rd; - rd_stats_dst->rd_0 = rd_stats_src->rd_0; - rd_stats_dst->rd_1 = rd_stats_src->rd_1; - rd_stats_dst->rd_2 = rd_stats_src->rd_2; - rd_stats_dst->rd_3 = rd_stats_src->rd_3; - - rd_stats_dst->y_sse = rd_stats_src->y_sse; - rd_stats_dst->sse_0 = rd_stats_src->sse_0; - rd_stats_dst->sse_1 = rd_stats_src->sse_1; - rd_stats_dst->sse_2 = rd_stats_src->sse_2; - rd_stats_dst->sse_3 = rd_stats_src->sse_3; -} - -static INLINE void av1_unpack_reg_stat(RD_STATS *rd_stats, int *eob, int *eob_0, - int *eob_1, int *eob_2, int *eob_3, - int64_t *rd, int64_t *rd_0, - int64_t *rd_1, int64_t *rd_2, - int64_t *rd_3) { - *rd = rd_stats->rd; - *rd_0 = rd_stats->rd_0; - *rd_1 = rd_stats->rd_1; - *rd_2 = rd_stats->rd_2; - *rd_3 = rd_stats->rd_3; - - *eob = rd_stats->eob; - *eob_0 = rd_stats->eob_0; - *eob_1 = rd_stats->eob_1; - *eob_2 = rd_stats->eob_2; - *eob_3 = rd_stats->eob_3; -} - -static INLINE void av1_set_reg_stat(RD_STATS *rd_stats, int eob, int eob_0, - int eob_1, int eob_2, int eob_3, int64_t rd, - int64_t rd_0, int64_t rd_1, int64_t rd_2, - int64_t rd_3) { - rd_stats->rd = rd; - rd_stats->rd_0 = rd_0; - rd_stats->rd_1 = rd_1; - rd_stats->rd_2 = rd_2; - rd_stats->rd_3 = rd_3; - - rd_stats->eob = eob; - rd_stats->eob_0 = eob_0; - rd_stats->eob_1 = eob_1; - rd_stats->eob_2 = eob_2; - rd_stats->eob_3 = eob_3; -} -#endif - struct TileInfo; struct TileDataEnc; struct AV1_COMP; @@ -657,9 +399,10 @@ void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x, void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n, unsigned int qstep, int *rate, int64_t *dist); -void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f); -void av1_model_rd_surffit(double xm, double yl, double 
*rate_f, - double *distbysse_f); +void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr, + double *rate_f, double *distbysse_f); +void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm, + double yl, double *rate_f, double *distbysse_f); int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x, const MACROBLOCKD *xd); @@ -684,8 +427,6 @@ void av1_get_entropy_contexts(BLOCK_SIZE bsize, void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi); -void av1_set_rd_speed_thresholds_sub8x8(struct AV1_COMP *cpi); - void av1_update_rd_thresh_fact(const AV1_COMMON *const cm, int (*fact)[MAX_MODES], int rd_thresh, int bsize, int best_mode_index); diff --git a/libaom/av1/encoder/rdopt.c b/libaom/av1/encoder/rdopt.c index b393e6f..5e6054e 100644 --- a/libaom/av1/encoder/rdopt.c +++ b/libaom/av1/encoder/rdopt.c @@ -125,14 +125,14 @@ static void model_rd_with_surffit(const AV1_COMP *const cpi, int64_t sse, int num_samples, int *rate, int64_t *dist); -typedef enum { +enum { MODELRD_LEGACY, MODELRD_CURVFIT, MODELRD_SUFFIT, MODELRD_DNN, MODELRD_FULLRDY, MODELRD_TYPES -} ModelRdType; +} UENUM1BYTE(ModelRdType); static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = { model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit, @@ -150,11 +150,12 @@ static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = { // 3: DNN regression model // 4: Full rd model #define MODELRD_TYPE_INTERP_FILTER 1 -#define MODELRD_TYPE_TX_SEARCH_PRUNE 2 +#define MODELRD_TYPE_TX_SEARCH_PRUNE 1 #define MODELRD_TYPE_MASKED_COMPOUND 1 #define MODELRD_TYPE_INTERINTRA 1 #define MODELRD_TYPE_INTRA 1 -#define MODELRD_TYPE_JNT_COMPOUND 1 +#define MODELRD_TYPE_DIST_WTD_COMPOUND 1 +#define MODELRD_TYPE_MOTION_MODE_RD 1 #define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS) static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = { @@ -163,10 +164,6 @@ static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = { 0x00000002, 
0x00010002, 0x00020002, // y = 2 }; -#define SECOND_REF_FRAME_MASK \ - ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \ - (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01) - static const double ADST_FLIP_SVM[8] = { /* vertical */ -6.6623, -2.8062, -3.2531, 3.1671, @@ -179,26 +176,12 @@ typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MODE_DEFINITION; -typedef struct { - MV_REFERENCE_FRAME ref_frame[2]; -} REF_DEFINITION; - -typedef enum { +enum { FTXS_NONE = 0, FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0, FTXS_DISABLE_TRELLIS_OPT = 1 << 1, FTXS_USE_TRANSFORM_DOMAIN = 1 << 2 -} FAST_TX_SEARCH_MODE; - -static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row, - int mi_col, int64_t ref_best_rd); - -static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t non_skip_ref_best_rd, - int64_t skip_ref_best_rd, - FAST_TX_SEARCH_MODE ftxs_mode); +} UENUM1BYTE(FAST_TX_SEARCH_MODE); struct rdcost_block_args { const AV1_COMP *cpi; @@ -212,6 +195,7 @@ struct rdcost_block_args { int incomplete_exit; int use_fast_coef_costing; FAST_TX_SEARCH_MODE ftxs_mode; + int skip_trellis; }; #define LAST_NEW_MV_INDEX 6 @@ -749,12 +733,12 @@ typedef struct InterModeSearchState { MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS]; } InterModeSearchState; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS static int inter_mode_data_block_idx(BLOCK_SIZE bsize) { - if (bsize == BLOCK_8X8) return 1; - if (bsize == BLOCK_16X16) return 2; - if (bsize == BLOCK_32X32) return 3; - return -1; + if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 || + bsize == BLOCK_4X16 || bsize == BLOCK_16X4) { + return -1; + } + return 1; } void av1_inter_mode_data_init(TileDataEnc *tile_data) { @@ -770,37 +754,41 @@ void av1_inter_mode_data_init(TileDataEnc *tile_data) { } } -static int get_est_rate_dist(TileDataEnc *tile_data, BLOCK_SIZE bsize, +static int 
get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize, int64_t sse, int *est_residue_cost, int64_t *est_dist) { aom_clear_system_state(); const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; if (md->ready) { - const double est_ld = md->a * sse + md->b; if (sse < md->dist_mean) { *est_residue_cost = 0; *est_dist = sse; } else { - *est_residue_cost = (int)round((sse - md->dist_mean) / est_ld); *est_dist = (int64_t)round(md->dist_mean); + const double est_ld = md->a * sse + md->b; + // Clamp estimated rate cost by INT_MAX / 2. + // TODO(angiebird@google.com): find better solution than clamping. + if (fabs(est_ld) < 1e-2) { + *est_residue_cost = INT_MAX / 2; + } else { + double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld); + if (est_residue_cost_dbl < 0) { + *est_residue_cost = 0; + } else { + *est_residue_cost = + (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2); + } + } + if (*est_residue_cost <= 0) { + *est_residue_cost = 0; + *est_dist = sse; + } } return 1; } return 0; } -static int64_t get_est_rd(TileDataEnc *tile_data, BLOCK_SIZE bsize, int rdmult, - int64_t sse, int curr_cost) { - int est_residue_cost; - int64_t est_dist; - if (get_est_rate_dist(tile_data, bsize, sse, &est_residue_cost, &est_dist)) { - int rate = est_residue_cost + curr_cost; - int64_t est_rd = RDCOST(rdmult, rate, est_dist); - return est_rd; - } - return 0; -} - void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) { aom_clear_system_state(); for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) { @@ -865,20 +853,31 @@ static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize, rd_model->dist_sum += dist; rd_model->ld_sum += ld; rd_model->sse_sum += sse; - rd_model->sse_sse_sum += sse * sse; + rd_model->sse_sse_sum += (double)sse * (double)sse; rd_model->sse_ld_sum += sse * ld; } } static void inter_modes_info_push(InterModesInfo *inter_modes_info, - int mode_rate, int64_t sse, int64_t est_rd, + int mode_rate, 
int64_t sse, int64_t rd, + bool true_rd, uint8_t *blk_skip, + RD_STATS *rd_cost, RD_STATS *rd_cost_y, + RD_STATS *rd_cost_uv, const MB_MODE_INFO *mbmi) { const int num = inter_modes_info->num; assert(num < MAX_INTER_MODES); inter_modes_info->mbmi_arr[num] = *mbmi; inter_modes_info->mode_rate_arr[num] = mode_rate; inter_modes_info->sse_arr[num] = sse; - inter_modes_info->est_rd_arr[num] = est_rd; + inter_modes_info->est_rd_arr[num] = rd; + inter_modes_info->true_rd_arr[num] = true_rd; + if (blk_skip != NULL) { + memcpy(inter_modes_info->blk_skip_arr[num], blk_skip, + sizeof(blk_skip[0]) * MAX_MIB_SIZE * MAX_MIB_SIZE); + } + inter_modes_info->rd_cost_arr[num] = *rd_cost; + inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y; + inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv; ++inter_modes_info->num; } @@ -904,7 +903,6 @@ static void inter_modes_info_sort(const InterModesInfo *inter_modes_info, qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]), compare_rd_idx_pair); } -#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS static INLINE int write_uniform_cost(int n, int v) { const int l = get_unsigned_bits(n); @@ -961,7 +959,7 @@ static unsigned pixel_dist_visible_only( } const MACROBLOCKD *xd = &x->e_mbd; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride, visible_cols, visible_rows); return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2); @@ -1217,7 +1215,7 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, if (x->tune_metric == AOM_TUNE_CDEF_DIST || x->tune_metric == AOM_TUNE_DAALA_DIST) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { for (j = 0; j < bsh; j++) for (i = 0; i < bsw; i++) orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; @@ -1281,8 +1279,7 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x, bsw, coeff_shift); } } - if 
(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - d = ((uint64_t)d) >> 2 * coeff_shift; + if (is_cur_buf_hbd(xd)) d = ((uint64_t)d) >> 2 * coeff_shift; } else { // Otherwise, MSE by default d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride, @@ -1310,7 +1307,7 @@ static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src, if (x->tune_metric == AOM_TUNE_CDEF_DIST || x->tune_metric == AOM_TUNE_DAALA_DIST) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { for (j = 0; j < bsh; j++) for (i = 0; i < bsw; i++) orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i]; @@ -1727,16 +1724,19 @@ void av1_get_horver_correlation_full_c(const int16_t *diff, int stride, static void score_2D_transform_pow8(float *scores_2D, float shift) { float sum = 0.0f; int i; - for (i = 0; i < 16; i++) { - float v, v2, v4; - v = AOMMAX(scores_2D[i] + shift, 0.0f); - v2 = v * v; - v4 = v2 * v2; + const float v = AOMMIN(AOMMAX(scores_2D[i] + shift, 0.0f), 100.0f); + const float v2 = v * v; + const float v4 = v2 * v2; scores_2D[i] = v4 * v4; sum += scores_2D[i]; } - for (i = 0; i < 16; i++) scores_2D[i] /= sum; + for (i = 0; i < 16; i++) { + if (scores_2D[i] < sum * 1e-4) + scores_2D[i] = 0.0f; + else + scores_2D[i] /= sum; + } } // These thresholds were calibrated to provide a certain number of TX types @@ -1909,7 +1909,13 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, x->tx_search_prune[tx_set_type] = 0; x->tx_split_prune_flag = 0; const MB_MODE_INFO *mbmi = xd->mi[0]; - if (!is_inter_block(mbmi) || cpi->sf.tx_type_search.prune_mode == NO_PRUNE || + const int is_inter = is_inter_block(mbmi); + if ((is_inter && cpi->oxcf.use_inter_dct_only) || + (!is_inter && cpi->oxcf.use_intra_dct_only)) { + x->tx_search_prune[tx_set_type] = ~(1 << DCT_DCT); + return; + } + if (!is_inter || cpi->sf.tx_type_search.prune_mode == NO_PRUNE || x->use_default_inter_tx_type || xd->lossless[mbmi->segment_id] || 
x->cb_partition_scan) return; @@ -1948,8 +1954,7 @@ static void model_rd_from_sse(const AV1_COMP *const cpi, (void)num_samples; const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int dequant_shift = - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3; // Fast approximate the modelling function. if (cpi->sf.simple_model_rd_from_var) { @@ -1971,7 +1976,6 @@ static void model_rd_from_sse(const AV1_COMP *const cpi, *dist <<= 4; } -#if CONFIG_COLLECT_INTER_MODE_RD_STATS static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); @@ -1994,7 +1998,6 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) { total_sse <<= 4; return total_sse; } -#endif static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, @@ -2028,7 +2031,7 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, if (x->skip_chroma_rd && plane) continue; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); } else { @@ -2057,43 +2060,6 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize, *out_dist_sum = dist_sum; } -static void check_block_skip(const AV1_COMP *const cpi, BLOCK_SIZE bsize, - MACROBLOCK *x, MACROBLOCKD *xd, int plane_from, - int plane_to, int *skip_txfm_sb) { - *skip_txfm_sb = 1; - for (int plane = plane_from; plane <= plane_to; ++plane) { - struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE bs = - get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); - unsigned int sse; - - if (x->skip_chroma_rd && plane) continue; - - // Since fast HBD variance 
functions scale down sse by 4 bit, we first use - // fast vf implementation to rule out blocks with non-zero scaled sse. Then, - // only if the source is HBD and the scaled sse is 0, accurate sse - // computation is applied to determine if the sse is really 0. This step is - // necessary for HBD lossless coding. - cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, - &sse); - if (sse) { - *skip_txfm_sb = 0; - return; - } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - uint64_t sse64 = aom_highbd_sse_odd_size( - p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, - block_size_wide[bs], block_size_high[bs]); - - if (sse64) { - *skip_txfm_sb = 0; - return; - } - } - } - return; -} - int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz) { int i; @@ -2195,7 +2161,8 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x, static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, int blk_row, int blk_col, const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { + const BLOCK_SIZE tx_bsize, + unsigned int *block_mse_q8) { int visible_rows, visible_cols; const MACROBLOCKD *xd = &x->e_mbd; get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL, @@ -2218,7 +2185,11 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane, } #endif diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]); - return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); + uint64_t sse = + aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows); + if (block_mse_q8 != NULL) + *block_mse_q8 = (unsigned int)((256 * sse) / (visible_cols * visible_rows)); + return sse; } int av1_count_colors(const uint8_t *src, int stride, int rows, int cols, @@ -2318,7 +2289,7 @@ static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block, tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); 
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + if (is_cur_buf_hbd(xd)) *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse, xd->bd); else @@ -2354,7 +2325,7 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *recon; DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { recon = CONVERT_TO_BYTEPTR(recon16); av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride, CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw, @@ -2376,11 +2347,29 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x, blk_row, blk_col, plane_bsize, tx_bsize); } -static double get_mean(const int16_t *diff, int stride, int w, int h) { +static double get_diff_mean(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, int w, int h) { double sum = 0.0; for (int j = 0; j < h; ++j) { for (int i = 0; i < w; ++i) { - sum += diff[j * stride + i]; + const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; + sum += diff; + } + } + assert(w > 0 && h > 0); + return sum / (w * h); +} + +static double get_highbd_diff_mean(const uint8_t *src8, int src_stride, + const uint8_t *dst8, int dst_stride, int w, + int h) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + const int diff = src[j * src_stride + i] - dst[j * dst_stride + i]; + sum += diff; } } assert(w > 0 && h > 0); @@ -2469,6 +2458,17 @@ static void get_2x2_normalized_sses_and_sads( #if CONFIG_COLLECT_RD_STATS #if CONFIG_COLLECT_RD_STATS == 1 +static double get_mean(const int16_t *diff, int stride, int w, int h) { + double sum = 0.0; + for (int j = 0; j < h; ++j) { + for (int i = 0; i < w; ++i) { + sum += diff[j * stride + i]; + } + } + assert(w > 
0 && h > 0); + return sum / (w * h); +} + static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const RD_STATS *const rd_stats, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, @@ -2491,10 +2491,9 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const struct macroblockd_plane *const pd = &xd->plane[plane]; const int txw = tx_size_wide[tx_size]; const int txh = tx_size_high[tx_size]; - const int dequant_shift = - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3; const int q_step = pd->dequant_Q3[1] >> dequant_shift; - const double num_samples = txw * txh; + const int num_samples = txw * txh; const double rate_norm = (double)rd_stats->rate / num_samples; const double dist_norm = (double)rd_stats->dist / num_samples; @@ -2566,15 +2565,25 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, #endif // CONFIG_COLLECT_RD_STATS == 1 #if CONFIG_COLLECT_RD_STATS >= 2 -static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, +static void PrintPredictionUnitStats(const AV1_COMP *const cpi, + const TileDataEnc *tile_data, + MACROBLOCK *x, const RD_STATS *const rd_stats, BLOCK_SIZE plane_bsize) { if (rd_stats->invalid_rate) return; if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return; + if (cpi->sf.inter_mode_rd_model_estimation == 1 && + (tile_data == NULL || + !tile_data->inter_mode_rd_models[plane_bsize].ready)) + return; + (void)tile_data; // Generate small sample to restrict output size. 
static unsigned int seed = 95014; - if (lcg_rand16(&seed) % 256 > 0) return; + + if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) != + 1) + return; const char output_file[] = "pu_stats.txt"; FILE *fout = fopen(output_file, "a"); @@ -2589,8 +2598,7 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, &bh); const int num_samples = bw * bh; - const int dequant_shift = - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3; const int q_step = pd->dequant_Q3[1] >> dequant_shift; const double rate_norm = (double)rd_stats->rate / num_samples; @@ -2607,7 +2615,14 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, const int16_t *const src_diff = p->src_diff; const int shift = (xd->bd - 8); - int64_t sse = aom_sum_squares_2d_i16(src_diff, diff_stride, bw, bh); + int64_t sse; + if (is_cur_buf_hbd(xd)) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + bw, bh); + } else { + sse = + aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); + } sse = ROUND_POWER_OF_TWO(sse, shift * 2); const double sse_norm = (double)sse / num_samples; @@ -2646,7 +2661,14 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x, fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm, model_rdcost_norm); - double mean = get_mean(src_diff, diff_stride, bw, bh); + double mean; + if (is_cur_buf_hbd(xd)) { + mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, + bw, bh); + } mean /= (1 << shift); float hor_corr, vert_corr; av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr, @@ -2659,6 +2681,21 @@ static void PrintPredictionUnitStats(const 
AV1_COMP *const cpi, MACROBLOCK *x, fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2], hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]); + if (cpi->sf.inter_mode_rd_model_estimation == 1) { + assert(tile_data->inter_mode_rd_models[plane_bsize].ready); + const int64_t overall_sse = get_sse(cpi, x); + int est_residue_cost = 0; + int64_t est_dist = 0; + get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost, + &est_dist); + const double est_residue_cost_norm = (double)est_residue_cost / num_samples; + const double est_dist_norm = (double)est_dist / num_samples; + const double est_rdcost_norm = + (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples; + fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm, + est_rdcost_norm); + } + fprintf(fout, "\n"); fclose(fout); } @@ -2673,8 +2710,7 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, const struct macroblockd_plane *const pd = &xd->plane[plane]; const int log_numpels = num_pels_log2_lookup[plane_bsize]; - const int dequant_shift = - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); const struct macroblock_plane *const p = &x->plane[plane]; @@ -2711,7 +2747,12 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi, get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst, dst_stride, src_diff, diff_stride, sse_norm_arr, NULL); - double mean = get_mean(src_diff, bw, bw, bh); + double mean; + if (is_cur_buf_hbd(xd)) { + mean = get_highbd_diff_mean(src, src_stride, dst, dst_stride, bw, bh); + } else { + mean = get_diff_mean(src, src_stride, dst, dst_stride, bw, bh); + } if (shift) { for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift)); mean /= (1 << shift); @@ -2790,7 +2831,7 @@ static void model_rd_for_sb_with_dnn( int bw, bh; get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, &bh); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); } else { @@ -2829,8 +2870,7 @@ static void model_rd_with_surffit(const AV1_COMP *const cpi, (void)plane_bsize; const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int dequant_shift = - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); if (sse == 0) { if (rate) *rate = 0; @@ -2844,7 +2884,8 @@ static void model_rd_with_surffit(const AV1_COMP *const cpi, const double yl = log(sse_norm / qstepsqr) / log(2.0); double rate_f, dist_by_sse_norm_f; - av1_model_rd_surffit(xm, yl, &rate_f, &dist_by_sse_norm_f); + av1_model_rd_surffit(plane_bsize, sse_norm, xm, yl, &rate_f, + &dist_by_sse_norm_f); const double dist_f = dist_by_sse_norm_f * sse_norm; int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); @@ -2894,7 +2935,7 @@ static void model_rd_for_sb_with_surffit( const int shift = (xd->bd - 8); get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, &bh); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); } else { @@ -2934,8 +2975,7 @@ static void model_rd_with_curvfit(const AV1_COMP *const cpi, (void)plane_bsize; const MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int dequant_shift = - (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3; + const int dequant_shift = (is_cur_buf_hbd(xd)) ? 
xd->bd - 5 : 3; const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1); if (sse == 0) { @@ -2946,10 +2986,11 @@ static void model_rd_with_curvfit(const AV1_COMP *const cpi, aom_clear_system_state(); const double sse_norm = (double)sse / num_samples; const double qstepsqr = (double)qstep * qstep; - const double xqr = log(sse_norm / qstepsqr) / log(2.0); + const double xqr = log2(sse_norm / qstepsqr); double rate_f, dist_by_sse_norm_f; - av1_model_rd_curvfit(xqr, &rate_f, &dist_by_sse_norm_f); + av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f, + &dist_by_sse_norm_f); const double dist_f = dist_by_sse_norm_f * sse_norm; int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5); @@ -3000,7 +3041,7 @@ static void model_rd_for_sb_with_curvfit( get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw, &bh); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh); } else { @@ -3029,78 +3070,13 @@ static void model_rd_for_sb_with_curvfit( *out_dist_sum = dist_sum; } -static void model_rd_for_sb_with_fullrdy( - const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, - int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, - int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { - const int ref = xd->mi[0]->ref_frame[0]; - - int64_t rate_sum = 0; - int64_t dist_sum = 0; - int64_t total_sse = 0; - - for (int plane = plane_from; plane <= plane_to; ++plane) { - struct macroblock_plane *const p = &x->plane[plane]; - struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE plane_bsize = - get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); - const int bw = block_size_wide[plane_bsize]; - const int bh = block_size_high[plane_bsize]; - int64_t sse; - int rate; - int64_t dist; - - if 
(x->skip_chroma_rd && plane) continue; - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, - pd->dst.stride, bw, bh); - } else { - sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, - bh); - } - sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); - - RD_STATS rd_stats; - if (plane == 0) { - select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX); - if (rd_stats.invalid_rate) { - rate = 0; - dist = sse << 4; - } else { - rate = rd_stats.rate; - dist = rd_stats.dist; - } - } else { - model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, - &dist); - } - - if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); - - total_sse += sse; - rate_sum += rate; - dist_sum += dist; - - if (plane_rate) plane_rate[plane] = rate; - if (plane_sse) plane_sse[plane] = sse; - if (plane_dist) plane_dist[plane] = dist; - } - - if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; - if (skip_sse_sb) *skip_sse_sb = total_sse << 4; - *out_rate_sum = (int)rate_sum; - *out_dist_sum = dist_sum; -} - static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, int block, int blk_row, int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const TXB_CTX *const txb_ctx, FAST_TX_SEARCH_MODE ftxs_mode, - int use_fast_coef_costing, int64_t ref_best_rd, - RD_STATS *best_rd_stats) { + int use_fast_coef_costing, int skip_trellis, + int64_t ref_best_rd, RD_STATS *best_rd_stats) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -3118,6 +3094,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, tran_low_t *best_dqcoeff = this_dqcoeff; const int txk_type_idx = av1_get_txk_type_index(plane_bsize, blk_row, blk_col); + int perform_block_coeff_opt; av1_invalid_rd_stats(best_rd_stats); TXB_RD_INFO *intra_txb_rd_info = NULL; @@ -3129,6 +3106,9 @@ static int64_t 
search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) && mi_col >= xd->tile.mi_col_start && (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end); + skip_trellis |= + cpi->optimize_seg_arr[mbmi->segment_id] == NO_TRELLIS_OPT || + cpi->optimize_seg_arr[mbmi->segment_id] == FINAL_PASS_TRELLIS_OPT; if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) && !is_inter && plane == 0 && tx_size_wide[tx_size] == tx_size_high[tx_size]) { @@ -3168,7 +3148,8 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, TX_TYPE txk_end = TX_TYPES - 1; if ((!is_inter && x->use_default_intra_tx_type) || (is_inter && x->use_default_inter_tx_type)) { - txk_start = txk_end = get_default_tx_type(0, xd, tx_size); + txk_start = txk_end = + get_default_tx_type(0, xd, tx_size, cpi->is_screen_content_type); } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) { if (plane == 0) txk_end = DCT_DCT; } @@ -3186,7 +3167,9 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, } const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type]; if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 || - ext_tx_used_flag == 0x0001) { + ext_tx_used_flag == 0x0001 || + (is_inter && cpi->oxcf.use_inter_dct_only) || + (!is_inter && cpi->oxcf.use_intra_dct_only)) { txk_start = txk_end = DCT_DCT; } uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip. @@ -3212,14 +3195,35 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, } } } + + if (cpi->oxcf.enable_flip_idtx == 0) { + for (TX_TYPE tx_type = FLIPADST_DCT; tx_type <= H_FLIPADST; ++tx_type) { + allowed_tx_mask &= ~(1 << tx_type); + } + } + // Need to have at least one transform type allowed. if (allowed_tx_mask == 0) { txk_start = txk_end = (plane ? 
uv_tx_type : DCT_DCT); allowed_tx_mask = (1 << txk_start); } + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + int64_t block_sse = 0; + unsigned int block_mse_q8 = UINT_MAX; + block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize, + &block_mse_q8); + assert(block_mse_q8 != UINT_MAX); + if (is_cur_buf_hbd(xd)) { + block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2); + block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2); + } + block_sse *= 16; + // Tranform domain distortion is accurate for higher residuals. + // TODO(any): Experiment with variance and mean based thresholds int use_transform_domain_distortion = (cpi->sf.use_transform_domain_distortion > 0) && + (block_mse_q8 >= cpi->tx_domain_dist_threshold) && // Any 64-pt transforms only preserves half the coefficients. // Therefore transform domain distortion is not valid for these // transform sizes. @@ -3237,20 +3241,18 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, const uint16_t *eobs_ptr = x->plane[plane].eobs; - const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; - int64_t block_sse = - pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize); - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2); - block_sse *= 16; + // Used mse based threshold logic to take decision of R-D of optimization of + // coeffs. For smaller residuals, coeff optimization would be helpful. For + // larger residuals, R-D optimization may not be effective. 
+ // TODO(any): Experiment with variance and mean based thresholds + perform_block_coeff_opt = (block_mse_q8 <= cpi->coeff_opt_dist_threshold); for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) { if (!(allowed_tx_mask & (1 << tx_type))) continue; if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type; RD_STATS this_rd_stats; av1_invalid_rd_stats(&this_rd_stats); - - if (!cpi->optimize_seg_arr[mbmi->segment_id]) { + if (skip_trellis || (!perform_block_coeff_opt)) { av1_xform_quant( cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type, USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP); @@ -3270,8 +3272,8 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse)); if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue; } - av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1, - &rate_cost); + av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, + cpi->sf.trellis_eob_fast, &rate_cost); } if (eobs_ptr[block] == 0) { // When eob is 0, pixel domain distortion is more efficient and accurate. @@ -3280,8 +3282,38 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane, dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist, &this_rd_stats.sse); } else { - this_rd_stats.dist = dist_block_px_domain( - cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + int64_t sse_diff = INT64_MAX; + // high_energy threshold assumes that every pixel within a txfm block + // has a residue energy of at least 25% of the maximum, i.e. 128 * 128 + // for 8 bit, then the threshold is scaled based on input bit depth. 
+ const int64_t high_energy_thresh = + ((int64_t)128 * 128 * tx_size_2d[tx_size]) << ((xd->bd - 8) * 2); + const int is_high_energy = (block_sse >= high_energy_thresh); + if (tx_size == TX_64X64 || is_high_energy) { + // Because 3 out 4 quadrants of transform coefficients are forced to + // zero, the inverse transform has a tendency to overflow. sse_diff + // is effectively the energy of those 3 quadrants, here we use it + // to decide if we should do pixel domain distortion. If the energy + // is mostly in first quadrant, then it is unlikely that we have + // overflow issue in inverse transform. + dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist, + &this_rd_stats.sse); + sse_diff = block_sse - this_rd_stats.sse; + } + if (tx_size != TX_64X64 || !is_high_energy || + (sse_diff * 2) < this_rd_stats.sse) { + const int64_t tx_domain_dist = this_rd_stats.dist; + this_rd_stats.dist = dist_block_px_domain( + cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size); + // For high energy blocks, occasionally, the pixel domain distortion + // can be artificially low due to clamping at reconstruction stage + // even when inverse transform output is hugely different from the + // actual residue. 
+ if (is_high_energy && this_rd_stats.dist < tx_domain_dist) + this_rd_stats.dist = tx_domain_dist; + } else { + this_rd_stats.dist += sse_diff; + } this_rd_stats.sse = block_sse; } @@ -3396,7 +3428,7 @@ RECON_INTRA: // if the last search tx_type is the best tx_type, we don't need to // do this again if (best_tx_type != last_tx_type) { - if (!cpi->optimize_seg_arr[mbmi->segment_id]) { + if (skip_trellis) { av1_xform_quant( cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, best_tx_type, @@ -3404,8 +3436,8 @@ RECON_INTRA: } else { av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, best_tx_type, AV1_XFORM_QUANT_FP); - av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, 1, - &rate_cost); + av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, + cpi->sf.trellis_eob_fast, &rate_cost); } } @@ -3432,12 +3464,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, struct rdcost_block_args *args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = xd->mi[0]; + const int is_inter = is_inter_block(xd->mi[0]); const AV1_COMP *cpi = args->cpi; ENTROPY_CONTEXT *a = args->t_above + blk_col; ENTROPY_CONTEXT *l = args->t_left + blk_row; const AV1_COMMON *cm = &cpi->common; - int64_t rd1, rd2, rd; RD_STATS this_rd_stats; av1_init_rd_stats(&this_rd_stats); @@ -3447,7 +3478,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, return; } - if (!is_inter_block(mbmi)) { + if (!is_inter) { av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size); av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size); } @@ -3455,10 +3486,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx); search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing, - 
args->best_rd - args->this_rd, &this_rd_stats); + args->skip_trellis, args->best_rd - args->this_rd, + &this_rd_stats); if (plane == AOM_PLANE_Y && xd->cfl.store_y) { - assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8); + assert(!is_inter || plane_bsize < BLOCK_8X8); cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize); } @@ -3477,37 +3509,26 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, else set_blk_skip(x, plane, blk_idx, 0); - rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); - rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse); + const int64_t rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist); + const int64_t rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse); // TODO(jingning): temporarily enabled only for luma component - rd = AOMMIN(rd1, rd2); + const int64_t rd = AOMMIN(rd1, rd2); this_rd_stats.skip &= !x->plane[plane].eobs[block]; -#if CONFIG_ONE_PASS_SVM - if (plane == AOM_PLANE_Y && plane_bsize >= BLOCK_8X8) { - int eob = x->plane[plane].eobs[block]; - av1_add_reg_stat(&this_rd_stats, eob, rd, this_rd_stats.sse, blk_row, - blk_col, plane_bsize, txsize_to_bsize[tx_size]); - } -#endif - av1_merge_rd_stats(&args->rd_stats, &this_rd_stats); args->this_rd += rd; - if (args->this_rd > args->best_rd) { - args->exit_early = 1; - return; - } + if (args->this_rd > args->best_rd) args->exit_early = 1; } static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, RD_STATS *rd_stats, int64_t ref_best_rd, int64_t this_rd, int plane, BLOCK_SIZE bsize, TX_SIZE tx_size, int use_fast_coef_casting, - FAST_TX_SEARCH_MODE ftxs_mode) { + FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; struct rdcost_block_args args; @@ -3518,8 +3539,14 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, args.use_fast_coef_costing = use_fast_coef_casting; args.ftxs_mode = ftxs_mode; args.this_rd = this_rd; + 
args.skip_trellis = skip_trellis; av1_init_rd_stats(&args.rd_stats); + if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) { + av1_invalid_rd_stats(rd_stats); + return; + } + if (plane == 0) xd->mi[0]->tx_size = tx_size; av1_get_entropy_contexts(bsize, pd, args.t_above, args.t_left); @@ -3544,23 +3571,20 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi, static int tx_size_cost(const AV1_COMMON *const cm, const MACROBLOCK *const x, BLOCK_SIZE bsize, TX_SIZE tx_size) { - const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = xd->mi[0]; + assert(bsize == x->e_mbd.mi[0]->sb_type); + if (cm->tx_mode != TX_MODE_SELECT || !block_signals_txsize(bsize)) return 0; - if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type)) { - const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); - const int depth = tx_size_to_depth(tx_size, bsize); - const int tx_size_ctx = get_tx_size_context(xd); - int r_tx_size = x->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; - return r_tx_size; - } else { - return 0; - } + const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize); + const int depth = tx_size_to_depth(tx_size, bsize); + const MACROBLOCKD *const xd = &x->e_mbd; + const int tx_size_ctx = get_tx_size_context(xd); + return x->tx_size_cost[tx_size_cat][tx_size_ctx][depth]; } static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs, - TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode) { + TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode, + int skip_trellis) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; @@ -3594,49 +3618,60 @@ static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->tx_size = tx_size; txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOMMIN(this_rd, skip_rd), AOM_PLANE_Y, bs, tx_size, cpi->sf.use_fast_coef_costing, - ftxs_mode); + ftxs_mode, skip_trellis); if 
(rd_stats->rate == INT_MAX) return INT64_MAX; + // rdstats->rate should include all the rate except skip/non-skip cost as the + // same is accounted in the caller functions after rd evaluation of all + // planes. However the decisions should be done after considering the + // skip/non-skip header cost if (rd_stats->skip) { if (is_inter) { rd = RDCOST(x->rdmult, s1, rd_stats->sse); -#if CONFIG_ONE_PASS_SVM - // TODO(chiyotsai@google.com): Investigate if these updates are really - // needed. - av1_reg_stat_skipmode_update(rd_stats, x->rdmult); -#endif } else { rd = RDCOST(x->rdmult, s1 + r_tx_size * tx_select, rd_stats->sse); -#if CONFIG_ONE_PASS_SVM - av1_reg_stat_skipmode_update(rd_stats, x->rdmult); -#endif + rd_stats->rate += r_tx_size * tx_select; } } else { rd = RDCOST(x->rdmult, rd_stats->rate + s0 + r_tx_size * tx_select, rd_stats->dist); + rd_stats->rate += r_tx_size * tx_select; + } + if (is_inter && !xd->lossless[xd->mi[0]->segment_id]) { + int64_t temp_skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse); + if (temp_skip_rd <= rd) { + rd = temp_skip_rd; + rd_stats->rate = 0; + rd_stats->dist = rd_stats->sse; + rd_stats->skip = 1; + } } - - if (tx_select) rd_stats->rate += r_tx_size; - - if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip)) - rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); return rd; } static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs, - MACROBLOCK *x, int *r, int64_t *d, int *s, - int64_t *sse, int64_t ref_best_rd) { - RD_STATS rd_stats; + MACROBLOCK *x, int64_t ref_best_rd, + RD_STATS *rd_stats) { + MACROBLOCKD *const xd = &x->e_mbd; av1_subtract_plane(x, bs, 0); x->rd_model = LOW_TXFM_RD; - int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs, - max_txsize_rect_lookup[bs], FTXS_NONE); + int skip_trellis = cpi->optimize_seg_arr[xd->mi[0]->segment_id] == + NO_ESTIMATE_YRD_TRELLIS_OPT; + const int64_t rd = + txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, max_txsize_rect_lookup[bs], + 
FTXS_NONE, skip_trellis); x->rd_model = FULL_TXFM_RD; - *r = rd_stats.rate; - *d = rd_stats.dist; - *s = rd_stats.skip; - *sse = rd_stats.sse; + if (rd != INT64_MAX) { + const int skip_ctx = av1_get_skip_context(xd); + if (rd_stats->skip) { + const int s1 = x->skip_cost[skip_ctx][1]; + rd_stats->rate = s1; + } else { + const int s0 = x->skip_cost[skip_ctx][0]; + rd_stats->rate += s0; + } + } return rd; } @@ -3662,7 +3697,7 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOMMIN(this_rd, skip_rd), AOM_PLANE_Y, bs, mbmi->tx_size, - cpi->sf.use_fast_coef_costing, FTXS_NONE); + cpi->sf.use_fast_coef_costing, FTXS_NONE, 0); // Reset the pruning flags. av1_zero(x->tx_search_prune); x->tx_split_prune_flag = 0; @@ -3677,7 +3712,7 @@ static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->tx_size = TX_4X4; // TODO(any) : Pass this_rd based on skip/non-skip cost txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size, - cpi->sf.use_fast_coef_costing, FTXS_NONE); + cpi->sf.use_fast_coef_costing, FTXS_NONE, 0); } static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) { @@ -3707,55 +3742,64 @@ static int get_search_init_depth(int mi_width, int mi_height, int is_inter, static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs) { + av1_invalid_rd_stats(rd_stats); + const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - int64_t rd = INT64_MAX; - int n; - int start_tx; - int depth; - int64_t best_rd = INT64_MAX; const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs]; - TX_SIZE best_tx_size = max_rect_tx_size; - TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; - uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; - const int n4 = bsize_to_num_blk(bs); const int tx_select = cm->tx_mode == TX_MODE_SELECT; - - 
av1_invalid_rd_stats(rd_stats); + int start_tx; + int depth, init_depth; if (tx_select) { start_tx = max_rect_tx_size; - depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs], - is_inter_block(mbmi), &cpi->sf); + init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs], + is_inter_block(mbmi), &cpi->sf); } else { const TX_SIZE chosen_tx_size = tx_size_from_tx_mode(bs, cm->tx_mode); start_tx = chosen_tx_size; - depth = MAX_TX_DEPTH; + init_depth = MAX_TX_DEPTH; } prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16); - for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) { + TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN]; + uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; + TX_SIZE best_tx_size = max_rect_tx_size; + int64_t best_rd = INT64_MAX; + const int n4 = bsize_to_num_blk(bs); + x->rd_model = FULL_TXFM_RD; + depth = init_depth; + int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX }; + for (int n = start_tx; depth <= MAX_TX_DEPTH; + depth++, n = sub_tx_size_map[n]) { #if CONFIG_DIST_8X8 if (x->using_dist_8x8) { if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue; } #endif + if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[n] == TX_64X64) continue; + RD_STATS this_rd_stats; - if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD; - rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE); - x->rd_model = FULL_TXFM_RD; + rd[depth] = + txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE, 0); - if (rd < best_rd) { + if (rd[depth] < best_rd) { memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN); memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4); best_tx_size = n; - best_rd = rd; + best_rd = rd[depth]; *rd_stats = this_rd_stats; } if (n == TX_4X4) break; + // If we are searching three depths, prune the smallest size depending + // on rd results for the first two depths for low contrast blocks. 
+ if (depth > init_depth && depth != MAX_TX_DEPTH && + x->source_variance < 256) { + if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break; + } } if (rd_stats->rate != INT_MAX) { @@ -3770,14 +3814,245 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi, x->tx_split_prune_flag = 0; } +// origin_threshold * 128 / 100 +static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = { + { + 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68, + }, + { + 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68, + }, + { + 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74, + }, +}; + +// lookup table for predict_skip_flag +// int max_tx_size = max_txsize_rect_lookup[bsize]; +// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16) +// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16); +static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = { + TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, + TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4, + TX_8X8, TX_8X8, TX_16X16, TX_16X16, +}; + +// Uses simple features on top of DCT coefficients to quickly predict +// whether optimal RD decision is to skip encoding the residual. +// The sse value is stored in dist. +static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, + int reduced_tx_set) { + const int bw = block_size_wide[bsize]; + const int bh = block_size_high[bsize]; + const MACROBLOCKD *xd = &x->e_mbd; + const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); + + *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL); + + const int64_t mse = *dist / bw / bh; + // Normalized quantizer takes the transform upscaling factor (8 for tx size + // smaller than 32) into account. 
+ const int16_t normalized_dc_q = dc_q >> 3; + const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8; + // Predict not to skip when mse is larger than threshold. + if (mse > mse_thresh) return 0; + + const int max_tx_size = max_predict_sf_tx_size[bsize]; + const int tx_h = tx_size_high[max_tx_size]; + const int tx_w = tx_size_wide[max_tx_size]; + DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]); + TxfmParam param; + param.tx_type = DCT_DCT; + param.tx_size = max_tx_size; + param.bd = xd->bd; + param.is_hbd = is_cur_buf_hbd(xd); + param.lossless = 0; + param.tx_set_type = av1_get_ext_tx_set_type( + param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); + const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2); + const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize]; + const int16_t *src_diff = x->plane[0].src_diff; + const int n_coeff = tx_w * tx_h; + const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); + const uint32_t dc_thresh = max_qcoef_thresh * dc_q; + const uint32_t ac_thresh = max_qcoef_thresh * ac_q; + for (int row = 0; row < bh; row += tx_h) { + for (int col = 0; col < bw; col += tx_w) { + av1_fwd_txfm(src_diff + col, coefs, bw, ¶m); + // Operating on TX domain, not pixels; we want the QTX quantizers + const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7); + if (dc_coef >= dc_thresh) return 0; + for (int i = 1; i < n_coeff; ++i) { + const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7); + if (ac_coef >= ac_thresh) return 0; + } + } + src_diff += tx_h * bw; + } + return 1; +} + +// Used to set proper context for early termination with skip = 1. 
+static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize, + int64_t dist) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const int n4 = bsize_to_num_blk(bsize); + const TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; + memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN); + memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size)); + mbmi->tx_size = tx_size; + for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1); + rd_stats->skip = 1; + if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2); + rd_stats->dist = rd_stats->sse = (dist << 4); + // Though decision is to make the block as skip based on luma stats, + // it is possible that block becomes non skip after chroma rd. In addition + // intermediate non skip costs calculated by caller function will be + // incorrect, if rate is set as zero (i.e., if zero_blk_rate is not + // accounted). Hence intermediate rate is populated to code the luma tx blks + // as skip, the caller function based on final rd decision (i.e., skip vs + // non-skip) sets the final rate accordingly. Here the rate populated + // corresponds to coding all the tx blocks with zero_blk_rate (based on max tx + // size possible) in the current block. 
Eg: For 128*128 block, rate would be + // 4 * zero_blk_rate where zero_blk_rate corresponds to coding of one 64x64 tx + // block as 'all zeros' + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl); + ENTROPY_CONTEXT *ta = ctxa; + ENTROPY_CONTEXT *tl = ctxl; + const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); + TXB_CTX txb_ctx; + get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx); + const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y] + .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; + rd_stats->rate = zero_blk_rate * + (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) * + (block_size_high[bsize] >> tx_size_high_log2[tx_size]); +} + +static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { + const int rows = block_size_high[bsize]; + const int cols = block_size_wide[bsize]; + const int16_t *diff = x->plane[0].src_diff; + const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, + (uint8_t *)diff, 2 * rows * cols); + return (hash << 5) + bsize; +} + +static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x, + const RD_STATS *const rd_stats, + MB_RD_RECORD *tx_rd_record) { + int index; + if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) { + index = + (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN; + ++tx_rd_record->num; + } else { + index = tx_rd_record->index_start; + tx_rd_record->index_start = + (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN; + } + MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index]; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + tx_rd_info->hash_value = hash; + tx_rd_info->tx_size = mbmi->tx_size; + memcpy(tx_rd_info->blk_skip, x->blk_skip, + sizeof(tx_rd_info->blk_skip[0]) * n4); + av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size); + av1_copy(tx_rd_info->txk_type, mbmi->txk_type); + 
tx_rd_info->rd_stats = *rd_stats; +} + +static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info, + RD_STATS *const rd_stats, MACROBLOCK *const x) { + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + mbmi->tx_size = tx_rd_info->tx_size; + memcpy(x->blk_skip, tx_rd_info->blk_skip, + sizeof(tx_rd_info->blk_skip[0]) * n4); + av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size); + av1_copy(mbmi->txk_type, tx_rd_info->txk_type); + *rd_stats = tx_rd_info->rd_stats; +} + +static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record, + const int64_t ref_best_rd, + const uint32_t hash) { + int32_t match_index = -1; + if (ref_best_rd != INT64_MAX) { + for (int i = 0; i < mb_rd_record->num; ++i) { + const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN; + // If there is a match in the tx_rd_record, fetch the RD decision and + // terminate early. + if (mb_rd_record->tx_rd_info[index].hash_value == hash) { + match_index = index; + break; + } + } + } + return match_index; +} + static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bs, int64_t ref_best_rd) { MACROBLOCKD *xd = &x->e_mbd; av1_init_rd_stats(rd_stats); - + int is_inter = is_inter_block(xd->mi[0]); assert(bs == xd->mi[0]->sb_type); + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); + + uint32_t hash = 0; + int32_t match_index = -1; + MB_RD_RECORD *mb_rd_record = NULL; + const int within_border = mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end); + const int is_mb_rd_hash_enabled = + (within_border && cpi->sf.use_mb_rd_hash && is_inter); + const int n4 = bsize_to_num_blk(bs); + if (is_mb_rd_hash_enabled) { + hash = get_block_residue_hash(x, bs); + mb_rd_record = &x->mb_rd_record; + 
match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); + if (match_index != -1) { + MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index]; + fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x); + // Reset the pruning flags. + av1_zero(x->tx_search_prune); + x->tx_split_prune_flag = 0; + return; + } + } + + // If we predict that skip is the optimal RD decision - set the respective + // context and terminate early. + int64_t dist; + + if (cpi->sf.tx_type_search.use_skip_flag_prediction && is_inter && + (!xd->lossless[xd->mi[0]->segment_id]) && + predict_skip_flag(x, bs, &dist, cpi->common.reduced_tx_set_used)) { + // Populate rdstats as per skip decision + set_skip_flag(x, rd_stats, bs, dist); + // Save the RD search results into tx_rd_record. + if (is_mb_rd_hash_enabled) + save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record); + // Reset the pruning flags. + av1_zero(x->tx_search_prune); + x->tx_split_prune_flag = 0; + return; + } + if (xd->lossless[xd->mi[0]->segment_id]) { choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs); } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) { @@ -3785,6 +4060,12 @@ static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x, } else { choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs); } + + // Save the RD search results into tx_rd_record. + if (is_mb_rd_hash_enabled) { + assert(mb_rd_record != NULL); + save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record); + } } // Return the rate cost for luma prediction mode info. of intra blocks. @@ -4527,6 +4808,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, const int *bmode_costs; PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const int try_palette = + cpi->oxcf.enable_palette && av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); uint8_t *best_palette_color_map = try_palette ? 
x->palette_buffer->best_palette_color_map : NULL; @@ -4542,8 +4824,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (cpi->sf.intra_angle_estimation) { const int src_stride = x->plane[0].src.stride; const uint8_t *src = x->plane[0].src.buf; - angle_estimation(src, src_stride, rows, cols, bsize, - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, + angle_estimation(src, src_stride, rows, cols, bsize, is_cur_buf_hbd(xd), directional_mode_skip_mask); } mbmi->filter_intra_mode_info.use_filter_intra = 0; @@ -4561,6 +4842,11 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int this_rate, this_rate_tokenonly, s; int64_t this_distortion, this_rd, this_model_rd; mbmi->mode = intra_rd_search_mode_order[mode_idx]; + if ((!cpi->oxcf.enable_smooth_intra || cpi->sf.disable_smooth_intra) && + (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || + mbmi->mode == SMOOTH_V_PRED)) + continue; + if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue; mbmi->angle_delta[PLANE_TYPE_Y] = 0; this_model_rd = intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col); @@ -4570,7 +4856,8 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (this_model_rd < best_model_rd) best_model_rd = this_model_rd; is_directional_mode = av1_is_directional_mode(mbmi->mode); if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue; - if (is_directional_mode && av1_use_angle_delta(bsize)) { + if (is_directional_mode && av1_use_angle_delta(bsize) && + cpi->oxcf.enable_angle_delta) { this_rd_stats.rate = INT_MAX; rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate, &this_rd_stats, bsize, bmode_costs[mbmi->mode], @@ -4649,6 +4936,8 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); int plane; int is_cost_valid = 1; + const int is_inter = is_inter_block(mbmi); + int64_t 
this_rd = 0, skip_rd = 0; av1_init_rd_stats(rd_stats); if (ref_best_rd < 0) is_cost_valid = 0; @@ -4657,7 +4946,7 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y); - if (is_inter_block(mbmi) && is_cost_valid) { + if (is_inter && is_cost_valid) { for (plane = 1; plane < MAX_MB_PLANE; ++plane) av1_subtract_plane(x, bsize, plane); } @@ -4665,15 +4954,26 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, if (is_cost_valid) { for (plane = 1; plane < MAX_MB_PLANE; ++plane) { RD_STATS pn_rd_stats; - txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, 0, plane, bsize, - uv_tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE); + int64_t chroma_ref_best_rd = ref_best_rd; + // For inter blocks, refined ref_best_rd is used for early exit + // For intra blocks, even though current rd crosses ref_best_rd, early + // exit is not recommended as current rd is used for gating subsequent + // modes as well (say, for angular modes) + // TODO(any): Extend the early exit mechanism for intra modes as well + if (cpi->sf.perform_best_rd_based_gating_for_chroma && is_inter && + chroma_ref_best_rd != INT64_MAX) + chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_rd); + txfm_rd_in_plane(x, cpi, &pn_rd_stats, chroma_ref_best_rd, 0, plane, + bsize, uv_tx_size, cpi->sf.use_fast_coef_costing, + FTXS_NONE, 0); if (pn_rd_stats.rate == INT_MAX) { is_cost_valid = 0; break; } av1_merge_rd_stats(rd_stats, &pn_rd_stats); - if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) > ref_best_rd && - RDCOST(x->rdmult, 0, rd_stats->sse) > ref_best_rd) { + this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse); + if (AOMMIN(this_rd, skip_rd) > ref_best_rd) { is_cost_valid = 0; break; } @@ -4688,11 +4988,12 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x, return is_cost_valid; } -static void tx_block_rd_b(const 
AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, - int blk_row, int blk_col, int plane, int block, - int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats, - FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost, - TXB_RD_INFO *rd_info_array) { +// Pick transform type for a transform block of tx_size. +static void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, + int blk_row, int blk_col, int plane, int block, + int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats, + FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost, + TXB_RD_INFO *rd_info_array) { const struct macroblock_plane *const p = &x->plane[plane]; const uint16_t cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx; @@ -4720,7 +5021,7 @@ static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size, RD_STATS this_rd_stats; search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, - txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats); + txb_ctx, ftxs_mode, 0, 0, ref_rdcost, &this_rd_stats); av1_merge_rd_stats(rd_stats, &this_rd_stats); @@ -4885,9 +5186,9 @@ static void try_tx_block_no_split( rd_stats->zero_rate = zero_blk_rate; const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col); mbmi->inter_tx_size[index] = tx_size; - tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, - &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, - rd_info_node != NULL ? rd_info_node->rd_info_array : NULL); + tx_type_rd(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, &txb_ctx, + rd_stats, ftxs_mode, ref_best_rd, + rd_info_node != NULL ? 
rd_info_node->rd_info_array : NULL); assert(rd_stats->rate < INT_MAX); if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= @@ -4895,7 +5196,7 @@ static void try_tx_block_no_split( rd_stats->skip == 1) && !xd->lossless[mbmi->segment_id]) { #if CONFIG_RD_DEBUG - av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col, + av1_update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col, zero_blk_rate - rd_stats->rate); #endif // CONFIG_RD_DEBUG rd_stats->rate = zero_blk_rate; @@ -4918,13 +5219,6 @@ static void try_tx_block_no_split( const int txk_type_idx = av1_get_txk_type_index(plane_bsize, blk_row, blk_col); no_split->tx_type = mbmi->txk_type[txk_type_idx]; - -#if CONFIG_ONE_PASS_SVM - if (plane_bsize >= BLOCK_8X8) { - av1_add_reg_stat(rd_stats, p->eobs[block], no_split->rd, rd_stats->sse, - blk_row, blk_col, plane_bsize, txsize_to_bsize[tx_size]); - } -#endif } static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, @@ -4932,8 +5226,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, RD_STATS *rd_stats, - int64_t ref_best_rd, int *is_cost_valid, - FAST_TX_SEARCH_MODE ftxs_mode, + int64_t prev_level_rd, int64_t ref_best_rd, + int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node); static void try_tx_block_split( @@ -4943,6 +5237,7 @@ static void try_tx_block_split( int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node, RD_STATS *split_rd_stats, int64_t *split_rd) { + assert(tx_size < TX_SIZES_ALL); MACROBLOCKD *const xd = &x->e_mbd; const int max_blocks_high = max_block_high(xd, plane_bsize, 0); const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0); @@ -4950,44 +5245,37 @@ static void try_tx_block_split( const int bsw = tx_size_wide_unit[sub_txs]; const int bsh = 
tx_size_high_unit[sub_txs]; const int sub_step = bsw * bsh; - RD_STATS this_rd_stats; - int this_cost_valid = 1; + const int nblks = + (tx_size_high_unit[tx_size] / bsh) * (tx_size_wide_unit[tx_size] / bsw); + assert(nblks > 0); + int blk_idx = 0; int64_t tmp_rd = 0; - + *split_rd = INT64_MAX; split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1]; - assert(tx_size < TX_SIZES_ALL); - - int blk_idx = 0; for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) { for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) { + assert(blk_idx < 4); const int offsetr = blk_row + r; const int offsetc = blk_col + c; if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue; - assert(blk_idx < 4); + + RD_STATS this_rd_stats; + int this_cost_valid = 1; select_tx_block( cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta, - tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd, - &this_cost_valid, ftxs_mode, + tl, tx_above, tx_left, &this_rd_stats, no_split_rd / nblks, + ref_best_rd - tmp_rd, &this_cost_valid, ftxs_mode, (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL); - - if (!this_cost_valid) goto LOOP_EXIT; - + if (!this_cost_valid) return; av1_merge_rd_stats(split_rd_stats, &this_rd_stats); - tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist); - - if (no_split_rd < tmp_rd) { - this_cost_valid = 0; - goto LOOP_EXIT; - } + if (no_split_rd < tmp_rd) return; block += sub_step; } } -LOOP_EXIT : {} - - if (this_cost_valid) *split_rd = tmp_rd; + *split_rd = tmp_rd; } // Search for the best tx partition/type for a given luma block. 
@@ -4996,8 +5284,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left, RD_STATS *rd_stats, - int64_t ref_best_rd, int *is_cost_valid, - FAST_TX_SEARCH_MODE ftxs_mode, + int64_t prev_level_rd, int64_t ref_best_rd, + int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node) { assert(tx_size < TX_SIZES_ALL); av1_init_rd_stats(rd_stats); @@ -5017,7 +5305,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, mbmi->sb_type, tx_size); struct macroblock_plane *const p = &x->plane[0]; - const int try_no_split = 1; + const int try_no_split = + cpi->oxcf.enable_tx64 || txsize_sqr_up_map[tx_size] != TX_64X64; int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH; #if CONFIG_DIST_8X8 if (x->using_dist_8x8) @@ -5042,6 +5331,13 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, if (cpi->sf.txb_split_cap) { if (p->eobs[block] == 0) try_split = 0; } + + if (cpi->sf.adaptive_txb_search_level && + (no_split.rd - + (no_split.rd >> (2 + cpi->sf.adaptive_txb_search_level))) > + prev_level_rd) { + try_split = 0; + } } if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) { @@ -5089,98 +5385,12 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, } } -static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t ref_best_rd, - FAST_TX_SEARCH_MODE ftxs_mode, - TXB_RD_INFO_NODE *rd_info_tree) { - MACROBLOCKD *const xd = &x->e_mbd; - int is_cost_valid = 1; - int64_t this_rd = 0, skip_rd = 0; - - if (ref_best_rd < 0) is_cost_valid = 0; - - av1_init_rd_stats(rd_stats); - - if (is_cost_valid) { - const struct macroblockd_plane *const pd = &xd->plane[0]; - const BLOCK_SIZE plane_bsize = - get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); - const int mi_width = 
mi_size_wide[plane_bsize]; - const int mi_height = mi_size_high[plane_bsize]; - const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize]; - const int bh = tx_size_high_unit[max_tx_size]; - const int bw = tx_size_wide_unit[max_tx_size]; - int idx, idy; - int block = 0; - int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size]; - ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; - ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; - TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; - TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; - - RD_STATS pn_rd_stats; - const int init_depth = - get_search_init_depth(mi_width, mi_height, 1, &cpi->sf); - av1_init_rd_stats(&pn_rd_stats); - - av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); - memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); - memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); - const int skip_ctx = av1_get_skip_context(xd); - const int s0 = x->skip_cost[skip_ctx][0]; - const int s1 = x->skip_cost[skip_ctx][1]; - - skip_rd = RDCOST(x->rdmult, s1, 0); - this_rd = RDCOST(x->rdmult, s0, 0); - for (idy = 0; idy < mi_height; idy += bh) { - for (idx = 0; idx < mi_width; idx += bw) { - int64_t best_rd_sofar = (ref_best_rd - (AOMMIN(skip_rd, this_rd))); - select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, - plane_bsize, ctxa, ctxl, tx_above, tx_left, - &pn_rd_stats, best_rd_sofar, &is_cost_valid, ftxs_mode, - rd_info_tree); - if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) { - av1_invalid_rd_stats(rd_stats); - return; - } - av1_merge_rd_stats(rd_stats, &pn_rd_stats); - skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse); - this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); - block += step; - if (rd_info_tree != NULL) rd_info_tree += 1; - } - } - if (skip_rd <= this_rd) { - rd_stats->rate = 0; - rd_stats->dist = rd_stats->sse; - rd_stats->skip = 1; -#if CONFIG_ONE_PASS_SVM - av1_reg_stat_skipmode_update(rd_stats, x->rdmult); -#endif - } else { - rd_stats->skip = 0; - } - } - - 
if (!is_cost_valid) { - // reset cost value - av1_invalid_rd_stats(rd_stats); - } -} - -static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, +static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x, RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t ref_best_rd, TXB_RD_INFO_NODE *rd_info_tree) { - const int fast_tx_search = cpi->sf.tx_size_search_method > USE_FULL_RD; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = xd->mi[0]; - const int is_inter = is_inter_block(mbmi); - const int skip_ctx = av1_get_skip_context(xd); - int s0 = x->skip_cost[skip_ctx][0]; - int s1 = x->skip_cost[skip_ctx][1]; - int64_t rd; + assert(is_inter_block(xd->mi[0])); // TODO(debargha): enable this as a speed feature where the // select_inter_block_yrd() function above will use a simplified search @@ -5188,16 +5398,71 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, // will use more complex search given that the transform partitions have // already been decided. + const int fast_tx_search = cpi->sf.tx_size_search_method > USE_FULL_RD; int64_t rd_thresh = ref_best_rd; if (fast_tx_search && rd_thresh < INT64_MAX) { if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3); } assert(rd_thresh > 0); - FAST_TX_SEARCH_MODE ftxs_mode = + const FAST_TX_SEARCH_MODE ftxs_mode = fast_tx_search ? 
FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE; - select_inter_block_yrd(cpi, x, rd_stats, bsize, rd_thresh, ftxs_mode, - rd_info_tree); + const struct macroblockd_plane *const pd = &xd->plane[0]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int mi_width = mi_size_wide[plane_bsize]; + const int mi_height = mi_size_high[plane_bsize]; + ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; + ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_above[MAX_MIB_SIZE]; + TXFM_CONTEXT tx_left[MAX_MIB_SIZE]; + av1_get_entropy_contexts(bsize, pd, ctxa, ctxl); + memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width); + memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height); + + const int skip_ctx = av1_get_skip_context(xd); + const int s0 = x->skip_cost[skip_ctx][0]; + const int s1 = x->skip_cost[skip_ctx][1]; + const int init_depth = + get_search_init_depth(mi_width, mi_height, 1, &cpi->sf); + const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize]; + const int bh = tx_size_high_unit[max_tx_size]; + const int bw = tx_size_wide_unit[max_tx_size]; + const int step = bw * bh; + int64_t skip_rd = RDCOST(x->rdmult, s1, 0); + int64_t this_rd = RDCOST(x->rdmult, s0, 0); + int block = 0; + + av1_init_rd_stats(rd_stats); + for (int idy = 0; idy < mi_height; idy += bh) { + for (int idx = 0; idx < mi_width; idx += bw) { + const int64_t best_rd_sofar = + (rd_thresh == INT64_MAX) ? 
INT64_MAX + : (rd_thresh - (AOMMIN(skip_rd, this_rd))); + int is_cost_valid = 1; + RD_STATS pn_rd_stats; + select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth, + plane_bsize, ctxa, ctxl, tx_above, tx_left, &pn_rd_stats, + INT64_MAX, best_rd_sofar, &is_cost_valid, ftxs_mode, + rd_info_tree); + if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) { + av1_invalid_rd_stats(rd_stats); + return INT64_MAX; + } + av1_merge_rd_stats(rd_stats, &pn_rd_stats); + skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse); + this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); + block += step; + if (rd_info_tree != NULL) rd_info_tree += 1; + } + } + + if (skip_rd <= this_rd) { + rd_stats->skip = 1; + } else { + rd_stats->skip = 0; + } + if (rd_stats->rate == INT_MAX) return INT64_MAX; // If fast_tx_search is true, only DCT and 1D DCT were tested in @@ -5208,20 +5473,15 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x, return INT64_MAX; } + int64_t rd; if (rd_stats->skip) { rd = RDCOST(x->rdmult, s1, rd_stats->sse); -#if CONFIG_ONE_PASS_SVM - // TODO(chiyotsai@google.com): Investigate if these updates are really - // needed. 
- av1_reg_stat_skipmode_update(rd_stats, x->rdmult); -#endif } else { rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist); + if (!xd->lossless[xd->mi[0]->segment_id]) + rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); } - if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip)) - rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse)); - return rd; } @@ -5260,8 +5520,8 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; rd_stats->zero_rate = zero_blk_rate; rd_stats->ref_rdcost = ref_best_rd; - tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, - &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL); + tx_type_rd(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, + &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL); const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >= RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) || @@ -5274,20 +5534,9 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, x->plane[0].txb_entropy_ctx[block] = 0; update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size, DCT_DCT); -#if CONFIG_ONE_PASS_SVM - av1_add_reg_stat(rd_stats, 0, RDCOST(x->rdmult, 0, rd_stats->sse), - rd_stats->sse, blk_row, blk_col, plane_bsize, - txsize_to_bsize[tx_size]); -#endif } else { rd_stats->skip = 0; set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0); -#if CONFIG_ONE_PASS_SVM - av1_add_reg_stat(rd_stats, x->plane[0].eobs[block], - RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist), - rd_stats->sse, blk_row, blk_col, plane_bsize, - txsize_to_bsize[tx_size]); -#endif } if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH) rd_stats->rate += x->txfm_partition_cost[ctx][0]; @@ -5395,11 +5644,6 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, rd_stats->rate = 0; rd_stats->dist = rd_stats->sse; rd_stats->skip = 1; -#if 
CONFIG_ONE_PASS_SVM - // TODO(chiyotasi@google.com): Investigate if these updates are really - // needed. - av1_reg_stat_skipmode_update(rd_stats, x->rdmult); -#endif } if (this_rd > ref_best_rd) is_cost_valid = 0; @@ -5410,52 +5654,6 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, return is_cost_valid; } -static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) { - const int rows = block_size_high[bsize]; - const int cols = block_size_wide[bsize]; - const int16_t *diff = x->plane[0].src_diff; - const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator, - (uint8_t *)diff, 2 * rows * cols); - return (hash << 5) + bsize; -} - -static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x, - const RD_STATS *const rd_stats, - MB_RD_RECORD *tx_rd_record) { - int index; - if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) { - index = - (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN; - ++tx_rd_record->num; - } else { - index = tx_rd_record->index_start; - tx_rd_record->index_start = - (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN; - } - MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index]; - const MACROBLOCKD *const xd = &x->e_mbd; - const MB_MODE_INFO *const mbmi = xd->mi[0]; - tx_rd_info->hash_value = hash; - tx_rd_info->tx_size = mbmi->tx_size; - memcpy(tx_rd_info->blk_skip, x->blk_skip, - sizeof(tx_rd_info->blk_skip[0]) * n4); - av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size); - av1_copy(tx_rd_info->txk_type, mbmi->txk_type); - tx_rd_info->rd_stats = *rd_stats; -} - -static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info, - RD_STATS *const rd_stats, MACROBLOCK *const x) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = xd->mi[0]; - mbmi->tx_size = tx_rd_info->tx_size; - memcpy(x->blk_skip, tx_rd_info->blk_skip, - sizeof(tx_rd_info->blk_skip[0]) * n4); - av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size); - 
av1_copy(mbmi->txk_type, tx_rd_info->txk_type); - *rd_stats = tx_rd_info->rd_stats; -} - static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash) { // Linear search through the circular buffer to find matching hash. @@ -5706,158 +5904,13 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, return 1; } -// origin_threshold * 128 / 100 -static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = { - { - 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68, - }, - { - 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68, - 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68, - }, - { - 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74, - 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74, - }, -}; - -// lookup table for predict_skip_flag -// int max_tx_size = max_txsize_rect_lookup[bsize]; -// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16) -// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16); -static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = { - TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8, - TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, - TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4, - TX_8X8, TX_8X8, TX_16X16, TX_16X16, -}; - -// Uses simple features on top of DCT coefficients to quickly predict -// whether optimal RD decision is to skip encoding the residual. -// The sse value is stored in dist. -static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist, - int reduced_tx_set) { - const int bw = block_size_wide[bsize]; - const int bh = block_size_high[bsize]; - const MACROBLOCKD *xd = &x->e_mbd; - const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd); - - *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize); - const int64_t mse = *dist / bw / bh; - // Normalized quantizer takes the transform upscaling factor (8 for tx size - // smaller than 32) into account. 
- const int16_t normalized_dc_q = dc_q >> 3; - const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8; - // Predict not to skip when mse is larger than threshold. - if (mse > mse_thresh) return 0; - - const int max_tx_size = max_predict_sf_tx_size[bsize]; - const int tx_h = tx_size_high[max_tx_size]; - const int tx_w = tx_size_wide[max_tx_size]; - DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]); - TxfmParam param; - param.tx_type = DCT_DCT; - param.tx_size = max_tx_size; - param.bd = xd->bd; - param.is_hbd = get_bitdepth_data_path_index(xd); - param.lossless = 0; - param.tx_set_type = av1_get_ext_tx_set_type( - param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set); - const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2); - const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize]; - const int16_t *src_diff = x->plane[0].src_diff; - const int n_coeff = tx_w * tx_h; - const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd); - const uint32_t dc_thresh = max_qcoef_thresh * dc_q; - const uint32_t ac_thresh = max_qcoef_thresh * ac_q; - for (int row = 0; row < bh; row += tx_h) { - for (int col = 0; col < bw; col += tx_w) { - av1_fwd_txfm(src_diff + col, coefs, bw, ¶m); - // Operating on TX domain, not pixels; we want the QTX quantizers - const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7); - if (dc_coef >= dc_thresh) return 0; - for (int i = 1; i < n_coeff; ++i) { - const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7); - if (ac_coef >= ac_thresh) return 0; - } - } - src_diff += tx_h * bw; - } - return 1; -} - -#if CONFIG_ONE_PASS_SVM -static void calc_regional_sse(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t dist, - RD_STATS *rd_stats) { - // TODO(chiyotsai@google.com): Don't need regional sse's unless we are doing - // none. 
- const int bw = block_size_wide[bsize]; - const int bw_mi = bw >> tx_size_wide_log2[0]; - const int bh_mi = bw >> tx_size_high_log2[0]; - const BLOCK_SIZE split_size = get_partition_subsize(bsize, PARTITION_SPLIT); - int64_t dist_0, dist_1, dist_2, dist_3; - MACROBLOCKD *xd = &x->e_mbd; - dist_0 = pixel_diff_dist(x, AOM_PLANE_Y, 0, 0, bsize, split_size); - dist_1 = pixel_diff_dist(x, AOM_PLANE_Y, 0, bw_mi / 2, bsize, split_size); - dist_2 = pixel_diff_dist(x, AOM_PLANE_Y, bh_mi / 2, 0, bsize, split_size); - dist_3 = - pixel_diff_dist(x, AOM_PLANE_Y, bh_mi / 2, bw_mi / 2, bsize, split_size); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2); - dist_0 = ROUND_POWER_OF_TWO(dist_0, (xd->bd - 8) * 2); - dist_1 = ROUND_POWER_OF_TWO(dist_1, (xd->bd - 8) * 2); - dist_2 = ROUND_POWER_OF_TWO(dist_2, (xd->bd - 8) * 2); - dist_3 = ROUND_POWER_OF_TWO(dist_3, (xd->bd - 8) * 2); - } - const int scaling_factor = MAX_MIB_SIZE * MAX_MIB_SIZE; - rd_stats->y_sse = (dist << 4); - rd_stats->sse_0 = (dist_0 << 4) * scaling_factor; - rd_stats->sse_1 = (dist_1 << 4) * scaling_factor; - rd_stats->sse_2 = (dist_2 << 4) * scaling_factor; - rd_stats->sse_3 = (dist_3 << 4) * scaling_factor; - av1_reg_stat_skipmode_update(rd_stats, x->rdmult); -} -#endif - -// Used to set proper context for early termination with skip = 1. 
-static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize, - int64_t dist) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = xd->mi[0]; - const int n4 = bsize_to_num_blk(bsize); - const TX_SIZE tx_size = max_txsize_rect_lookup[bsize]; - memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN); - memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size)); - mbmi->tx_size = tx_size; - for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1); - rd_stats->skip = 1; - rd_stats->rate = 0; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2); - rd_stats->dist = rd_stats->sse = (dist << 4); -} - -static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row, - int mi_col, int64_t ref_best_rd) { +// Search for best transform size and type for luma inter blocks. +static void pick_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, + RD_STATS *rd_stats, BLOCK_SIZE bsize, + int mi_row, int mi_col, int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = xd->mi[0]; - int64_t rd = INT64_MAX; - int64_t best_rd = INT64_MAX; - const int is_inter = is_inter_block(mbmi); - const int n4 = bsize_to_num_blk(bsize); - // Get the tx_size 1 level down - const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]]; - const TxSetType tx_set_type = - av1_get_ext_tx_set_type(min_tx_size, is_inter, cm->reduced_tx_set_used); - const int within_border = - mi_row >= xd->tile.mi_row_start && - (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) && - mi_col >= xd->tile.mi_col_start && - (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end); + assert(is_inter_block(xd->mi[0])); av1_invalid_rd_stats(rd_stats); @@ -5874,8 +5927,7 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, // tighter. 
assert(cpi->sf.model_based_prune_tx_search_level >= 0 && cpi->sf.model_based_prune_tx_search_level <= 2); - static const int prune_factor_by8[] = { 2 + MODELRD_TYPE_TX_SEARCH_PRUNE, - 4 + MODELRD_TYPE_TX_SEARCH_PRUNE }; + static const int prune_factor_by8[] = { 3, 5 }; if (!model_skip && ((model_rd * prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >> @@ -5883,38 +5935,41 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, return; } - const uint32_t hash = get_block_residue_hash(x, bsize); - MB_RD_RECORD *mb_rd_record = &x->mb_rd_record; - - if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_mb_rd_hash) { - for (int i = 0; i < mb_rd_record->num; ++i) { - const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN; - // If there is a match in the tx_rd_record, fetch the RD decision and - // terminate early. - if (mb_rd_record->tx_rd_info[index].hash_value == hash) { - MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[index]; - fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x); - return; - } + uint32_t hash = 0; + int32_t match_index = -1; + MB_RD_RECORD *mb_rd_record = NULL; + const int within_border = + mi_row >= xd->tile.mi_row_start && + (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) && + mi_col >= xd->tile.mi_col_start && + (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end); + const int is_mb_rd_hash_enabled = (within_border && cpi->sf.use_mb_rd_hash); + const int n4 = bsize_to_num_blk(bsize); + if (is_mb_rd_hash_enabled) { + hash = get_block_residue_hash(x, bsize); + mb_rd_record = &x->mb_rd_record; + match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash); + if (match_index != -1) { + MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index]; + fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x); + return; } } // If we predict that skip is the optimal RD decision - set the respective // context and terminate early. 
int64_t dist; - if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction && + if (cpi->sf.tx_type_search.use_skip_flag_prediction && predict_skip_flag(x, bsize, &dist, cm->reduced_tx_set_used)) { set_skip_flag(x, rd_stats, bsize, dist); -#if CONFIG_ONE_PASS_SVM - if (bsize >= BLOCK_8X8 && mi_size_wide[bsize] == mi_size_high[bsize] && - mbmi->partition == PARTITION_NONE) { - calc_regional_sse(x, bsize, dist, rd_stats); - } -#endif // Save the RD search results into tx_rd_record. - if (within_border) save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record); + if (is_mb_rd_hash_enabled) + save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record); return; } +#if CONFIG_SPEED_STATS + ++x->tx_search_count; +#endif // CONFIG_SPEED_STATS // Precompute residual hashes and find existing or add new RD records to // store and reuse rate and distortion values to speed up TX size search. @@ -5925,20 +5980,20 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, find_tx_size_rd_records(x, bsize, mi_row, mi_col, matched_rd_info); } + // Get the tx_size 1 level down + const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]]; + const TxSetType tx_set_type = + av1_get_ext_tx_set_type(min_tx_size, 1, cm->reduced_tx_set_used); prune_tx(cpi, bsize, x, xd, tx_set_type); int found = 0; - RD_STATS this_rd_stats; av1_init_rd_stats(&this_rd_stats); + const int64_t rd = + select_tx_size_and_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, + found_rd_info ? matched_rd_info : NULL); - rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd, - found_rd_info ? 
matched_rd_info : NULL); - assert(IMPLIES(this_rd_stats.skip && !this_rd_stats.invalid_rate, - this_rd_stats.rate == 0)); - - ref_best_rd = AOMMIN(rd, ref_best_rd); - if (rd < best_rd) { + if (rd < INT64_MAX) { *rd_stats = this_rd_stats; found = 1; } @@ -5954,136 +6009,76 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x, if (!found) return; // Save the RD search results into tx_rd_record. - if (within_border && cpi->sf.use_mb_rd_hash) + if (is_mb_rd_hash_enabled) { + assert(mb_rd_record != NULL); save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record); -} - -#define FAVOR_CHROMA_SKIP 1 -static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, - int blk_col, int plane, int block, TX_SIZE tx_size, - BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx, - ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats, - FAST_TX_SEARCH_MODE ftxs_mode) { - assert(plane > 0); - assert(tx_size < TX_SIZES_ALL); - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = xd->mi[0]; - const int max_blocks_high = max_block_high(xd, plane_bsize, plane); - const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane); - if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; - - ENTROPY_CONTEXT *ta = above_ctx + blk_col; - ENTROPY_CONTEXT *tl = left_ctx + blk_row; - TXB_CTX txb_ctx; - get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx); - const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size); - const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_UV] - .txb_skip_cost[txb_ctx.txb_skip_ctx][1]; - tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize, - &txb_ctx, rd_stats, ftxs_mode, INT64_MAX, NULL); - - const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int blk_idx = blk_row * mi_width + blk_col; - const int64_t rdmult = x->rdmult * plane_rd_mult[1][PLANE_TYPE_UV] / - plane_rd_mult[1][PLANE_TYPE_Y]; - av1_set_txb_context(x, plane, block, tx_size, ta, tl); - if 
((RDCOST(rdmult, rd_stats->rate, rd_stats->dist) >= - RDCOST(rdmult, zero_blk_rate, rd_stats->sse) || - rd_stats->skip == 1) && - !xd->lossless[mbmi->segment_id]) { - rd_stats->rate = zero_blk_rate; - rd_stats->dist = rd_stats->sse; - rd_stats->skip = 1; -#if FAVOR_CHROMA_SKIP - x->plane[plane].eobs[block] = 0; - x->plane[plane].txb_entropy_ctx[block] = 0; - set_blk_skip(x, plane, blk_idx, 1); -#else - set_blk_skip(x, plane, blk_idx, 0); -#endif - } else { - set_blk_skip(x, plane, blk_idx, 0); } } -// Return value 0: early termination triggered, no valid rd cost available; -// 1: rd cost values are valid. -static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_stats, BLOCK_SIZE bsize, - int64_t non_skip_ref_best_rd, - int64_t skip_ref_best_rd, - FAST_TX_SEARCH_MODE ftxs_mode) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = xd->mi[0]; - int plane; - int is_cost_valid = 1; - int64_t this_rd = 0; - int64_t skip_rd = 0; - - if ((non_skip_ref_best_rd < 0) && (skip_ref_best_rd < 0)) is_cost_valid = 0; - - av1_init_rd_stats(rd_stats); +static void model_rd_for_sb_with_fullrdy( + const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, + int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum, + int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb, + int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) { + const int ref = xd->mi[0]->ref_frame[0]; - if (x->skip_chroma_rd) { - if (!is_cost_valid) av1_invalid_rd_stats(rd_stats); + int64_t rate_sum = 0; + int64_t dist_sum = 0; + int64_t total_sse = 0; - return is_cost_valid; - } + for (int plane = plane_from; plane <= plane_to; ++plane) { + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y); + const int bw = block_size_wide[plane_bsize]; + const int bh = 
block_size_high[plane_bsize]; + int64_t sse; + int rate; + int64_t dist; - const BLOCK_SIZE bsizec = scale_chroma_bsize( - bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y); + if (x->skip_chroma_rd && plane) continue; - if (is_inter_block(mbmi) && is_cost_valid) { - for (plane = 1; plane < MAX_MB_PLANE; ++plane) - av1_subtract_plane(x, bsizec, plane); - } + if (is_cur_buf_hbd(xd)) { + sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, + pd->dst.stride, bw, bh); + } else { + sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, + bh); + } + sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); - if (is_cost_valid) { - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const BLOCK_SIZE plane_bsize = - get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y); - const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0]; - const int mi_height = - block_size_high[plane_bsize] >> tx_size_high_log2[0]; - const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane); - const int bh = tx_size_high_unit[max_tx_size]; - const int bw = tx_size_wide_unit[max_tx_size]; - int idx, idy; - int block = 0; - const int step = bh * bw; - ENTROPY_CONTEXT ta[MAX_MIB_SIZE]; - ENTROPY_CONTEXT tl[MAX_MIB_SIZE]; - av1_get_entropy_contexts(bsizec, pd, ta, tl); - - for (idy = 0; idy < mi_height; idy += bh) { - for (idx = 0; idx < mi_width; idx += bw) { - RD_STATS pn_rd_stats; - av1_init_rd_stats(&pn_rd_stats); - tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size, - plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode); - if (pn_rd_stats.rate == INT_MAX) { - av1_invalid_rd_stats(rd_stats); - return 0; - } - av1_merge_rd_stats(rd_stats, &pn_rd_stats); - this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse); - if ((this_rd > non_skip_ref_best_rd) && - (skip_rd > skip_ref_best_rd)) { - 
av1_invalid_rd_stats(rd_stats); - return 0; - } - block += step; - } + RD_STATS rd_stats; + if (plane == 0) { + pick_tx_size_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, + INT64_MAX); + if (rd_stats.invalid_rate) { + rate = 0; + dist = sse << 4; + } else { + rate = rd_stats.rate; + dist = rd_stats.dist; } + } else { + model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, + &dist); } - } else { - // reset cost value - av1_invalid_rd_stats(rd_stats); + + if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX); + + total_sse += sse; + rate_sum += rate; + dist_sum += dist; + + if (plane_rate) plane_rate[plane] = rate; + if (plane_sse) plane_sse[plane] = sse; + if (plane_dist) plane_dist[plane] = dist; } - return is_cost_valid; + if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0; + if (skip_sse_sb) *skip_sse_sb = total_sse << 4; + *out_rate_sum = (int)rate_sum; + *out_dist_sum = dist_sum; } static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x, @@ -6331,7 +6326,7 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, const BLOCK_SIZE bsize = mbmi->sb_type; #if CONFIG_DEBUG - assert(is_cfl_allowed(xd)); + assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra); const int ssx = xd->plane[AOM_PLANE_U].subsampling_x; const int ssy = xd->plane[AOM_PLANE_U].subsampling_y; const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, ssx, ssy); @@ -6368,7 +6363,7 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, mbmi->cfl_alpha_idx = 0; mbmi->cfl_alpha_signs = joint_sign; txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, bsize, - tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE); + tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE, 0); if (rd_stats.rate == INT_MAX) break; } const int alpha_rate = x->cfl_cost[joint_sign][plane][0]; @@ -6396,7 +6391,8 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi, 
mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c; mbmi->cfl_alpha_signs = joint_sign; txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, bsize, - tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE); + tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE, + 0); if (rd_stats.rate == INT_MAX) break; } const int alpha_rate = x->cfl_cost[joint_sign][plane][c]; @@ -6469,18 +6465,24 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] & (1 << mode))) continue; + if (!cpi->oxcf.enable_smooth_intra && mode >= UV_SMOOTH_PRED && + mode <= UV_SMOOTH_H_PRED) + continue; + + if (!cpi->oxcf.enable_paeth_intra && mode == UV_PAETH_PRED) continue; mbmi->uv_mode = mode; int cfl_alpha_rate = 0; if (mode == UV_CFL_PRED) { - if (!is_cfl_allowed(xd)) continue; + if (!is_cfl_allowed(xd) || !cpi->oxcf.enable_cfl_intra) continue; assert(!is_directional_mode); const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd); cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd); if (cfl_alpha_rate == INT_MAX) continue; } mbmi->angle_delta[PLANE_TYPE_UV] = 0; - if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) { + if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type) && + cpi->oxcf.enable_angle_delta) { const int rate_overhead = x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode]; if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd, @@ -6497,7 +6499,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x, this_rate = tokenonly_rd_stats.rate + intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost); if (mode == UV_CFL_PRED) { - assert(is_cfl_allowed(xd)); + assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra); #if CONFIG_DEBUG if (!xd->lossless[mbmi->segment_id]) assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost); @@ -6516,6 +6518,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const 
cpi, MACROBLOCK *x, } const int try_palette = + cpi->oxcf.enable_palette && av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type); if (try_palette) { uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map; @@ -6619,35 +6622,6 @@ static int get_interinter_compound_mask_rate(const MACROBLOCK *const x, } } -typedef struct { - int eobs; - int brate; - int byrate; - int64_t bdist; - int64_t bsse; - int64_t brdcost; - int_mv mvs[2]; - int_mv pred_mv[2]; - int_mv ref_mv[2]; - - ENTROPY_CONTEXT ta[2]; - ENTROPY_CONTEXT tl[2]; -} SEG_RDSTAT; - -typedef struct { - int_mv *ref_mv[2]; - int_mv mvp; - - int64_t segment_rd; - int r; - int64_t d; - int64_t sse; - int segment_yrate; - PREDICTION_MODE modes[4]; - SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES]; - int mvthresh; -} BEST_SEG_INFO; - static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { return (mv->row >> 3) < mv_limits->row_min || (mv->row >> 3) > mv_limits->row_max || @@ -6693,7 +6667,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir; ConvolveParams conv_params = get_conv_params(0, plane, xd->bd); - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; WarpTypesAllowed warp_types[2]; for (ref = 0; ref < 2; ++ref) { const WarpedMotionParams *const wm = @@ -6734,7 +6708,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, } else { int_mv cur_int_mv, init_int_mv; cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3; - cur_int_mv.as_mv.row = cur_mv[id].as_mv.col >> 3; + cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3; init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3; init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3; if (cur_int_mv.as_int == init_int_mv.as_int) { @@ -6780,9 +6754,9 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x, mi_row * MI_SIZE, xd, cm->allow_warped_motion); const int order_idx = 
id != 0; - av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset, - &xd->jcp_param.bck_offset, - &xd->jcp_param.use_jnt_comp_avg, 1); + av1_dist_wtd_comp_weight_assign( + cm, mbmi, order_idx, &xd->jcp_param.fwd_offset, + &xd->jcp_param.bck_offset, &xd->jcp_param.use_dist_wtd_comp_avg, 1); // Do full-pixel compound motion search on the current reference frame. if (id) xd->plane[plane].pre[0] = ref_yv12[id]; @@ -7036,19 +7010,25 @@ static void setup_buffer_ref_mvs_inter( struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); - const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + const YV12_BUFFER_CONFIG *scaled_ref_frame = + av1_get_scaled_ref_frame(cpi, ref_frame); MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - const struct scale_factors *const sf = - &cm->current_frame.frame_refs[ref_frame - 1].sf; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, ref_frame); + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame); assert(yv12 != NULL); - // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this - // use the UV scaling factors. - av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf, - num_planes); + if (scaled_ref_frame) { + // Setup pred block based on scaled reference, because av1_mv_pred() doesn't + // support scaling. 
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, mi_row, + mi_col, NULL, NULL, num_planes); + } else { + av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf, + num_planes); + } // Gets an initial list of candidate vectors from neighbours and orders them av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count, @@ -7056,11 +7036,18 @@ static void setup_buffer_ref_mvs_inter( mi_col, mbmi_ext->mode_context); // Further refinement that is encode side only to test the top few candidates - // in full and choose the best as the centre point for subsequent searches. + // in full and choose the best as the center point for subsequent searches. // The current implementation doesn't support scaling. - (void)block_size; - av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame, - block_size); + av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12_mb[ref_frame][0].stride, + ref_frame, block_size); + + // Go back to unscaled reference. + if (scaled_ref_frame) { + // We had temporarily setup pred block based on scaled reference above. Go + // back to unscaled reference now, for subsequent use. 
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf, + num_planes); + } } static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, @@ -7165,13 +7152,13 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, bestsme = av1_full_pixel_search( cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0, sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1, - (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0); + (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0, &cpi->ss_cfg[SS_CFG_SRC]); break; case OBMC_CAUSAL: - bestsme = av1_obmc_full_pixel_search(cpi, x, &mvp_full, step_param, sadpb, - MAX_MVSEARCH_STEPS - 1 - step_param, - 1, &cpi->fn_ptr[bsize], &ref_mv, - &(x->best_mv.as_mv), 0); + bestsme = av1_obmc_full_pixel_search( + cpi, x, &mvp_full, step_param, sadpb, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv, + &(x->best_mv.as_mv), 0, &cpi->ss_cfg[SS_CFG_SRC]); break; default: assert(0 && "Invalid motion mode!\n"); } @@ -7264,10 +7251,9 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x, x->pred_mv[ref] = x->best_mv.as_mv; } -static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst, +static INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst, const int num_planes) { - int i; - for (i = 0; i < num_planes; i++) { + for (int i = 0; i < num_planes; i++) { xd->plane[i].dst.buf = dst.plane[i]; xd->plane[i].dst.stride = dst.stride[i]; } @@ -7314,9 +7300,9 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x, plane, !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE, xd, cm->allow_warped_motion); - av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset, - &xd->jcp_param.bck_offset, - &xd->jcp_param.use_jnt_comp_avg, 1); + av1_dist_wtd_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset, + &xd->jcp_param.bck_offset, + &xd->jcp_param.use_dist_wtd_comp_avg, 1); } // Search for the best mv for one 
component of a compound, @@ -7442,7 +7428,7 @@ static void compound_single_motion_search_interinter( // Prediction buffer from second frame. DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]); uint8_t *second_pred; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + if (is_cur_buf_hbd(xd)) second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16); else second_pred = (uint8_t *)second_pred_alloc_16; @@ -7572,7 +7558,7 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x, const BLOCK_SIZE f_index = split_qtr[bsize]; assert(f_index != BLOCK_INVALID); - if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(&x->e_mbd)) { pred0 = CONVERT_TO_BYTEPTR(pred0); pred1 = CONVERT_TO_BYTEPTR(pred1); } @@ -7622,7 +7608,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x, int wedge_types = (1 << get_wedge_bits_lookup(bsize)); const uint8_t *mask; uint64_t sse; - const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + const int hbd = is_cur_buf_hbd(xd); const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]); // src - pred0 @@ -7693,7 +7679,7 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi, int wedge_types = (1 << get_wedge_bits_lookup(bsize)); const uint8_t *mask; uint64_t sse; - const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + const int hbd = is_cur_buf_hbd(xd); const int bd_round = hbd ? (xd->bd - 8) * 2 : 0; for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) { mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize); @@ -7759,7 +7745,7 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi, DIFFWTD_MASK_TYPE cur_mask_type; int64_t best_rd = INT64_MAX; DIFFWTD_MASK_TYPE best_mask_type = 0; - const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + const int hbd = is_cur_buf_hbd(xd); const int bd_round = hbd ? 
(xd->bd - 8) * 2 : 0; DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]); uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask }; @@ -7810,7 +7796,7 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi, const int bh = block_size_high[bsize]; DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1 DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0 - if (get_bitdepth_data_path_index(xd)) { + if (is_cur_buf_hbd(xd)) { aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, CONVERT_TO_BYTEPTR(p1), bw, xd->bd); aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw, @@ -7889,7 +7875,7 @@ static void get_inter_predictors_masked_compound( av1_build_inter_predictors_for_planes_single_buf( xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous); const struct buf_2d *const src = &x->plane[0].src; - if (get_bitdepth_data_path_index(xd)) { + if (is_cur_buf_hbd(xd)) { aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride, CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd); aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1), @@ -7904,21 +7890,24 @@ static void get_inter_predictors_masked_compound( static int64_t build_and_cost_compound_type( const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv, const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2, - int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, + int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0, uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides, int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd, - int *calc_pred_masked_compound) { + int *calc_pred_masked_compound, int32_t *comp_rate, int64_t *comp_dist, + int64_t *const comp_model_rd, const int64_t comp_best_model_rd, + int64_t *const comp_model_rd_cur) { const AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = 
xd->mi[0]; - int rate_sum; - int64_t dist_sum; int64_t best_rd_cur = INT64_MAX; int64_t rd = INT64_MAX; - int tmp_skip_txfm_sb; - int64_t tmp_skip_sse_sb; const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + int rate_sum, tmp_skip_txfm_sb; + int64_t dist_sum, tmp_skip_sse_sb; + // TODO(any): Save pred and mask calculation as well into records. However + // this may increase memory requirements as compound segment mask needs to be + // stored in each record. if (*calc_pred_masked_compound) { get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0, preds1, residual1, diff10, strides); @@ -7926,7 +7915,7 @@ static int64_t build_and_cost_compound_type( } if (cpi->sf.prune_wedge_pred_diff_based && compound_type == COMPOUND_WEDGE) { unsigned int sse; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + if (is_cur_buf_hbd(xd)) (void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides, CONVERT_TO_BYTEPTR(*preds1), *strides, &sse); else @@ -7934,8 +7923,10 @@ static int64_t build_and_cost_compound_type( const unsigned int mse = ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]); // If two predictors are very similar, skip wedge compound mode search - if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) + if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) { + *comp_model_rd_cur = INT64_MAX; return INT64_MAX; + } } best_rd_cur = @@ -7947,34 +7938,76 @@ static int64_t build_and_cost_compound_type( // is unlikely to be the best mode considering the transform rd cost and other // mode overhead cost int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0); - if (mode_rd > ref_best_rd) return INT64_MAX; - - if (have_newmv_in_inter_mode(this_mode) && compound_type == COMPOUND_WEDGE) { - *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize, - this_mode, mi_row, mi_col); - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); - model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( - cpi, 
bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); - rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); - if (rd >= best_rd_cur) { - mbmi->mv[0].as_int = cur_mv[0].as_int; - mbmi->mv[1].as_int = cur_mv[1].as_int; + if (mode_rd > ref_best_rd) { + *comp_model_rd_cur = INT64_MAX; + return INT64_MAX; + } + + // Reuse data if matching record is found + if (comp_rate[compound_type] == INT_MAX) { + if (have_newmv_in_inter_mode(this_mode) && + compound_type == COMPOUND_WEDGE && + !cpi->sf.disable_interinter_wedge_newmv_search) { + *out_rate_mv = interinter_compound_motion_search( + cpi, x, cur_mv, bsize, this_mode, mi_row, mi_col); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); + *comp_model_rd_cur = rd; + if (rd >= best_rd_cur) { + mbmi->mv[0].as_int = cur_mv[0].as_int; + mbmi->mv[1].as_int = cur_mv[1].as_int; + *out_rate_mv = rate_mv; + av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, + strides, preds1, strides); + *comp_model_rd_cur = best_rd_cur; + } + } else { *out_rate_mv = rate_mv; av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, preds1, strides); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + *comp_model_rd_cur = + RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); } + RD_STATS rd_stats; + + if (cpi->sf.prune_comp_type_by_model_rd && + (*comp_model_rd_cur > comp_best_model_rd) && + comp_best_model_rd != INT64_MAX) { + *comp_model_rd_cur = INT64_MAX; + return INT64_MAX; + } + rd = estimate_yrd_for_sb(cpi, bsize, x, 
ref_best_rd, &rd_stats); + if (rd != INT64_MAX) { + rd = + RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist); + // Backup rate and distortion for future reuse + comp_rate[compound_type] = rd_stats.rate; + comp_dist[compound_type] = rd_stats.dist; + comp_model_rd[compound_type] = *comp_model_rd_cur; + } } else { + assert(comp_dist[compound_type] != INT64_MAX); + // When disable_interinter_wedge_newmv_search is set, motion refinement is + // disabled. Hence rate and distortion can be reused in this case as well + assert(IMPLIES(have_newmv_in_inter_mode(this_mode), + cpi->sf.disable_interinter_wedge_newmv_search)); + assert(mbmi->mv[0].as_int == cur_mv[0].as_int); + assert(mbmi->mv[1].as_int == cur_mv[1].as_int); *out_rate_mv = rate_mv; - av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides, - preds1, strides); + // Calculate RD cost based on stored stats + rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type], + comp_dist[compound_type]); + *comp_model_rd_cur = comp_model_rd[compound_type]; } - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum); - return rd; } @@ -8172,8 +8205,9 @@ static INLINE int get_switchable_rate(MACROBLOCK *const x, // calculate the rdcost of given interpolation_filter static INLINE int64_t interpolation_filter_rd( - MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, - int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd, + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col, + const BUFFER_SET *const orig_dst, int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb, int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx, const int switchable_ctx[2], const int skip_pred, int *rate, @@ -8196,6 +8230,8 @@ 
static INLINE int64_t interpolation_filter_rd( return 0; } + (void)tile_data; + assert(skip_pred != 2); assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags)); assert(rate[0] >= 0); @@ -8209,11 +8245,13 @@ static INLINE int64_t interpolation_filter_rd( if (skip_pred != cpi->default_interp_skip_flags) { if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) { - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); #if CONFIG_COLLECT_RD_STATS == 3 RD_STATS rd_stats_y; - select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX); - PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize); + pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, + INT64_MAX); + PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize); #endif // CONFIG_COLLECT_RD_STATS == 3 model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0], @@ -8234,8 +8272,8 @@ static INLINE int64_t interpolation_filter_rd( mbmi->interp_filters = last_best; return 0; } - av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, orig_dst, bsize, - plane); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + plane, plane); model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv, &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL); @@ -8287,21 +8325,103 @@ static INLINE int64_t interpolation_filter_rd( return 0; } +static INLINE void pred_dual_interp_filter_rd( + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col, + const BUFFER_SET *const orig_dst, int64_t *const rd, + int *const switchable_rate, int *const skip_txfm_sb, + int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], + InterpFilters filter_idx, const int switchable_ctx[2], const int 
skip_pred, + int *rate, int64_t *dist, InterpFilters af_horiz, InterpFilters af_vert, + InterpFilters lf_horiz, InterpFilters lf_vert) { + if ((af_horiz == lf_horiz) && (af_horiz != SWITCHABLE)) { + if (((af_vert == lf_vert) && (af_vert != SWITCHABLE))) { + filter_idx = af_horiz + (af_vert * SWITCHABLE_FILTERS); + if (filter_idx) { + interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col, + orig_dst, rd, switchable_rate, skip_txfm_sb, + skip_sse_sb, dst_bufs, filter_idx, + switchable_ctx, skip_pred, rate, dist); + } + } else { + for (filter_idx = af_horiz; filter_idx < (DUAL_FILTER_SET_SIZE); + filter_idx += SWITCHABLE_FILTERS) { + if (filter_idx) { + interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col, + orig_dst, rd, switchable_rate, skip_txfm_sb, + skip_sse_sb, dst_bufs, filter_idx, + switchable_ctx, skip_pred, rate, dist); + } + } + } + } else if ((af_vert == lf_vert) && (af_vert != SWITCHABLE)) { + for (filter_idx = (af_vert * SWITCHABLE_FILTERS); + filter_idx <= ((af_vert * SWITCHABLE_FILTERS) + 2); filter_idx += 1) { + if (filter_idx) { + interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col, + orig_dst, rd, switchable_rate, skip_txfm_sb, + skip_sse_sb, dst_bufs, filter_idx, + switchable_ctx, skip_pred, rate, dist); + } + } + } +} + // Find the best interp filter if dual_interp_filter = 0 static INLINE void find_best_non_dual_interp_filter( - MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, - int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd, + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col, + const BUFFER_SET *const orig_dst, int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb, int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], const int switchable_ctx[2], const int skip_ver, const int skip_hor, int *rate, int64_t *dist, int filter_set_size) { int16_t i; + MACROBLOCKD *const xd = 
&x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; // Regular filter evaluation should have been done and hence the same should // be the winner assert(x->e_mbd.mi[0]->interp_filters == filter_sets[0]); assert(filter_set_size == DUAL_FILTER_SET_SIZE); - + if ((skip_hor & skip_ver) != cpi->default_interp_skip_flags) { + const AV1_COMMON *cm = &cpi->common; + int bsl, pred_filter_search; + InterpFilters af = SWITCHABLE, lf = SWITCHABLE, filter_idx = 0; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + bsl = mi_size_wide_log2[bsize]; + pred_filter_search = + cpi->sf.cb_pred_filter_search + ? (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_frame.frame_number)) & + 0x1 + : 0; + if (above_mbmi && is_inter_block(above_mbmi)) { + af = above_mbmi->interp_filters; + } + if (left_mbmi && is_inter_block(left_mbmi)) { + lf = left_mbmi->interp_filters; + } + pred_filter_search &= ((af == lf) && (af != SWITCHABLE)); + if (pred_filter_search) { + filter_idx = SWITCHABLE * (af & 0xf); + // This assert tells that (filter_x == filter_y) for non-dual filter case + assert((filter_sets[filter_idx] & 0xffff) == + (filter_sets[filter_idx] >> 16)); + if (cpi->sf.adaptive_interp_filter_search && + (cpi->sf.interp_filter_search_mask & (1 << (filter_idx >> 2)))) { + return; + } + if (filter_idx) { + interpolation_filter_rd( + x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, skip_txfm_sb, skip_sse_sb, dst_bufs, filter_idx, + switchable_ctx, (skip_hor & skip_ver), rate, dist); + } + return; + } + } // Reuse regular filter's modeled rd data for sharp filter for following // cases // 1) When bsize is 4x4 @@ -8321,10 +8441,14 @@ static INLINE void find_best_non_dual_interp_filter( for (i = filter_set_size - 1; i > 0; i -= (SWITCHABLE_FILTERS + 1)) { // This assert tells that (filter_x == filter_y) for non-dual filter case assert((filter_sets[i] & 0xffff) == (filter_sets[i] >> 16)); - 
interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, - switchable_rate, skip_txfm_sb, skip_sse_sb, - dst_bufs, i, switchable_ctx, skip_pred, rate, - dist); + if (cpi->sf.adaptive_interp_filter_search && + (cpi->sf.interp_filter_search_mask & (1 << (i >> 2)))) { + continue; + } + interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col, + orig_dst, rd, switchable_rate, skip_txfm_sb, + skip_sse_sb, dst_bufs, i, switchable_ctx, + skip_pred, rate, dist); skip_pred = (skip_hor & skip_ver); } } else { @@ -8333,10 +8457,14 @@ static INLINE void find_best_non_dual_interp_filter( i += (SWITCHABLE_FILTERS + 1)) { // This assert tells that (filter_x == filter_y) for non-dual filter case assert((filter_sets[i] & 0xffff) == (filter_sets[i] >> 16)); - interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, - switchable_rate, skip_txfm_sb, skip_sse_sb, - dst_bufs, i, switchable_ctx, skip_pred, rate, - dist); + if (cpi->sf.adaptive_interp_filter_search && + (cpi->sf.interp_filter_search_mask & (1 << (i >> 2)))) { + continue; + } + interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col, + orig_dst, rd, switchable_rate, skip_txfm_sb, + skip_sse_sb, dst_bufs, i, switchable_ctx, + skip_pred, rate, dist); // In first iteration, smooth filter is evaluated. 
If smooth filter // (which is less sharper) is the winner among regular and smooth filters, // sharp filter evaluation is skipped @@ -8344,8 +8472,6 @@ static INLINE void find_best_non_dual_interp_filter( // accounting switchable filter rate) if (cpi->sf.skip_sharp_interp_filter_search && skip_pred != cpi->default_interp_skip_flags) { - MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = xd->mi[0]; if (mbmi->interp_filters == filter_sets[(SWITCHABLE_FILTERS + 1)]) break; } @@ -8366,6 +8492,52 @@ static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st, return 1; } +// Checks if characteristics of search match +static INLINE int is_comp_rd_match(const AV1_COMP *const cpi, + const MACROBLOCK *const x, + const COMP_RD_STATS *st, + const MB_MODE_INFO *const mi, + int32_t *comp_rate, int64_t *comp_dist, + int64_t *comp_model_rd) { + // TODO(ranjit): Ensure that compound type search use regular filter always + // and check if following check can be removed + // Check if interp filter matches with previous case + if (st->filter != mi->interp_filters) return 0; + + const MACROBLOCKD *const xd = &x->e_mbd; + // Match MV and reference indices + for (int i = 0; i < 2; ++i) { + if ((st->ref_frames[i] != mi->ref_frame[i]) || + (st->mv[i].as_int != mi->mv[i].as_int)) { + return 0; + } + const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]]; + if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0; + } + + // Store the stats for compound average + comp_rate[COMPOUND_AVERAGE] = st->rate[COMPOUND_AVERAGE]; + comp_dist[COMPOUND_AVERAGE] = st->dist[COMPOUND_AVERAGE]; + comp_model_rd[COMPOUND_AVERAGE] = st->comp_model_rd[COMPOUND_AVERAGE]; + comp_rate[COMPOUND_DISTWTD] = st->rate[COMPOUND_DISTWTD]; + comp_dist[COMPOUND_DISTWTD] = st->dist[COMPOUND_DISTWTD]; + comp_model_rd[COMPOUND_DISTWTD] = st->comp_model_rd[COMPOUND_DISTWTD]; + + // For compound wedge/segment, reuse data only if NEWMV is not present in + // either 
of the directions + if ((!have_newmv_in_inter_mode(mi->mode) && + !have_newmv_in_inter_mode(st->mode)) || + (cpi->sf.disable_interinter_wedge_newmv_search)) { + memcpy(&comp_rate[COMPOUND_WEDGE], &st->rate[COMPOUND_WEDGE], + sizeof(comp_rate[COMPOUND_WEDGE]) * 2); + memcpy(&comp_dist[COMPOUND_WEDGE], &st->dist[COMPOUND_WEDGE], + sizeof(comp_dist[COMPOUND_WEDGE]) * 2); + memcpy(&comp_model_rd[COMPOUND_WEDGE], &st->comp_model_rd[COMPOUND_WEDGE], + sizeof(comp_model_rd[COMPOUND_WEDGE]) * 2); + } + return 1; +} + static INLINE int find_interp_filter_in_stats(MACROBLOCK *x, MB_MODE_INFO *const mbmi) { const int comp_idx = mbmi->compound_idx; @@ -8379,9 +8551,27 @@ static INLINE int find_interp_filter_in_stats(MACROBLOCK *x, } return -1; // no match result found } +// Checks if similar compound type search case is accounted earlier +// If found, returns relevant rd data +static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi, + const MACROBLOCK *x, + const MB_MODE_INFO *const mbmi, + int32_t *comp_rate, int64_t *comp_dist, + int64_t *comp_model_rd) { + for (int j = 0; j < x->comp_rd_stats_idx; ++j) { + if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate, + comp_dist, comp_model_rd)) { + return 1; + } + } + return 0; // no match result found +} static INLINE void save_interp_filter_search_stat(MACROBLOCK *x, - MB_MODE_INFO *const mbmi) { + MB_MODE_INFO *const mbmi, + int64_t rd, int skip_txfm_sb, + int64_t skip_sse_sb, + unsigned int pred_sse) { const int comp_idx = mbmi->compound_idx; const int offset = x->interp_filter_stats_idx[comp_idx]; if (offset < MAX_INTERP_FILTER_STATS) { @@ -8389,19 +8579,52 @@ static INLINE void save_interp_filter_search_stat(MACROBLOCK *x, { mbmi->mv[0], mbmi->mv[1] }, { mbmi->ref_frame[0], mbmi->ref_frame[1] }, - mbmi->interinter_comp.type }; + mbmi->interinter_comp.type, + rd, + skip_txfm_sb, + skip_sse_sb, + pred_sse }; x->interp_filter_stats[comp_idx][offset] = stat; x->interp_filter_stats_idx[comp_idx]++; } } 
+static INLINE void save_comp_rd_search_stat(MACROBLOCK *x, + const MB_MODE_INFO *const mbmi, + const int32_t *comp_rate, + const int64_t *comp_dist, + const int64_t *comp_model_rd, + const int_mv *cur_mv) { + const int offset = x->comp_rd_stats_idx; + if (offset < MAX_COMP_RD_STATS) { + COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset; + memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate)); + memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist)); + memcpy(rd_stats->comp_model_rd, comp_model_rd, + sizeof(rd_stats->comp_model_rd)); + memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv)); + memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames)); + rd_stats->mode = mbmi->mode; + rd_stats->filter = mbmi->interp_filters; + rd_stats->ref_mv_idx = mbmi->ref_mv_idx; + const MACROBLOCKD *const xd = &x->e_mbd; + for (int i = 0; i < 2; ++i) { + const WarpedMotionParams *const wm = + &xd->global_motion[mbmi->ref_frame[i]]; + rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype); + } + ++x->comp_rd_stats_idx; + } +} + static int64_t interpolation_filter_search( - MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize, - int mi_row, int mi_col, const BUFFER_SET *const tmp_dst, - BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES], - int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb, - int64_t *const skip_sse_sb, const int skip_build_pred, - HandleInterModeArgs *args, int64_t ref_best_rd) { + MACROBLOCK *const x, const AV1_COMP *const cpi, + const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col, + const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst, + InterpFilter (*const single_filter)[REF_FRAMES], int64_t *const rd, + int *const switchable_rate, int *const skip_txfm_sb, + int64_t *const skip_sse_sb, int *skip_build_pred, HandleInterModeArgs *args, + int64_t ref_best_rd) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); 
MACROBLOCKD *const xd = &x->e_mbd; @@ -8418,12 +8641,23 @@ static int64_t interpolation_filter_search( const int ref_frame = xd->mi[0]->ref_frame[0]; (void)single_filter; - int match_found = -1; + int match_found_idx = -1; const InterpFilter assign_filter = cm->interp_filter; if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) { - match_found = find_interp_filter_in_stats(x, mbmi); + match_found_idx = find_interp_filter_in_stats(x, mbmi); + } + if (match_found_idx != -1) { + const int comp_idx = mbmi->compound_idx; + *rd = x->interp_filter_stats[comp_idx][match_found_idx].rd; + *skip_txfm_sb = + x->interp_filter_stats[comp_idx][match_found_idx].skip_txfm_sb; + *skip_sse_sb = + x->interp_filter_stats[comp_idx][match_found_idx].skip_sse_sb; + x->pred_sse[ref_frame] = + x->interp_filter_stats[comp_idx][match_found_idx].pred_sse; + return 0; } - if (!need_search || match_found == -1) { + if (!need_search || match_found_idx == -1) { set_default_interp_filters(mbmi, assign_filter); } int switchable_ctx[2]; @@ -8431,13 +8665,16 @@ static int64_t interpolation_filter_search( switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1); *switchable_rate = get_switchable_rate(x, mbmi->interp_filters, switchable_ctx); - if (!skip_build_pred) - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + if (!(*skip_build_pred)) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 0, + av1_num_planes(cm) - 1); + *skip_build_pred = 1; + } #if CONFIG_COLLECT_RD_STATS == 3 RD_STATS rd_stats_y; - select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX); - PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize); + pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX); + PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize); #endif // CONFIG_COLLECT_RD_STATS == 3 model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER]( cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0], @@ 
-8458,7 +8695,7 @@ static int64_t interpolation_filter_search( *skip_sse_sb = best_skip_sse_sb[1]; x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4); - if (assign_filter != SWITCHABLE || match_found != -1) { + if (assign_filter != SWITCHABLE || match_found_idx != -1) { return 0; } if (!need_search) { @@ -8493,9 +8730,8 @@ static int64_t interpolation_filter_search( const int is_compound = has_second_ref(mbmi); assert(is_intrabc_block(mbmi) == 0); for (int j = 0; j < 1 + is_compound; ++j) { - const RefBuffer *ref_buf = - &cm->current_frame.frame_refs[mbmi->ref_frame[j] - LAST_FRAME]; - const struct scale_factors *const sf = &ref_buf->sf; + const struct scale_factors *const sf = + get_ref_scale_factors_const(cm, mbmi->ref_frame[j]); // TODO(any): Refine skip flag calculation considering scaling if (av1_is_scaled(sf)) { skip_hor = 0; @@ -8543,38 +8779,72 @@ static int64_t interpolation_filter_search( int best_dual_mode = 0; // Find best of {R}x{R,Sm,Sh} const int bw = block_size_wide[bsize]; - int skip_pred = bw <= 4 ? cpi->default_interp_skip_flags : skip_hor; - for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) { - if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, - switchable_rate, best_skip_txfm_sb, - best_skip_sse_sb, dst_bufs, i, switchable_ctx, - skip_pred, tmp_rate, tmp_dist)) { - best_dual_mode = i; - } - skip_pred = skip_hor; - } - // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes const int bh = block_size_high[bsize]; - skip_pred = bh <= 4 ? 
cpi->default_interp_skip_flags : skip_ver; - assert(filter_set_size == DUAL_FILTER_SET_SIZE); - for (i = (best_dual_mode + (SWITCHABLE_FILTERS * 2)); - i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) { - interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, - switchable_rate, best_skip_txfm_sb, - best_skip_sse_sb, dst_bufs, i, switchable_ctx, - skip_pred, tmp_rate, tmp_dist); - skip_pred = skip_ver; + int skip_pred; + int bsl, pred_filter_search; + InterpFilters af_horiz = SWITCHABLE, af_vert = SWITCHABLE, + lf_horiz = SWITCHABLE, lf_vert = SWITCHABLE, filter_idx = 0; + const MB_MODE_INFO *const above_mbmi = xd->above_mbmi; + const MB_MODE_INFO *const left_mbmi = xd->left_mbmi; + bsl = mi_size_wide_log2[bsize]; + pred_filter_search = + cpi->sf.cb_pred_filter_search + ? (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_frame.frame_number)) & + 0x1 + : 0; + if (above_mbmi && is_inter_block(above_mbmi)) { + af_horiz = av1_extract_interp_filter(above_mbmi->interp_filters, 1); + af_vert = av1_extract_interp_filter(above_mbmi->interp_filters, 0); + } + if (left_mbmi && is_inter_block(left_mbmi)) { + lf_horiz = av1_extract_interp_filter(left_mbmi->interp_filters, 1); + lf_vert = av1_extract_interp_filter(left_mbmi->interp_filters, 0); + } + pred_filter_search &= !have_newmv_in_inter_mode(mbmi->mode); + pred_filter_search &= + ((af_horiz == lf_horiz) && (af_horiz != SWITCHABLE)) || + ((af_vert == lf_vert) && (af_vert != SWITCHABLE)); + if (pred_filter_search) { + pred_dual_interp_filter_rd( + x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, + filter_idx, switchable_ctx, (skip_hor & skip_ver), tmp_rate, tmp_dist, + af_horiz, af_vert, lf_horiz, lf_vert); + } else { + skip_pred = bw <= 4 ? 
cpi->default_interp_skip_flags : skip_hor; + for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) { + if (interpolation_filter_rd( + x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, + i, switchable_ctx, skip_pred, tmp_rate, tmp_dist)) { + best_dual_mode = i; + } + skip_pred = skip_hor; + } + // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes + skip_pred = bh <= 4 ? cpi->default_interp_skip_flags : skip_ver; + assert(filter_set_size == DUAL_FILTER_SET_SIZE); + for (i = (best_dual_mode + (SWITCHABLE_FILTERS * 2)); + i >= (best_dual_mode + SWITCHABLE_FILTERS); + i -= SWITCHABLE_FILTERS) { + interpolation_filter_rd( + x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd, + switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, i, + switchable_ctx, skip_pred, tmp_rate, tmp_dist); + skip_pred = skip_ver; + } } } else if (cm->seq_params.enable_dual_filter == 0) { find_best_non_dual_interp_filter( - x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate, + x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver, skip_hor, tmp_rate, tmp_dist, filter_set_size); } else { // EIGHTTAP_REGULAR mode is calculated beforehand for (i = 1; i < filter_set_size; ++i) { - interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd, - switchable_rate, best_skip_txfm_sb, + interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col, + orig_dst, rd, switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, i, switchable_ctx, (skip_hor & skip_ver), tmp_rate, tmp_dist); } @@ -8586,7 +8856,8 @@ static int64_t interpolation_filter_search( // in either of the directions Condition below is necessary, but not // sufficient assert((skip_hor == 1) || (skip_ver == 1)); - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + av1_enc_build_inter_predictor(cm, xd, mi_row, 
mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); } *skip_txfm_sb = best_skip_txfm_sb[1]; *skip_sse_sb = best_skip_sse_sb[1]; @@ -8594,174 +8865,145 @@ static int64_t interpolation_filter_search( // save search results if (cpi->sf.skip_repeat_interpolation_filter_search) { - assert(match_found == -1); - save_interp_filter_search_stat(x, mbmi); + assert(match_found_idx == -1); + save_interp_filter_search_stat(x, mbmi, *rd, *skip_txfm_sb, *skip_sse_sb, + x->pred_sse[ref_frame]); } return 0; } -static int txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col, RD_STATS *rd_stats, - RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, - int mode_rate, int64_t ref_best_rd) { +static int txfm_search(const AV1_COMP *cpi, const TileDataEnc *tile_data, + MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int mode_rate, + int64_t ref_best_rd) { /* * This function combines y and uv planes' transform search processes - * together, when the prediction is generated. It first does subtration to + * together, when the prediction is generated. It first does subtraction to * obtain the prediction error. Then it calls - * select_tx_type_yrd/super_block_yrd and inter_block_uvrd sequentially and - * handles the early terminations happen in those functions. At the end, it + * pick_tx_size_type_yrd/super_block_yrd and super_block_uvrd sequentially and + * handles the early terminations happening in those functions. At the end, it * computes the rd_stats/_y/_uv accordingly. */ const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; - int skip_txfm_sb = 0; - const int num_planes = av1_num_planes(cm); const int ref_frame_1 = mbmi->ref_frame[1]; const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0); const int64_t rd_thresh = ref_best_rd == INT64_MAX ? 
INT64_MAX : ref_best_rd - mode_rd; const int skip_ctx = av1_get_skip_context(xd); + const int skip_flag_cost[2] = { x->skip_cost[skip_ctx][0], + x->skip_cost[skip_ctx][1] }; const int64_t min_header_rate = - mode_rate + AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]); + mode_rate + AOMMIN(skip_flag_cost[0], skip_flag_cost[1]); // Account for minimum skip and non_skip rd. // Eventually either one of them will be added to mode_rate const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0); + (void)tile_data; if (min_header_rd_possible > ref_best_rd) { av1_invalid_rd_stats(rd_stats_y); - av1_invalid_rd_stats(rd_stats); return 0; } av1_init_rd_stats(rd_stats); av1_init_rd_stats(rd_stats_y); - av1_init_rd_stats(rd_stats_uv); rd_stats->rate = mode_rate; - if (!cpi->common.all_lossless) - check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb); - if (!skip_txfm_sb) { - int64_t non_skip_rdcosty = INT64_MAX; - int64_t skip_rdcosty = INT64_MAX; - int64_t min_rdcosty = INT64_MAX; - int is_cost_valid_uv = 0; - - // cost and distortion - av1_subtract_plane(x, bsize, 0); - if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - // Motion mode - select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh); + // cost and distortion + av1_subtract_plane(x, bsize, 0); + if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { + pick_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh); #if CONFIG_COLLECT_RD_STATS == 2 - PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize); + PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize); #endif // CONFIG_COLLECT_RD_STATS == 2 - } else { - super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); - memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) - set_blk_skip(x, 0, i, rd_stats_y->skip); - } - - if (rd_stats_y->rate == INT_MAX) { - 
av1_invalid_rd_stats(rd_stats); - // TODO(angiebird): check if we need this - // restore_dst_buf(xd, *orig_dst, num_planes); - mbmi->ref_frame[1] = ref_frame_1; - return 0; - } + } else { + super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh); + memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); + for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) + set_blk_skip(x, 0, i, rd_stats_y->skip); + } - av1_merge_rd_stats(rd_stats, rd_stats_y); + if (rd_stats_y->rate == INT_MAX) { + // TODO(angiebird): check if we need this + // restore_dst_buf(xd, *orig_dst, num_planes); + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } - non_skip_rdcosty = RDCOST( - x->rdmult, rd_stats->rate + x->skip_cost[skip_ctx][0], rd_stats->dist); - skip_rdcosty = - RDCOST(x->rdmult, mode_rate + x->skip_cost[skip_ctx][1], rd_stats->sse); - min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty); + av1_merge_rd_stats(rd_stats, rd_stats_y); + + const int64_t non_skip_rdcosty = + RDCOST(x->rdmult, rd_stats->rate + skip_flag_cost[0], rd_stats->dist); + const int64_t skip_rdcosty = + RDCOST(x->rdmult, mode_rate + skip_flag_cost[1], rd_stats->sse); + const int64_t min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty); + if (min_rdcosty > ref_best_rd) { + const int64_t tokenonly_rdy = + AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist), + RDCOST(x->rdmult, 0, rd_stats_y->sse)); + // Invalidate rd_stats_y to skip the rest of the motion modes search + if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.prune_motion_mode_level) > + rd_thresh) + av1_invalid_rd_stats(rd_stats_y); + mbmi->ref_frame[1] = ref_frame_1; + return 0; + } - if (min_rdcosty > ref_best_rd) { - int64_t tokenonly_rdy = - AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist), - RDCOST(x->rdmult, 0, rd_stats_y->sse)); - // Invalidate rd_stats_y to skip the rest of the motion modes search - if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.adaptive_txb_search_level) > - rd_thresh) - av1_invalid_rd_stats(rd_stats_y); + 
av1_init_rd_stats(rd_stats_uv); + const int num_planes = av1_num_planes(cm); + if (num_planes > 1) { + int64_t ref_best_chroma_rd = ref_best_rd; + // Calculate best rd cost possible for chroma + if (cpi->sf.perform_best_rd_based_gating_for_chroma && + (ref_best_chroma_rd != INT64_MAX)) { + ref_best_chroma_rd = + (ref_best_chroma_rd - AOMMIN(non_skip_rdcosty, skip_rdcosty)); + } + const int is_cost_valid_uv = + super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd); + if (!is_cost_valid_uv) { mbmi->ref_frame[1] = ref_frame_1; return 0; } + av1_merge_rd_stats(rd_stats, rd_stats_uv); + } - if (num_planes > 1) { - /* clang-format off */ - is_cost_valid_uv = - inter_block_uvrd(cpi, x, rd_stats_uv, bsize, - ref_best_rd - non_skip_rdcosty, - ref_best_rd - skip_rdcosty, FTXS_NONE); - if (!is_cost_valid_uv) { - mbmi->ref_frame[1] = ref_frame_1; - return 0; - } - /* clang-format on */ - av1_merge_rd_stats(rd_stats, rd_stats_uv); - } else { - av1_init_rd_stats(rd_stats_uv); - } - if (rd_stats->skip) { - rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; - rd_stats_y->rate = 0; - rd_stats_uv->rate = 0; - rd_stats->rate += x->skip_cost[skip_ctx][1]; - mbmi->skip = 0; - // here mbmi->skip temporarily plays a role as what this_skip2 does - - int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (tmprd > ref_best_rd) { - mbmi->ref_frame[1] = ref_frame_1; - return 0; - } -#if CONFIG_ONE_PASS_SVM - av1_reg_stat_skipmode_update(rd_stats_y, x->rdmult); -#endif - } else if (!xd->lossless[mbmi->segment_id] && - (RDCOST(x->rdmult, - rd_stats_y->rate + rd_stats_uv->rate + - x->skip_cost[skip_ctx][0], - rd_stats->dist) >= - RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], rd_stats->sse))) { - rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; - rd_stats->rate += x->skip_cost[skip_ctx][1]; - rd_stats->dist = rd_stats->sse; - rd_stats_y->rate = 0; - rd_stats_uv->rate = 0; - mbmi->skip = 1; -#if CONFIG_ONE_PASS_SVM - av1_reg_stat_skipmode_update(rd_stats_y, 
x->rdmult); -#endif - } else { - rd_stats->rate += x->skip_cost[skip_ctx][0]; - mbmi->skip = 0; - } - } else { - x->skip = 1; - mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode); - // The cost of skip bit needs to be added. - mbmi->skip = 0; - rd_stats->rate += x->skip_cost[skip_ctx][1]; - - rd_stats->dist = 0; - rd_stats->sse = 0; + if (rd_stats->skip) { + rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; rd_stats_y->rate = 0; rd_stats_uv->rate = 0; - rd_stats->skip = 1; - int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + rd_stats->dist = rd_stats->sse; + rd_stats_y->dist = rd_stats_y->sse; + rd_stats_uv->dist = rd_stats_uv->sse; + rd_stats->rate += skip_flag_cost[1]; + mbmi->skip = 1; + // here mbmi->skip temporarily plays a role as what this_skip2 does + + const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); if (tmprd > ref_best_rd) { mbmi->ref_frame[1] = ref_frame_1; return 0; } -#if CONFIG_ONE_PASS_SVM - av1_add_reg_stat(rd_stats, 0, 0, 0, 0, 0, bsize, bsize); - av1_reg_stat_skipmode_update(rd_stats, x->rdmult); -#endif + } else if (!xd->lossless[mbmi->segment_id] && + (RDCOST(x->rdmult, + rd_stats_y->rate + rd_stats_uv->rate + skip_flag_cost[0], + rd_stats->dist) >= + RDCOST(x->rdmult, skip_flag_cost[1], rd_stats->sse))) { + rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate; + rd_stats->rate += skip_flag_cost[1]; + rd_stats->dist = rd_stats->sse; + rd_stats_y->dist = rd_stats_y->sse; + rd_stats_uv->dist = rd_stats_uv->sse; + rd_stats_y->rate = 0; + rd_stats_uv->rate = 0; + mbmi->skip = 1; + } else { + rd_stats->rate += skip_flag_cost[0]; + mbmi->skip = 0; } + return 1; } @@ -8773,18 +9015,30 @@ static INLINE bool enable_wedge_search(MACROBLOCK *const x, x->edge_strength > cpi->sf.disable_wedge_search_edge_thresh; } +static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x, + const AV1_COMP *const cpi) { + return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interinter_wedge; +} + +static 
INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x, + const AV1_COMP *const cpi) { + return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interintra_wedge && + !cpi->sf.disable_wedge_interintra_search; +} + static int handle_inter_intra_mode(const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, int mi_col, MB_MODE_INFO *mbmi, HandleInterModeArgs *args, int64_t ref_best_rd, int *rate_mv, - int *tmp_rate2, BUFFER_SET *orig_dst) { + int *tmp_rate2, const BUFFER_SET *orig_dst) { const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &x->e_mbd; INTERINTRA_MODE best_interintra_mode = II_DC_PRED; - int64_t rd, best_interintra_rd = INT64_MAX; + int64_t rd = INT64_MAX; + int64_t best_interintra_rd = INT64_MAX; int rmode, rate_sum; int64_t dist_sum; int tmp_rate_mv = 0; @@ -8803,60 +9057,118 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi, mbmi->ref_frame[1] = NONE_FRAME; xd->plane[0].dst.buf = tmp_buf; xd->plane[0].dst.stride = bw; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); restore_dst_buf(xd, *orig_dst, num_planes); mbmi->ref_frame[1] = INTRA_FRAME; - mbmi->use_wedge_interintra = 0; best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]]; - int j = 0; - if (cpi->sf.reuse_inter_intra_mode == 0 || - best_interintra_mode == INTERINTRA_MODES) { - for (j = 0; j < INTERINTRA_MODES; ++j) { - mbmi->interintra_mode = (INTERINTRA_MODE)j; - rmode = interintra_mode_cost[mbmi->interintra_mode]; + + if (cpi->oxcf.enable_smooth_interintra && + !cpi->sf.disable_smooth_interintra) { + mbmi->use_wedge_interintra = 0; + int j = 0; + if (cpi->sf.reuse_inter_intra_mode == 0 || + best_interintra_mode == INTERINTRA_MODES) { + for (j = 0; j < INTERINTRA_MODES; ++j) { + if ((!cpi->oxcf.enable_smooth_intra || cpi->sf.disable_smooth_intra) && + (INTERINTRA_MODE)j 
== II_SMOOTH_PRED) + continue; + mbmi->interintra_mode = (INTERINTRA_MODE)j; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_sb_fn[MODELRD_TYPE_INTERINTRA]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); + if (rd < best_interintra_rd) { + best_interintra_rd = rd; + best_interintra_mode = mbmi->interintra_mode; + } + } + args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode; + } + assert(IMPLIES(!cpi->oxcf.enable_smooth_interintra || + cpi->sf.disable_smooth_interintra, + best_interintra_mode != II_SMOOTH_PRED)); + rmode = interintra_mode_cost[best_interintra_mode]; + if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) { + mbmi->interintra_mode = best_interintra_mode; av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, intrapred, bw); av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - model_rd_sb_fn[MODELRD_TYPE_INTERINTRA]( - cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); - rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); - if (rd < best_interintra_rd) { - best_interintra_rd = rd; - best_interintra_mode = mbmi->interintra_mode; - } - } - args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode; - } - if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) { - mbmi->interintra_mode = best_interintra_mode; - rmode = interintra_mode_cost[mbmi->interintra_mode]; - av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, - intrapred, bw); - av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); - } - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, 
&tmp_skip_sse_sb, INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, *rate_mv + rmode + rate_sum + rwedge, dist_sum); - best_interintra_rd = rd; - if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) { - return -1; + } + + RD_STATS rd_stats; + rd = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &rd_stats); + if (rd != INT64_MAX) { + rd = RDCOST(x->rdmult, *rate_mv + rmode + rd_stats.rate + rwedge, + rd_stats.dist); + } + best_interintra_rd = rd; + if (ref_best_rd < INT64_MAX && + ((best_interintra_rd >> 4) * 9) > ref_best_rd) { + return -1; + } } if (is_wedge_used) { int64_t best_interintra_rd_nowedge = rd; int64_t best_interintra_rd_wedge = INT64_MAX; int_mv tmp_mv; - if (enable_wedge_search(x, cpi)) { + if (enable_wedge_interintra_search(x, cpi)) { mbmi->use_wedge_interintra = 1; rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) + x->wedge_interintra_cost[bsize][1]; - best_interintra_rd_wedge = - pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + if (!cpi->oxcf.enable_smooth_interintra || + cpi->sf.disable_smooth_interintra) { + if (best_interintra_mode == INTERINTRA_MODES) { + mbmi->interintra_mode = II_SMOOTH_PRED; + best_interintra_mode = II_SMOOTH_PRED; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + + int j = 0; + for (j = 0; j < INTERINTRA_MODES; ++j) { + mbmi->interintra_mode = (INTERINTRA_MODE)j; + rmode = interintra_mode_cost[mbmi->interintra_mode]; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, + orig_dst, intrapred, bw); + av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); + model_rd_sb_fn[MODELRD_TYPE_INTERINTRA]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum); + if (rd < best_interintra_rd) { + 
best_interintra_rd_wedge = rd; + best_interintra_mode = mbmi->interintra_mode; + } + } + args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode; + mbmi->interintra_mode = best_interintra_mode; + + if (best_interintra_mode != II_SMOOTH_PRED) { + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, + orig_dst, intrapred, bw); + } + } else { + mbmi->interintra_mode = best_interintra_mode; + av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst, + intrapred, bw); + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + } + } else { + best_interintra_rd_wedge = + pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_); + } + rmode = interintra_mode_cost[mbmi->interintra_mode]; best_interintra_rd_wedge += RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0); rd = INT64_MAX; @@ -8871,8 +9183,8 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi, 0); if (mbmi->mv[0].as_int != tmp_mv.as_int) { mbmi->mv[0].as_int = tmp_mv.as_int; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, - bsize); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); @@ -8886,12 +9198,17 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi, av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw); } // Evaluate closer to true rd - rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); - if (rd != INT64_MAX) - rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum, - dist_sum); + RD_STATS rd_stats; + rd = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &rd_stats); + if (rd != INT64_MAX) { + rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rd_stats.rate, + rd_stats.dist); + } best_interintra_rd_wedge = rd; + if 
((!cpi->oxcf.enable_smooth_interintra || + cpi->sf.disable_smooth_interintra) && + best_interintra_rd_wedge == INT64_MAX) + return -1; if (best_interintra_rd_wedge < best_interintra_rd_nowedge) { mbmi->use_wedge_interintra = 1; mbmi->mv[0].as_int = tmp_mv.as_int; @@ -8900,33 +9217,133 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi, } else { mbmi->use_wedge_interintra = 0; mbmi->mv[0].as_int = mv0.as_int; - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); } } else { + if (!cpi->oxcf.enable_smooth_interintra || + cpi->sf.disable_smooth_interintra) + return -1; mbmi->use_wedge_interintra = 0; } - } // if (is_interintra_wedge_used(bsize)) + } else { + if (best_interintra_rd == INT64_MAX) return -1; + } if (num_planes > 1) { - av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_U, num_planes - 1); + } + return 0; +} + +// If number of valid neighbours is 1, +// 1) ROTZOOM parameters can be obtained reliably (2 parameters from +// one neighbouring MV) +// 2) For IDENTITY/TRANSLATION cases, warp can perform better due to +// a different interpolation filter being used. 
However the quality +// gains (due to the same) may not be much +// For above 2 cases warp evaluation is skipped + +static int check_if_optimal_warp(const AV1_COMP *cpi, + WarpedMotionParams *wm_params, + int num_proj_ref) { + int is_valid_warp = 1; + if (cpi->sf.prune_warp_using_wmtype) { + TransformationType wmtype = get_wmtype(wm_params); + if (num_proj_ref == 1) { + if (wmtype != ROTZOOM) is_valid_warp = 0; + } else { + if (wmtype < ROTZOOM) is_valid_warp = 0; + } + } + return is_valid_warp; +} + +struct obmc_check_mv_field_ctxt { + MB_MODE_INFO *current_mi; + int mv_field_check_result; +}; + +static INLINE void obmc_check_identical_mv(MACROBLOCKD *xd, int rel_mi_col, + uint8_t nb_mi_width, + MB_MODE_INFO *nb_mi, void *fun_ctxt, + const int num_planes) { + (void)xd; + (void)rel_mi_col; + (void)nb_mi_width; + (void)num_planes; + struct obmc_check_mv_field_ctxt *ctxt = + (struct obmc_check_mv_field_ctxt *)fun_ctxt; + const MB_MODE_INFO *current_mi = ctxt->current_mi; + + if (ctxt->mv_field_check_result == 0) return; + + if (nb_mi->ref_frame[0] != current_mi->ref_frame[0] || + nb_mi->mv[0].as_int != current_mi->mv[0].as_int || + nb_mi->interp_filters != current_mi->interp_filters) { + ctxt->mv_field_check_result = 0; + } +} + +// Check if the neighbors' motions used by obmc have same parameters as for +// the current block. If all the parameters are identical, obmc will produce +// the same prediction as from regular bmc, therefore we can skip the +// overlapping operations for less complexity. The parameters checked include +// reference frame, motion vector, and interpolation filter. 
+int check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col) { + const BLOCK_SIZE bsize = xd->mi[0]->sb_type; + struct obmc_check_mv_field_ctxt mv_field_check_ctxt = { xd->mi[0], 1 }; + + foreach_overlappable_nb_above(cm, xd, mi_col, + max_neighbor_obmc[mi_size_wide_log2[bsize]], + obmc_check_identical_mv, &mv_field_check_ctxt); + foreach_overlappable_nb_left(cm, xd, mi_row, + max_neighbor_obmc[mi_size_high_log2[bsize]], + obmc_check_identical_mv, &mv_field_check_ctxt); + + return mv_field_check_ctxt.mv_field_check_result; +} + +static int skip_interintra_based_on_first_pass_stats(const AV1_COMP *const cpi, + MACROBLOCK *const x, + BLOCK_SIZE bsize, + int mi_row, int mi_col) { + MACROBLOCKD *xd = &x->e_mbd; + MB_MODE_INFO *mbmi = xd->mi[0]; + if (cpi->two_pass_partition_search && + cpi->sf.use_first_partition_pass_interintra_stats && + !x->cb_partition_scan) { + const int mi_width = mi_size_wide[bsize]; + const int mi_height = mi_size_high[bsize]; + // Search in the stats table to see if obmc motion mode was used in the + // first pass of partition search. 
+ for (int row = mi_row; row < mi_row + mi_width; + row += FIRST_PARTITION_PASS_SAMPLE_REGION) { + for (int col = mi_col; col < mi_col + mi_height; + col += FIRST_PARTITION_PASS_SAMPLE_REGION) { + const int index = av1_first_partition_pass_stats_index(row, col); + const FIRST_PARTITION_PASS_STATS *const stats = + &x->first_partition_pass_stats[index]; + if (stats->interintra_motion_mode_count[mbmi->ref_frame[0]]) { + return 0; + } + } + } + return 1; } return 0; } // TODO(afergs): Refactor the MBMI references in here - there's four // TODO(afergs): Refactor optional args - add them to a struct or remove -static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, - BLOCK_SIZE bsize, RD_STATS *rd_stats, - RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, - int *disable_skip, int mi_row, int mi_col, - HandleInterModeArgs *const args, - int64_t ref_best_rd, const int *refs, - int *rate_mv, BUFFER_SET *orig_dst -#if CONFIG_COLLECT_INTER_MODE_RD_STATS - , - TileDataEnc *tile_data, int64_t *best_est_rd, - int do_tx_search, InterModesInfo *inter_modes_info -#endif -) { +static int64_t motion_mode_rd( + const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x, + BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col, + HandleInterModeArgs *const args, int64_t ref_best_rd, const int *refs, + int *rate_mv, const BUFFER_SET *orig_dst, int64_t *best_est_rd, + int do_tx_search, InterModesInfo *inter_modes_info) { const AV1_COMMON *const cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &x->e_mbd; @@ -8936,16 +9353,17 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, const int rate2_nocoeff = rd_stats->rate; int best_xskip = 0, best_disable_skip = 0; RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv; - MB_MODE_INFO base_mbmi, best_mbmi; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE]; const int rate_mv0 = 
*rate_mv; - - int interintra_allowed = cm->seq_params.enable_interintra_compound && - is_interintra_allowed(mbmi) && mbmi->compound_idx; + int skip_interintra_mode = 0; + const int interintra_allowed = cm->seq_params.enable_interintra_compound && + is_interintra_allowed(mbmi) && + mbmi->compound_idx; int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE]; assert(mbmi->ref_frame[1] != INTRA_FRAME); const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1]; + (void)tile_data; av1_invalid_rd_stats(&best_rd_stats); aom_clear_system_state(); mbmi->num_proj_ref = 1; // assume num_proj_ref >=1 @@ -8957,21 +9375,22 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, if (last_motion_mode_allowed == WARPED_CAUSAL) { mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0); } - int total_samples = mbmi->num_proj_ref; + const int total_samples = mbmi->num_proj_ref; if (total_samples == 0) { last_motion_mode_allowed = OBMC_CAUSAL; } - base_mbmi = *mbmi; - SimpleRDState *simple_states = &args->simple_rd_state[mbmi->ref_mv_idx]; + const MB_MODE_INFO base_mbmi = *mbmi; + MB_MODE_INFO best_mbmi; + SimpleRDState *const simple_states = &args->simple_rd_state[mbmi->ref_mv_idx]; const int switchable_rate = av1_is_interp_needed(xd) ? av1_get_switchable_rate(cm, x, xd) : 0; int64_t best_rd = INT64_MAX; int best_rate_mv = rate_mv0; - int identical_obmc_mv_field_detected = + const int identical_obmc_mv_field_detected = (cpi->sf.skip_obmc_in_uniform_mv_field || cpi->sf.skip_wm_in_uniform_mv_field) - ? av1_check_identical_obmc_mv_field(cm, xd, mi_row, mi_col) + ? 
check_identical_obmc_mv_field(cm, xd, mi_row, mi_col) : 0; for (int mode_index = (int)SIMPLE_TRANSLATION; mode_index <= (int)last_motion_mode_allowed + interintra_allowed; @@ -8980,10 +9399,8 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, if (cpi->sf.prune_single_motion_modes_by_simple_trans && args->single_ref_first_pass && mode_index) break; - int64_t tmp_rd = INT64_MAX; int tmp_rate2 = rate2_nocoeff; - int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; - int skip_txfm_sb = 0; + const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed; int tmp_rate_mv = rate_mv0; *mbmi = base_mbmi; @@ -8994,6 +9411,9 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, assert(mbmi->ref_frame[1] != INTRA_FRAME); } + if (cpi->oxcf.enable_obmc == 0 && mbmi->motion_mode == OBMC_CAUSAL) + continue; + if (identical_obmc_mv_field_detected) { if (cpi->sf.skip_obmc_in_uniform_mv_field && mbmi->motion_mode == OBMC_CAUSAL) @@ -9007,28 +9427,29 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, // SIMPLE_TRANSLATION mode: no need to recalculate. 
// The prediction is calculated before motion_mode_rd() is called in // handle_inter_mode() - if (cpi->sf.prune_single_motion_modes_by_simple_trans && - args->single_ref_first_pass == 0 && !is_comp_pred) { - if (simple_states->early_skipped) { - assert(simple_states->rd_stats.rdcost == INT64_MAX); - return INT64_MAX; - } - if (simple_states->rd_stats.rdcost != INT64_MAX) { - best_rd = simple_states->rd_stats.rdcost; - best_rd_stats = simple_states->rd_stats; - best_rd_stats_y = simple_states->rd_stats_y; - best_rd_stats_uv = simple_states->rd_stats_uv; - memcpy(best_blk_skip, simple_states->blk_skip, - sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); - best_xskip = simple_states->skip; - best_disable_skip = simple_states->disable_skip; - best_mbmi = *mbmi; + if (cpi->sf.prune_single_motion_modes_by_simple_trans && !is_comp_pred) { + if (args->single_ref_first_pass == 0) { + if (simple_states->early_skipped) { + assert(simple_states->rd_stats.rdcost == INT64_MAX); + return INT64_MAX; + } + if (simple_states->rd_stats.rdcost != INT64_MAX) { + best_rd = simple_states->rd_stats.rdcost; + best_rd_stats = simple_states->rd_stats; + best_rd_stats_y = simple_states->rd_stats_y; + best_rd_stats_uv = simple_states->rd_stats_uv; + memcpy(best_blk_skip, simple_states->blk_skip, + sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); + best_xskip = simple_states->skip; + best_disable_skip = simple_states->disable_skip; + best_mbmi = *mbmi; + } + continue; } - continue; + simple_states->early_skipped = 0; } - simple_states->early_skipped = 0; } else if (mbmi->motion_mode == OBMC_CAUSAL) { - uint32_t cur_mv = mbmi->mv[0].as_int; + const uint32_t cur_mv = mbmi->mv[0].as_int; assert(!is_comp_pred); if (have_newmv_in_inter_mode(this_mode)) { single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv); @@ -9041,7 +9462,8 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv; } if (mbmi->mv[0].as_int != 
cur_mv) { - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + 0, av1_num_planes(cm) - 1); } av1_build_obmc_inter_prediction( cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride, @@ -9069,7 +9491,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, if (have_newmv_in_inter_mode(this_mode)) { const int_mv mv0 = mbmi->mv[0]; const WarpedMotionParams wm_params0 = mbmi->wm_params; - int num_proj_ref0 = mbmi->num_proj_ref; + const int num_proj_ref0 = mbmi->num_proj_ref; + + if (cpi->sf.prune_warp_using_wmtype) { + TransformationType wmtype = get_wmtype(&mbmi->wm_params); + if (wmtype < ROTZOOM) continue; + } // Refine MV in a small range. av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0, @@ -9098,24 +9525,27 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, mbmi->wm_params = wm_params0; mbmi->num_proj_ref = num_proj_ref0; } + } else { + if (!check_if_optimal_warp(cpi, &mbmi->wm_params, mbmi->num_proj_ref)) + continue; } - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); } else { continue; } } else if (is_interintra_mode) { + skip_interintra_mode = skip_interintra_based_on_first_pass_stats( + cpi, x, bsize, mi_row, mi_col); + if (skip_interintra_mode) continue; const int ret = handle_inter_intra_mode( cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv, &tmp_rate2, orig_dst); if (ret < 0) continue; } - if (!cpi->common.all_lossless) - check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb); - x->skip = 0; - rd_stats->dist = 0; rd_stats->sse = 0; rd_stats->skip = 1; @@ -9146,85 +9576,93 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, } } - if (!skip_txfm_sb) { -#if CONFIG_COLLECT_INTER_MODE_RD_STATS - 
int64_t est_rd = 0; - int est_skip = 0; - if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 && - cm->tile_rows == 1) { - InterModeRdModel *md = &tile_data->inter_mode_rd_models[mbmi->sb_type]; - if (md->ready) { - const int64_t curr_sse = get_sse(cpi, x); - est_rd = get_est_rd(tile_data, mbmi->sb_type, x->rdmult, curr_sse, - rd_stats->rate); - est_skip = est_rd * 0.8 > *best_est_rd; - if (est_skip) { - mbmi->ref_frame[1] = ref_frame_1; - continue; - } else { - if (est_rd < *best_est_rd) { - *best_est_rd = est_rd; - } - } - } + if (cpi->sf.model_based_motion_mode_rd_breakout && do_tx_search) { + int model_rate; + int64_t model_dist; + model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD]( + cpi, mbmi->sb_type, x, xd, 0, num_planes - 1, mi_row, mi_col, + &model_rate, &model_dist, NULL, NULL, NULL, NULL, NULL); + const int64_t est_rd = + RDCOST(x->rdmult, rd_stats->rate + model_rate, model_dist); + if ((est_rd >> 3) * 6 > ref_best_rd) { + mbmi->ref_frame[1] = ref_frame_1; + continue; } -#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS } -#if CONFIG_COLLECT_INTER_MODE_RD_STATS if (!do_tx_search) { - const int64_t curr_sse = get_sse(cpi, x); + int64_t curr_sse = -1; int est_residue_cost = 0; int64_t est_dist = 0; - const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse, - &est_residue_cost, &est_dist); - (void)has_est_rd; - assert(has_est_rd); + int64_t est_rd = 0; + if (cpi->sf.inter_mode_rd_model_estimation == 1) { + curr_sse = get_sse(cpi, x); + const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse, + &est_residue_cost, &est_dist); + (void)has_est_rd; + assert(has_est_rd); + } else if (cpi->sf.inter_mode_rd_model_estimation == 2 || + cpi->sf.use_nonrd_pick_mode) { + model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD]( + cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, + &est_residue_cost, &est_dist, NULL, &curr_sse, NULL, NULL, NULL); + } + est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist); + if (est_rd * 0.8 > 
*best_est_rd) { + mbmi->ref_frame[1] = ref_frame_1; + continue; + } const int mode_rate = rd_stats->rate; rd_stats->rate += est_residue_cost; rd_stats->dist = est_dist; - rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + rd_stats->rdcost = est_rd; + *best_est_rd = AOMMIN(*best_est_rd, rd_stats->rdcost); if (cm->current_frame.reference_mode == SINGLE_REFERENCE) { if (!is_comp_pred) { + assert(curr_sse >= 0); inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, - rd_stats->rdcost, mbmi); + rd_stats->rdcost, false, NULL, rd_stats, + rd_stats_y, rd_stats_uv, mbmi); } } else { + assert(curr_sse >= 0); inter_modes_info_push(inter_modes_info, mode_rate, curr_sse, - rd_stats->rdcost, mbmi); + rd_stats->rdcost, false, NULL, rd_stats, + rd_stats_y, rd_stats_uv, mbmi); } } else { -#endif - if (!txfm_search(cpi, x, bsize, mi_row, mi_col, rd_stats, rd_stats_y, - rd_stats_uv, rd_stats->rate, ref_best_rd)) { + if (!txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, rd_stats, + rd_stats_y, rd_stats_uv, rd_stats->rate, ref_best_rd)) { if (rd_stats_y->rate == INT_MAX && mode_index == 0) { - simple_states->early_skipped = 1; + if (cpi->sf.prune_single_motion_modes_by_simple_trans && + !is_comp_pred) { + simple_states->early_skipped = 1; + } return INT64_MAX; } continue; } - if (!skip_txfm_sb) { - const int64_t curr_rd = - RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); - if (curr_rd < ref_best_rd) { - ref_best_rd = curr_rd; - } - *disable_skip = 0; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS - if (cpi->sf.inter_mode_rd_model_estimation) { - const int skip_ctx = av1_get_skip_context(xd); - inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse, - rd_stats->dist, - rd_stats_y->rate + rd_stats_uv->rate + - x->skip_cost[skip_ctx][mbmi->skip]); - } -#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS - } else { - *disable_skip = 1; + + const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + ref_best_rd = AOMMIN(ref_best_rd, curr_rd); + 
*disable_skip = 0; + if (cpi->sf.inter_mode_rd_model_estimation == 1) { + const int skip_ctx = av1_get_skip_context(xd); + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse, + rd_stats->dist, + rd_stats_y->rate + rd_stats_uv->rate + + x->skip_cost[skip_ctx][mbmi->skip]); + } + + // 2 means to both do the tx search and also update the inter_modes_info + // structure, since some modes will be conditionally TX searched. + if (do_tx_search == 2) { + rd_stats->rdcost = curr_rd; + inter_modes_info_push(inter_modes_info, rd_stats->rate, rd_stats->sse, + curr_rd, true, x->blk_skip, rd_stats, rd_stats_y, + rd_stats_uv, mbmi); } -#if CONFIG_COLLECT_INTER_MODE_RD_STATS } -#endif if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) { if (is_nontrans_global_motion(xd, xd->mi[0])) { @@ -9233,7 +9671,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, } } - tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); + const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist); if (mode_index == 0) { args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd; if (!is_comp_pred) { @@ -9247,7 +9685,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, simple_states->disable_skip = *disable_skip; } } - if ((mode_index == 0) || (tmp_rd < best_rd)) { + if (mode_index == 0 || tmp_rd < best_rd) { best_mbmi = *mbmi; best_rd = tmp_rd; best_rd_stats = *rd_stats; @@ -9283,11 +9721,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x, static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row, - int mi_col, BUFFER_SET *const orig_dst) { + int mi_col, const BUFFER_SET *const orig_dst) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize); + 
av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 0, + av1_num_planes(cm) - 1); int64_t total_sse = 0; for (int plane = 0; plane < num_planes; ++plane) { @@ -9299,44 +9738,8 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi, const int bh = block_size_high[plane_bsize]; av1_subtract_plane(x, bsize, plane); - int64_t sse; -#if CONFIG_ONE_PASS_SVM - if (plane == AOM_PLANE_Y && bsize >= BLOCK_8X8 && bw == bh) { - rd_stats->sse_0 = aom_sum_squares_2d_i16(p->src_diff, bw, bw / 2, bh / 2) - << 4; - rd_stats->sse_1 = - aom_sum_squares_2d_i16(p->src_diff + bw / 2, bw, bw / 2, bh / 2) << 4; - rd_stats->sse_2 = - aom_sum_squares_2d_i16(p->src_diff + bh / 2 * bw, bw, bw / 2, bh / 2) - << 4; - rd_stats->sse_3 = - aom_sum_squares_2d_i16(p->src_diff + bh / 2 * bw + bw / 2, bw, bw / 2, - bh / 2) - << 4; - - sse = - rd_stats->sse_0 + rd_stats->sse_1 + rd_stats->sse_2 + rd_stats->sse_3; - total_sse += sse; - - const int scaling_factor = MAX_MIB_SIZE * MAX_MIB_SIZE; - rd_stats->sse = sse; - rd_stats->sse_0 = rd_stats->sse_0 * scaling_factor; - rd_stats->sse_1 = rd_stats->sse_1 * scaling_factor; - rd_stats->sse_2 = rd_stats->sse_2 * scaling_factor; - rd_stats->sse_3 = rd_stats->sse_3 * scaling_factor; - rd_stats->y_sse = sse; - // TODO(chiyotsai@google.com): Don't manually set the flags - av1_reg_stat_skipmode_update(rd_stats, x->rdmult); - } else { - sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh); - sse = sse << 4; - total_sse += sse; - } -#else - sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh); - sse = sse << 4; + int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh) << 4; total_sse += sse; -#endif } const int skip_mode_ctx = av1_get_skip_mode_context(xd); rd_stats->dist = rd_stats->sse = total_sse; @@ -9456,25 +9859,20 @@ typedef struct { uint8_t *tmp_best_mask_buf; // backup of the best segmentation mask } CompoundTypeRdBuffers; -static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE 
bsize, int mi_col, int mi_row, - int_mv *cur_mv, int masked_compound_used, - BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, - CompoundTypeRdBuffers *buffers, int *rate_mv, - int64_t *rd, RD_STATS *rd_stats, - int64_t ref_best_rd) { +static int compound_type_rd( + const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_col, + int mi_row, int_mv *cur_mv, int mode_search_mask, int masked_compound_used, + const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst, + CompoundTypeRdBuffers *buffers, int *rate_mv, int64_t *rd, + RD_STATS *rd_stats, int64_t ref_best_rd, int *is_luma_interp_done) { const AV1_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = xd->mi[0]; const PREDICTION_MODE this_mode = mbmi->mode; const int bw = block_size_wide[bsize]; - int rate_sum, rs2; - int64_t dist_sum; - + int rs2; int_mv best_mv[2]; int best_tmp_rate_mv = *rate_mv; - int tmp_skip_txfm_sb; - int64_t tmp_skip_sse_sb; INTERINTER_COMPOUND_DATA best_compound_data; best_compound_data.type = COMPOUND_AVERAGE; uint8_t *preds0[1] = { buffers->pred0 }; @@ -9486,56 +9884,214 @@ static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, COMPOUND_TYPE cur_type; int best_compmode_interinter_cost = 0; int calc_pred_masked_compound = 1; + int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX }; + int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + int64_t comp_model_rd[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX, + INT64_MAX }; + const int match_found = + find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rd); best_mv[0].as_int = cur_mv[0].as_int; best_mv[1].as_int = cur_mv[1].as_int; *rd = INT64_MAX; + int rate_sum, tmp_skip_txfm_sb; + int64_t dist_sum, tmp_skip_sse_sb; + int64_t comp_best_model_rd = INT64_MAX; + // Special handling if both compound_average and compound_distwtd + // are to be searched. 
In this case, first estimate between the two + // modes and then call estimate_yrd_for_sb() only for the better of + // the two. + const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE)); + const int try_distwtd_comp = + ((mode_search_mask & (1 << COMPOUND_DISTWTD)) && + cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 && + cpi->sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED); + const int try_average_and_distwtd_comp = + try_average_comp && try_distwtd_comp && + comp_rate[COMPOUND_AVERAGE] == INT_MAX && + comp_rate[COMPOUND_DISTWTD] == INT_MAX; for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) { - if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break; + if (((1 << cur_type) & mode_search_mask) == 0) { + if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); + continue; + } if (!is_interinter_compound_used(cur_type, bsize)) continue; + if (cur_type >= COMPOUND_WEDGE && !masked_compound_used) break; + if (cur_type == COMPOUND_DISTWTD && !try_distwtd_comp) continue; + if (cur_type == COMPOUND_AVERAGE && try_average_and_distwtd_comp) continue; + + int64_t comp_model_rd_cur = INT64_MAX; tmp_rate_mv = *rate_mv; int64_t best_rd_cur = INT64_MAX; - mbmi->interinter_comp.type = cur_type; - int masked_type_cost = 0; - const int comp_group_idx_ctx = get_comp_group_idx_context(xd); const int comp_index_ctx = get_comp_index_context(cm, xd); - mbmi->compound_idx = 1; - if (cur_type == COMPOUND_AVERAGE) { + + if (cur_type == COMPOUND_DISTWTD && try_average_and_distwtd_comp) { + int est_rate[2]; + int64_t est_dist[2], est_rd[2]; + + int masked_type_cost[2] = { 0, 0 }; mbmi->comp_group_idx = 0; + + // First find the modeled rd cost for COMPOUND_AVERAGE + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->compound_idx = 1; if (masked_compound_used) { - masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0]; - } - masked_type_cost += x->comp_idx_cost[comp_index_ctx][1]; - rs2 = 
masked_type_cost; - const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); - if (mode_rd < ref_best_rd) { - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize); - int64_t est_rd = - estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum, - &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX); - if (est_rd != INT64_MAX) - best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); - } - // use spare buffer for following compound type try + masked_type_cost[COMPOUND_AVERAGE] += + x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx]; + } + masked_type_cost[COMPOUND_AVERAGE] += + x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx]; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + *is_luma_interp_done = 1; + model_rd_sb_fn[MODELRD_CURVFIT]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &est_rate[COMPOUND_AVERAGE], + &est_dist[COMPOUND_AVERAGE], NULL, NULL, NULL, NULL, NULL); + est_rate[COMPOUND_AVERAGE] += masked_type_cost[COMPOUND_AVERAGE]; + est_rd[COMPOUND_AVERAGE] = + RDCOST(x->rdmult, est_rate[COMPOUND_AVERAGE] + *rate_mv, + est_dist[COMPOUND_AVERAGE]); restore_dst_buf(xd, *tmp_dst, 1); + + // Next find the modeled rd cost for COMPOUND_DISTWTD + mbmi->interinter_comp.type = COMPOUND_DISTWTD; + mbmi->compound_idx = 0; + if (masked_compound_used) { + masked_type_cost[COMPOUND_DISTWTD] += + x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx]; + } + masked_type_cost[COMPOUND_DISTWTD] += + x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx]; + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, + AOM_PLANE_Y, AOM_PLANE_Y); + model_rd_sb_fn[MODELRD_CURVFIT]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &est_rate[COMPOUND_DISTWTD], + &est_dist[COMPOUND_DISTWTD], NULL, NULL, NULL, NULL, NULL); + est_rate[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_DISTWTD]; + est_rd[COMPOUND_DISTWTD] = + RDCOST(x->rdmult, est_rate[COMPOUND_DISTWTD] + 
*rate_mv, + est_dist[COMPOUND_DISTWTD]); + + // Choose the better of the two based on modeled cost and call + // estimate_yrd_for_sb() for that one. + if (est_rd[COMPOUND_AVERAGE] <= est_rd[COMPOUND_DISTWTD]) { + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->compound_idx = 1; + restore_dst_buf(xd, *orig_dst, 1); + RD_STATS est_rd_stats; + const int64_t est_rd_ = + estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats); + rs2 = masked_type_cost[COMPOUND_AVERAGE]; + if (est_rd_ != INT64_MAX) { + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + restore_dst_buf(xd, *tmp_dst, 1); + comp_rate[COMPOUND_AVERAGE] = est_rd_stats.rate; + comp_dist[COMPOUND_AVERAGE] = est_rd_stats.dist; + comp_model_rd[COMPOUND_AVERAGE] = est_rd[COMPOUND_AVERAGE]; + comp_model_rd_cur = est_rd[COMPOUND_AVERAGE]; + } + restore_dst_buf(xd, *tmp_dst, 1); + } else { + RD_STATS est_rd_stats; + const int64_t est_rd_ = + estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats); + rs2 = masked_type_cost[COMPOUND_DISTWTD]; + if (est_rd_ != INT64_MAX) { + best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + comp_rate[COMPOUND_DISTWTD] = est_rd_stats.rate; + comp_dist[COMPOUND_DISTWTD] = est_rd_stats.dist; + comp_model_rd[COMPOUND_DISTWTD] = est_rd[COMPOUND_DISTWTD]; + comp_model_rd_cur = est_rd[COMPOUND_DISTWTD]; + } + } } else { - mbmi->comp_group_idx = 1; - masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1]; - masked_type_cost += x->compound_type_cost[bsize][cur_type - 1]; - rs2 = masked_type_cost; - if (enable_wedge_search(x, cpi) && *rd / 3 < ref_best_rd) { - best_rd_cur = build_and_cost_compound_type( - cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst, - &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10, - strides, mi_row, mi_col, rd_stats->rate, ref_best_rd, - &calc_pred_masked_compound); + mbmi->interinter_comp.type = cur_type; + int masked_type_cost = 
0; + if (cur_type == COMPOUND_AVERAGE || cur_type == COMPOUND_DISTWTD) { + mbmi->comp_group_idx = 0; + mbmi->compound_idx = (cur_type == COMPOUND_AVERAGE); + if (masked_compound_used) { + masked_type_cost += + x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx]; + } + masked_type_cost += + x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx]; + rs2 = masked_type_cost; + const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0); + if (mode_rd < ref_best_rd) { + // Reuse data if matching record is found + if (comp_rate[cur_type] == INT_MAX) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, + bsize, AOM_PLANE_Y, AOM_PLANE_Y); + if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1; + RD_STATS est_rd_stats; + const int64_t est_rd = + estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats); + if (comp_rate[cur_type] != INT_MAX) { + assert(comp_rate[cur_type] == est_rd_stats.rate); + assert(comp_dist[cur_type] == est_rd_stats.dist); + } + if (est_rd != INT64_MAX) { + best_rd_cur = + RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate, + est_rd_stats.dist); + model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND]( + cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum, + &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL); + comp_model_rd_cur = + RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum); + + // Backup rate and distortion for future reuse + comp_rate[cur_type] = est_rd_stats.rate; + comp_dist[cur_type] = est_rd_stats.dist; + comp_model_rd[cur_type] = comp_model_rd_cur; + } + } else { + // Calculate RD cost based on stored stats + assert(comp_dist[cur_type] != INT64_MAX); + best_rd_cur = + RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type], + comp_dist[cur_type]); + comp_model_rd_cur = comp_model_rd[cur_type]; + } + } + // use spare buffer for following compound type try + if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1); + } else { + mbmi->comp_group_idx = 1; + mbmi->compound_idx = 
1; + masked_type_cost += + x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx]; + masked_type_cost += + x->compound_type_cost[bsize][cur_type - COMPOUND_WEDGE]; + rs2 = masked_type_cost; + + if (((*rd / cpi->max_comp_type_rd_threshold_div) * + cpi->max_comp_type_rd_threshold_mul) < ref_best_rd) { + const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type; + + if (!((compound_type == COMPOUND_WEDGE && + !enable_wedge_interinter_search(x, cpi)) || + (compound_type == COMPOUND_DIFFWTD && + !cpi->oxcf.enable_diff_wtd_comp))) + best_rd_cur = build_and_cost_compound_type( + cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst, + &tmp_rate_mv, preds0, preds1, buffers->residual1, + buffers->diff10, strides, mi_row, mi_col, rd_stats->rate, + ref_best_rd, &calc_pred_masked_compound, comp_rate, comp_dist, + comp_model_rd, comp_best_model_rd, &comp_model_rd_cur); + } } } if (best_rd_cur < *rd) { *rd = best_rd_cur; + comp_best_model_rd = comp_model_rd_cur; best_compound_data = mbmi->interinter_comp; - if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) { + if (masked_compound_used && cur_type >= COMPOUND_WEDGE) { memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len); } best_compmode_interinter_cost = rs2; @@ -9555,8 +10111,8 @@ static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, mbmi->mv[1].as_int = cur_mv[1].as_int; } if (mbmi->interinter_comp.type != best_compound_data.type) { - mbmi->comp_group_idx = - (best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1; + mbmi->comp_group_idx = (best_compound_data.type < COMPOUND_WEDGE) ? 
0 : 1; + mbmi->compound_idx = !(best_compound_data.type == COMPOUND_DISTWTD); mbmi->interinter_comp = best_compound_data; memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len); } @@ -9569,6 +10125,9 @@ static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x, } } restore_dst_buf(xd, *orig_dst, 1); + if (!match_found) + save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rd, + cur_mv); return best_compmode_interinter_cost; } @@ -9609,20 +10168,13 @@ typedef struct { int_mv mv; } inter_mode_info; -static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, RD_STATS *rd_stats, - RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv, - int *disable_skip, int mi_row, int mi_col, - HandleInterModeArgs *args, int64_t ref_best_rd, - uint8_t *const tmp_buf, - CompoundTypeRdBuffers *rd_buffers -#if CONFIG_COLLECT_INTER_MODE_RD_STATS - , - TileDataEnc *tile_data, int64_t *best_est_rd, - const int do_tx_search, - InterModesInfo *inter_modes_info -#endif -) { +static int64_t handle_inter_mode( + AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x, + BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y, + RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col, + HandleInterModeArgs *args, int64_t ref_best_rd, uint8_t *const tmp_buf, + CompoundTypeRdBuffers *rd_buffers, int64_t *best_est_rd, + const int do_tx_search, InterModesInfo *inter_modes_info) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); MACROBLOCKD *xd = &x->e_mbd; @@ -9642,7 +10194,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, // one for future predictions. In the end, copy from tmp_buf to // dst if necessary. 
struct macroblockd_plane *p = xd->plane; - BUFFER_SET orig_dst = { + const BUFFER_SET orig_dst = { { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf }, { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride }, }; @@ -9668,11 +10220,20 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int backup_rate_mv = 0; inter_mode_info mode_info[MAX_REF_MV_SERCH]; - int comp_idx; - const int search_jnt_comp = is_comp_pred & - cm->seq_params.order_hint_info.enable_jnt_comp & - (mbmi->mode != GLOBAL_GLOBALMV) & - (cpi->sf.use_jnt_comp_flag != JNT_COMP_DISABLED); + int mode_search_mask[2]; + const int do_two_loop_comp_search = + is_comp_pred && cpi->sf.two_loop_comp_search; + if (do_two_loop_comp_search) { + // TODO(debargha): Change this to try alternate ways of splitting + // modes while doing two pass compound_mode search. + mode_search_mask[0] = (1 << COMPOUND_AVERAGE); + } else { + mode_search_mask[0] = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) | + (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD); + } + mode_search_mask[1] = ((1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) | + (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD)) - + mode_search_mask[0]; // TODO(jingning): This should be deprecated shortly. const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0; @@ -9729,42 +10290,35 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } const RD_STATS backup_rd_stats = *rd_stats; - // If !search_jnt_comp, we need to force mbmi->compound_idx = 1. 
- for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) { + + for (int comp_loop_idx = 0; comp_loop_idx <= do_two_loop_comp_search; + ++comp_loop_idx) { int rs = 0; int compmode_interinter_cost = 0; - mbmi->compound_idx = comp_idx; - if (is_comp_pred && comp_idx == 0) { - *rd_stats = backup_rd_stats; - mbmi->interinter_comp.type = COMPOUND_AVERAGE; - mbmi->num_proj_ref = 0; - mbmi->motion_mode = SIMPLE_TRANSLATION; - mbmi->comp_group_idx = 0; - const int comp_index_ctx = get_comp_index_context(cm, xd); - compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0]; - } + if (is_comp_pred && comp_loop_idx == 1) *rd_stats = backup_rd_stats; int_mv cur_mv[2]; if (!build_cur_mv(cur_mv, this_mode, cm, x)) { continue; } if (have_newmv_in_inter_mode(this_mode)) { - if (comp_idx == 0) { + if (comp_loop_idx == 1) { cur_mv[0] = backup_mv[0]; cur_mv[1] = backup_mv[1]; rate_mv = backup_rate_mv; } +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_newmv_time); +#endif if (cpi->sf.prune_single_motion_modes_by_simple_trans && args->single_ref_first_pass == 0 && !is_comp_pred) { const int ref0 = mbmi->ref_frame[0]; newmv_ret_val = args->single_newmv_valid[ref_mv_idx][ref0] ? 
0 : 1; cur_mv[0] = args->single_newmv[ref_mv_idx][ref0]; rate_mv = args->single_newmv_rate[ref_mv_idx][ref0]; - } else if (!(search_jnt_comp && - (cpi->sf.use_jnt_comp_flag == JNT_COMP_SKIP_MV_SEARCH) && - comp_idx == 0)) { + } else if (comp_loop_idx == 0) { newmv_ret_val = handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col, &rate_mv, args); @@ -9774,6 +10328,9 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, backup_mv[1] = cur_mv[1]; backup_rate_mv = rate_mv; } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_newmv_time); +#endif if (newmv_ret_val != 0) { continue; @@ -9817,7 +10374,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, best_rd = RDCOST(x->rdmult, best_rd_stats.rate, best_rd_stats.dist); if (best_rd < ref_best_rd) ref_best_rd = best_rd; - skip = 1; break; } @@ -9869,46 +10425,90 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, continue; } +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, compound_type_rd_time); +#endif int skip_build_pred = 0; - if (is_comp_pred && comp_idx) { - // Find matching interp filter or set to default interp filter - const int need_search = - av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd); - int match_found = -1; - const InterpFilter assign_filter = cm->interp_filter; - if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) { - match_found = find_interp_filter_in_stats(x, mbmi); - } - if (!need_search || match_found == -1) { - set_default_interp_filters(mbmi, assign_filter); - } + if (is_comp_pred) { + if (mode_search_mask[comp_loop_idx] == (1 << COMPOUND_AVERAGE)) { + // Only compound_average + mbmi->interinter_comp.type = COMPOUND_AVERAGE; + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 1; + const int comp_index_ctx = get_comp_index_context(cm, xd); + compmode_interinter_cost += + 
x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx]; + } else if (mode_search_mask[comp_loop_idx] == (1 << COMPOUND_DISTWTD)) { + // Only compound_distwtd + if (!cm->seq_params.order_hint_info.enable_dist_wtd_comp || + cpi->sf.use_dist_wtd_comp_flag == DIST_WTD_COMP_DISABLED || + (do_two_loop_comp_search && mbmi->mode == GLOBAL_GLOBALMV)) + continue; + mbmi->interinter_comp.type = COMPOUND_DISTWTD; + mbmi->num_proj_ref = 0; + mbmi->motion_mode = SIMPLE_TRANSLATION; + mbmi->comp_group_idx = 0; + mbmi->compound_idx = 0; + const int comp_index_ctx = get_comp_index_context(cm, xd); + compmode_interinter_cost += + x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx]; + } else { + // Find matching interp filter or set to default interp filter + const int need_search = + av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd); + int match_found = -1; + const InterpFilter assign_filter = cm->interp_filter; + int is_luma_interp_done = 0; + if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) { + match_found = find_interp_filter_in_stats(x, mbmi); + } + if (!need_search || match_found == -1) { + set_default_interp_filters(mbmi, assign_filter); + } - int64_t best_rd_compound; - compmode_interinter_cost = compound_type_rd( - cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used, - &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound, - rd_stats, ref_best_rd); - if (ref_best_rd < INT64_MAX && - (best_rd_compound >> 3) * 6 > ref_best_rd) { - restore_dst_buf(xd, orig_dst, num_planes); - continue; - } - // No need to call av1_build_inter_predictors_sby if - // COMPOUND_AVERAGE is selected because it is the first - // candidate in compound_type_rd, and the following - // compound types searching uses tmp_dst buffer - if (mbmi->interinter_comp.type == COMPOUND_AVERAGE) { - if (num_planes > 1) - av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, &orig_dst, - bsize); - skip_build_pred = 1; + int64_t best_rd_compound; + compmode_interinter_cost = 
compound_type_rd( + cpi, x, bsize, mi_col, mi_row, cur_mv, + mode_search_mask[comp_loop_idx], masked_compound_used, &orig_dst, + &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound, rd_stats, + ref_best_rd, &is_luma_interp_done); + if (ref_best_rd < INT64_MAX && + (best_rd_compound >> 4) * (11 + 2 * do_two_loop_comp_search) > + ref_best_rd) { + restore_dst_buf(xd, orig_dst, num_planes); + continue; + } + // No need to call av1_enc_build_inter_predictor for luma if + // COMPOUND_AVERAGE is selected because it is the first + // candidate in compound_type_rd, and the following + // compound types searching uses tmp_dst buffer + + if (mbmi->interinter_comp.type == COMPOUND_AVERAGE && + is_luma_interp_done) { + if (num_planes > 1) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, + bsize, AOM_PLANE_U, num_planes - 1); + } + skip_build_pred = 1; + } } } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, compound_type_rd_time); +#endif +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, interpolation_filter_search_time); +#endif ret_val = interpolation_filter_search( - x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, + x, cpi, tile_data, bsize, mi_row, mi_col, &tmp_dst, &orig_dst, args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb, - skip_build_pred, args, ref_best_rd); + &skip_build_pred, args, ref_best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, interpolation_filter_search_time); +#endif if (args->modelled_rd != NULL && !is_comp_pred) { args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd; } @@ -9939,8 +10539,12 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } } rd_stats->rate += compmode_interinter_cost; + if (skip_build_pred != 1) { + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize, + 0, av1_num_planes(cm) - 1); + } - if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) { + if (cpi->sf.second_loop_comp_fast_tx_search && comp_loop_idx == 1) { 
// TODO(chengchen): this speed feature introduces big loss. // Need better estimation of rate distortion. int dummy_rate; @@ -9949,7 +10553,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, int64_t plane_sse[MAX_MB_PLANE] = { 0 }; int64_t plane_dist[MAX_MB_PLANE] = { 0 }; - model_rd_sb_fn[MODELRD_TYPE_JNT_COMPOUND]( + model_rd_sb_fn[MODELRD_TYPE_DIST_WTD_COMPOUND]( cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate, &dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse, plane_dist); @@ -9965,15 +10569,15 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, rd_stats_y->dist = plane_dist[0]; rd_stats_uv->dist = plane_dist[1] + plane_dist[2]; } else { -#if CONFIG_COLLECT_INTER_MODE_RD_STATS - ret_val = motion_mode_rd( - cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, - mi_row, mi_col, args, ref_best_rd, refs, &rate_mv, &orig_dst, - tile_data, best_est_rd, do_tx_search, inter_modes_info); -#else - ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y, +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, motion_mode_rd_time); +#endif + ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip, mi_row, mi_col, - args, ref_best_rd, refs, &rate_mv, &orig_dst); + args, ref_best_rd, refs, &rate_mv, &orig_dst, + best_est_rd, do_tx_search, inter_modes_info); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, motion_mode_rd_time); #endif } mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int; @@ -10019,10 +10623,10 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x, } static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, - RD_STATS *rd_cost, BLOCK_SIZE bsize, + RD_STATS *rd_stats, BLOCK_SIZE bsize, int64_t best_rd) { const AV1_COMMON *const cm = &cpi->common; - if (!av1_allow_intrabc(cm)) return INT64_MAX; + if (!av1_allow_intrabc(cm) || !cpi->oxcf.enable_intrabc) return INT64_MAX; 
const int num_planes = av1_num_planes(cm); MACROBLOCKD *const xd = &x->e_mbd; @@ -10074,7 +10678,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, }; MB_MODE_INFO best_mbmi = *mbmi; - RD_STATS best_rdcost = *rd_cost; + RD_STATS best_rdstats = *rd_stats; int best_skip = x->skip; uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 }; @@ -10118,17 +10722,18 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, MV mvp_full = dv_ref.as_mv; mvp_full.col >>= 3; mvp_full.row >>= 3; - int sadpb = x->sadperbit16; + const int sadpb = x->sadperbit16; int cost_list[5]; - int bestsme = av1_full_pixel_search( + const int bestsme = av1_full_pixel_search( cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0, sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1, - (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1); + (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1, + &cpi->ss_cfg[SS_CFG_LOOKAHEAD]); x->mv_limits = tmp_mv_limits; if (bestsme == INT_MAX) continue; mvp_full = x->best_mv.as_mv; - MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 }; + const MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 }; if (mv_check_bounds(&x->mv_limits, &dv)) continue; if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize, cm->seq_params.mib_size_log2)) @@ -10147,74 +10752,39 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR); mbmi->skip = 0; x->skip = 0; - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); int *dvcost[2] = { (int *)&cpi->dv_cost[0][MV_MAX], (int *)&cpi->dv_cost[1][MV_MAX] }; // TODO(aconverse@google.com): The full motion field defining discount // in MV_COST_WEIGHT is too large. Explore other values. 
- int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost, - dvcost, MV_COST_WEIGHT_SUB); + const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost, + dvcost, MV_COST_WEIGHT_SUB); const int rate_mode = x->intrabc_cost[1]; - RD_STATS rd_stats, rd_stats_uv; - av1_subtract_plane(x, bsize, 0); - if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - // Intrabc - select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX); - } else { - super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX); - memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size)); - for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) - set_blk_skip(x, 0, i, rd_stats.skip); - } - if (num_planes > 1) { - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); - av1_merge_rd_stats(&rd_stats, &rd_stats_uv); - } -#if CONFIG_RD_DEBUG - mbmi->rd_stats = rd_stats; -#endif - - const int skip_ctx = av1_get_skip_context(xd); - - RD_STATS rdc_noskip; - av1_init_rd_stats(&rdc_noskip); - rdc_noskip.rate = - rate_mode + rate_mv + rd_stats.rate + x->skip_cost[skip_ctx][0]; - rdc_noskip.dist = rd_stats.dist; - rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist); - if (rdc_noskip.rdcost < best_rd) { - best_rd = rdc_noskip.rdcost; + RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv; + if (!txfm_search(cpi, NULL, x, bsize, mi_row, mi_col, &rd_stats_yuv, + &rd_stats_y, &rd_stats_uv, rate_mode + rate_mv, INT64_MAX)) + continue; + rd_stats_yuv.rdcost = + RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist); + if (rd_stats_yuv.rdcost < best_rd) { + best_rd = rd_stats_yuv.rdcost; best_mbmi = *mbmi; - best_skip = x->skip; - best_rdcost = rdc_noskip; + best_skip = mbmi->skip; + best_rdstats = rd_stats_yuv; memcpy(best_blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); } - - if (!xd->lossless[mbmi->segment_id]) { - x->skip = 1; - mbmi->skip = 1; - RD_STATS rdc_skip; - av1_init_rd_stats(&rdc_skip); - rdc_skip.rate = 
rate_mode + rate_mv + x->skip_cost[skip_ctx][1]; - rdc_skip.dist = rd_stats.sse; - rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist); - if (rdc_skip.rdcost < best_rd) { - best_rd = rdc_skip.rdcost; - best_mbmi = *mbmi; - best_skip = x->skip; - best_rdcost = rdc_skip; - memcpy(best_blk_skip, x->blk_skip, - sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); - } - } } *mbmi = best_mbmi; - *rd_cost = best_rdcost; + *rd_stats = best_rdstats; x->skip = best_skip; memcpy(x->blk_skip, best_blk_skip, sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w); +#if CONFIG_RD_DEBUG + mbmi->rd_stats = *rd_stats; +#endif return best_rd; } @@ -10340,15 +10910,6 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, int above_stride, const uint8_t *left, int left_stride); -static const int ref_frame_flag_list[REF_FRAMES] = { 0, - AOM_LAST_FLAG, - AOM_LAST2_FLAG, - AOM_LAST3_FLAG, - AOM_GOLD_FLAG, - AOM_BWD_FLAG, - AOM_ALT2_FLAG, - AOM_ALT_FLAG }; - static void rd_pick_skip_mode(RD_STATS *rd_cost, InterModeSearchState *search_state, const AV1_COMP *const cpi, MACROBLOCK *const x, @@ -10381,6 +10942,10 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost, return; } + if (!cpi->oxcf.enable_onesided_comp && cpi->all_one_sided_refs) { + return; + } + mbmi->mode = this_mode; mbmi->uv_mode = UV_DC_PRED; mbmi->ref_frame[0] = ref_frame; @@ -10437,7 +11002,8 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost, rd_cost->dist) : INT64_MAX; - if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost) { + if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost && + (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) { assert(mode_index != -1); search_state->best_mbmode.skip_mode = 1; search_state->best_mbmode = *mbmi; @@ -10483,13 +11049,6 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost, rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist; rd_cost->rdcost = skip_mode_rd_stats.rdcost; -#if CONFIG_ONE_PASS_SVM - if (bsize >= BLOCK_8X8 && - 
block_size_high[bsize] == block_size_wide[bsize]) { - av1_copy_reg_stat(rd_cost, &skip_mode_rd_stats); - } -#endif - search_state->best_rd = rd_cost->rdcost; search_state->best_skip2 = 1; search_state->best_mode_skippable = 1; @@ -10539,15 +11098,15 @@ static void sf_refine_fast_tx_type_search( } if (is_inter_mode(mbmi->mode)) { - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); if (mbmi->motion_mode == OBMC_CAUSAL) av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); av1_subtract_plane(x, bsize, 0); if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) { - // av1_rd_pick_inter_mode_sb - select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, - INT64_MAX); + pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, + INT64_MAX); assert(rd_stats_y.rate != INT_MAX); } else { super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); @@ -10555,19 +11114,14 @@ static void sf_refine_fast_tx_type_search( for (int i = 0; i < xd->n4_h * xd->n4_w; ++i) set_blk_skip(x, 0, i, rd_stats_y.skip); } - if (num_planes > 1) { - inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, INT64_MAX, - FTXS_NONE); - } else { - av1_init_rd_stats(&rd_stats_uv); - } } else { super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX); - if (num_planes > 1) { - super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); - } else { - av1_init_rd_stats(&rd_stats_uv); - } + } + + if (num_planes > 1) { + super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX); + } else { + av1_init_rd_stats(&rd_stats_uv); } if (RDCOST(x->rdmult, @@ -10602,13 +11156,193 @@ static void sf_refine_fast_tx_type_search( } } +typedef struct { + // Mask for each reference frame, specifying which prediction modes to NOT try + // during search. 
+ uint32_t pred_modes[REF_FRAMES]; + // If ref_combo[i][j + 1] is true, do NOT try prediction using combination of + // reference frames (i, j). + // Note: indexing with 'j + 1' is due to the fact that 2nd reference can be -1 + // (NONE_FRAME). + bool ref_combo[REF_FRAMES][REF_FRAMES + 1]; +} mode_skip_mask_t; + +// Update 'ref_combo' mask to disable given 'ref' in single and compound modes. +static void disable_reference(MV_REFERENCE_FRAME ref, + bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { + for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { + ref_combo[ref][ref2 + 1] = true; + } +} + +// Update 'ref_combo' mask to disable all inter references except ALTREF. +static void disable_inter_references_except_altref( + bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) { + disable_reference(LAST_FRAME, ref_combo); + disable_reference(LAST2_FRAME, ref_combo); + disable_reference(LAST3_FRAME, ref_combo); + disable_reference(GOLDEN_FRAME, ref_combo); + disable_reference(BWDREF_FRAME, ref_combo); + disable_reference(ALTREF2_FRAME, ref_combo); +} + +static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = { + { LAST_FRAME, NONE_FRAME }, { ALTREF_FRAME, NONE_FRAME }, + { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, NONE_FRAME }, + { INTRA_FRAME, NONE_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }, + { LAST_FRAME, GOLDEN_FRAME }, { LAST_FRAME, INTRA_FRAME }, + { LAST_FRAME, BWDREF_FRAME }, { LAST_FRAME, LAST3_FRAME }, + { GOLDEN_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, INTRA_FRAME }, + { BWDREF_FRAME, NONE_FRAME }, { BWDREF_FRAME, ALTREF_FRAME }, + { ALTREF_FRAME, INTRA_FRAME }, { BWDREF_FRAME, INTRA_FRAME }, +}; + +static const MV_REFERENCE_FRAME real_time_ref_combos[][2] = { + { LAST_FRAME, NONE_FRAME }, + { ALTREF_FRAME, NONE_FRAME }, + { GOLDEN_FRAME, NONE_FRAME }, + { INTRA_FRAME, NONE_FRAME } +}; + +typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET; + +static void default_skip_mask(mode_skip_mask_t *mask, REF_SET ref_set) { + if (ref_set == 
REF_SET_FULL) { + // Everything available by default. + memset(mask, 0, sizeof(*mask)); + } else { + // All modes available by default. + memset(mask->pred_modes, 0, sizeof(mask->pred_modes)); + // All references disabled first. + for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) { + for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) { + mask->ref_combo[ref1][ref2 + 1] = true; + } + } + const MV_REFERENCE_FRAME(*ref_set_combos)[2]; + int num_ref_combos; + + // Then enable reduced set of references explicitly. + switch (ref_set) { + case REF_SET_REDUCED: + ref_set_combos = reduced_ref_combos; + num_ref_combos = + (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]); + break; + case REF_SET_REALTIME: + ref_set_combos = real_time_ref_combos; + num_ref_combos = + (int)sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]); + break; + default: assert(0); num_ref_combos = 0; + } + + for (int i = 0; i < num_ref_combos; ++i) { + const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i]; + mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false; + } + } +} + +static void init_mode_skip_mask(mode_skip_mask_t *mask, const AV1_COMP *cpi, + MACROBLOCK *x, BLOCK_SIZE bsize) { + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + unsigned char segment_id = mbmi->segment_id; + const SPEED_FEATURES *const sf = &cpi->sf; + REF_SET ref_set = REF_SET_FULL; + + if (sf->use_real_time_ref_set) + ref_set = REF_SET_REALTIME; + else if (cpi->oxcf.enable_reduced_reference_set) + ref_set = REF_SET_REDUCED; + + default_skip_mask(mask, ref_set); + + int min_pred_mv_sad = INT_MAX; + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) + min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]); + + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; 
++ref_frame) { + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) { + // Skip checking missing reference in both single and compound reference + // modes. + disable_reference(ref_frame, mask->ref_combo); + } else { + // Skip fixed mv modes for poor references + if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) { + mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO; + } + } + if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { + // Reference not used for the segment. + disable_reference(ref_frame, mask->ref_combo); + } + } + // Note: We use the following drop-out only if the SEG_LVL_REF_FRAME feature + // is disabled for this segment. This is to prevent the possibility that we + // end up unable to pick any mode. + if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { + // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame, + // unless ARNR filtering is enabled in which case we want + // an unfiltered alternative. We allow near/nearest as well + // because they may result in zero-zero MVs but be cheaper. 
+ if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { + disable_inter_references_except_altref(mask->ref_combo); + + mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; + const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME }; + int_mv near_mv, nearest_mv, global_mv; + get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext); + get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext); + get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext); + + if (near_mv.as_int != global_mv.as_int) + mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV); + if (nearest_mv.as_int != global_mv.as_int) + mask->pred_modes[ALTREF_FRAME] |= (1 << NEARESTMV); + } + } + + if (cpi->rc.is_src_frame_alt_ref) { + if (sf->alt_ref_search_fp) { + assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]); + mask->pred_modes[ALTREF_FRAME] = 0; + disable_inter_references_except_altref(mask->ref_combo); + disable_reference(INTRA_FRAME, mask->ref_combo); + } + } + + if (sf->alt_ref_search_fp) + if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX) + if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1)) + mask->pred_modes[ALTREF_FRAME] |= INTER_ALL; + + if (sf->adaptive_mode_search) { + if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref && + cpi->rc.frames_since_golden >= 3) + if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME]) + mask->pred_modes[GOLDEN_FRAME] |= INTER_ALL; + } + + if (bsize > sf->max_intra_bsize) { + disable_reference(INTRA_FRAME, mask->ref_combo); + } + + mask->pred_modes[INTRA_FRAME] |= + ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); +} + // Please add/modify parameter setting in this function, making it consistent // and easy to read and maintain. 
static void set_params_rd_pick_inter_mode( const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, - BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2], - uint32_t mode_skip_mask[REF_FRAMES], int skip_ref_frame_mask, - unsigned int ref_costs_single[REF_FRAMES], + BLOCK_SIZE bsize, int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask, + int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES], unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES], struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { const AV1_COMMON *const cm = &cpi->common; @@ -10616,8 +11350,6 @@ static void set_params_rd_pick_inter_mode( MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; - const struct segmentation *const seg = &cm->seg; - const SPEED_FEATURES *const sf = &cpi->sf; unsigned char segment_id = mbmi->segment_id; int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, @@ -10629,7 +11361,7 @@ static void set_params_rd_pick_inter_mode( for (int i = 0; i < MB_MODE_COUNT; ++i) for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { int len = sizeof(uint16_t); args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf); args->above_pred_buf[1] = @@ -10659,9 +11391,8 @@ static void set_params_rd_pick_inter_mode( for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; x->mbmi_ext->mode_context[ref_frame] = 0; - x->mbmi_ext->compound_mode_context[ref_frame] = 0; mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; - if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) { + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) { if (skip_ref_frame_mask & (1 
<< ref_frame)) { @@ -10678,7 +11409,7 @@ static void set_params_rd_pick_inter_mode( if (skip) continue; } } - assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); + assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL); setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, yv12_mb); } @@ -10688,8 +11419,8 @@ static void set_params_rd_pick_inter_mode( x->mbmi_ext->mode_context[ref_frame] = 0; mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES]; - if (!((cpi->ref_frame_flags & ref_frame_flag_list[rf[0]]) && - (cpi->ref_frame_flags & ref_frame_flag_list[rf[1]]))) { + if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) { continue; } @@ -10722,93 +11453,122 @@ static void set_params_rd_pick_inter_mode( args->left_pred_stride[0]); } - int min_pred_mv_sad = INT_MAX; - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) - min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]); + init_mode_skip_mask(mode_skip_mask, cpi, x, bsize); - for (int i = 0; i < 2; ++i) { - ref_frame_skip_mask[i] = 0; - } - memset(mode_skip_mask, 0, REF_FRAMES * sizeof(*mode_skip_mask)); - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])) { - // Skip checking missing references in both single and compound reference - // modes. Note that a mode will be skipped iff both reference frames - // are masked out. - ref_frame_skip_mask[0] |= (1 << ref_frame); - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - } else { - // Skip fixed mv modes for poor references - if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) { - mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO; - } - } - // If the segment reference frame feature is enabled.... - // then do nothing if the current ref frame is not allowed.. 
- if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && - get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { - ref_frame_skip_mask[0] |= (1 << ref_frame); - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - } + if (cpi->sf.tx_type_search.fast_intra_tx_type_search || + cpi->oxcf.use_intra_default_tx_only) + x->use_default_intra_tx_type = 1; + else + x->use_default_intra_tx_type = 0; + + if (cpi->sf.tx_type_search.fast_inter_tx_type_search) + x->use_default_inter_tx_type = 1; + else + x->use_default_inter_tx_type = 0; + if (cpi->sf.skip_repeat_interpolation_filter_search) { + x->interp_filter_stats_idx[0] = 0; + x->interp_filter_stats_idx[1] = 0; } + x->comp_rd_stats_idx = 0; +} - // Disable this drop out case if the ref frame - // segment level feature is enabled for this segment. This is to - // prevent the possibility that we end up unable to pick any mode. - if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { - // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame, - // unless ARNR filtering is enabled in which case we want - // an unfiltered alternative. We allow near/nearest as well - // because they may result in zero-zero MVs but be cheaper. - if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << LAST2_FRAME) | - (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) | - (1 << ALTREF2_FRAME) | (1 << GOLDEN_FRAME); - ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; - // TODO(zoeliu): To further explore whether following needs to be done for - // BWDREF_FRAME as well. 
- mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO; - const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME }; - int_mv near_mv, nearest_mv, global_mv; - get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext); - get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext); - get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext); +// TODO(kyslov): now this is very similar to set_params_rd_pick_inter_mode +// (except that doesn't set ALTREF parameters) +// consider passing a flag to select non-rd path (similar to +// encode_sb_row) +static void set_params_nonrd_pick_inter_mode( + const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args, + BLOCK_SIZE bsize, int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask, + int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES], + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES], + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) { + const AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + unsigned char segment_id = mbmi->segment_id; + int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; + int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, + MAX_SB_SIZE >> 1 }; + int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - if (near_mv.as_int != global_mv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV); - if (nearest_mv.as_int != global_mv.as_int) - mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV); - } - } + for (int i = 0; i < MB_MODE_COUNT; ++i) + for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE; - if (cpi->rc.is_src_frame_alt_ref) { - if (sf->alt_ref_search_fp) { - assert(cpi->ref_frame_flags & 
ref_frame_flag_list[ALTREF_FRAME]); - mode_skip_mask[ALTREF_FRAME] = 0; - ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME); - ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK; - } + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + int len = sizeof(uint16_t); + args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf); + args->above_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len); + args->above_pred_buf[2] = + CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len); + args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf); + args->left_pred_buf[1] = + CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len); + args->left_pred_buf[2] = + CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len); + } else { + args->above_pred_buf[0] = x->above_pred_buf; + args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1); + args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE; + args->left_pred_buf[0] = x->left_pred_buf; + args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1); + args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE; } - if (sf->alt_ref_search_fp) - if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX) - if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1)) - mode_skip_mask[ALTREF_FRAME] |= INTER_ALL; + av1_collect_neighbors_ref_counts(xd); - if (sf->adaptive_mode_search) { - if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref && - cpi->rc.frames_since_golden >= 3) - if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME]) - mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL; - } + estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single, + ref_costs_comp); - if (bsize > sf->max_intra_bsize) { - ref_frame_skip_mask[0] |= (1 << INTRA_FRAME); - ref_frame_skip_mask[1] |= (1 << INTRA_FRAME); + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + x->pred_mv_sad[ref_frame] = INT_MAX; + 
x->mbmi_ext->mode_context[ref_frame] = 0; + mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX; + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) { + if (mbmi->partition != PARTITION_NONE && + mbmi->partition != PARTITION_SPLIT) { + if (skip_ref_frame_mask & (1 << ref_frame)) { + int skip = 1; + for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { + if (!(skip_ref_frame_mask & (1 << r))) { + const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; + if (rf[0] == ref_frame || rf[1] == ref_frame) { + skip = 0; + break; + } + } + } + if (skip) continue; + } + } + assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL); + setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, + yv12_mb); + } } + av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col); - mode_skip_mask[INTRA_FRAME] |= - ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]); + if (check_num_overlappable_neighbors(mbmi) && + is_motion_variation_allowed_bsize(bsize)) { + av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, + args->above_pred_buf, dst_width1, + dst_height1, args->above_pred_stride); + av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, + args->left_pred_buf, dst_width2, + dst_height2, args->left_pred_stride); + av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, + 0, num_planes); + calc_target_weighted_pred( + cm, x, xd, mi_row, mi_col, args->above_pred_buf[0], + args->above_pred_stride[0], args->left_pred_buf[0], + args->left_pred_stride[0]); + } + init_mode_skip_mask(mode_skip_mask, cpi, x, bsize); if (cpi->sf.tx_type_search.fast_intra_tx_type_search) x->use_default_intra_tx_type = 1; @@ -10900,9 +11660,6 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, rate2 -= rd_stats_y.rate; if (num_planes > 1) rate2 -= search_state->rate_uv_tokenonly[uv_tx]; rate2 += x->skip_cost[av1_get_skip_context(xd)][1]; -#if CONFIG_ONE_PASS_SVM - av1_reg_stat_skipmode_update(&rd_stats_y, x->rdmult); 
-#endif } else { rate2 += x->skip_cost[av1_get_skip_context(xd)][0]; } @@ -10919,9 +11676,6 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, search_state->best_mode_skippable = skippable; memcpy(ctx->blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); -#if CONFIG_ONE_PASS_SVM - av1_copy_reg_stat(rd_cost, &rd_stats_y); -#endif } } @@ -11016,32 +11770,89 @@ static void init_inter_mode_search_state(InterModeSearchState *search_state, av1_zero(search_state->single_state_modelled_cnt); } +bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask, + const MV_REFERENCE_FRAME *ref_frame, + const PREDICTION_MODE this_mode) { + if (mode_skip_mask->pred_modes[ref_frame[0]] & (1 << this_mode)) { + return true; + } + + return mode_skip_mask->ref_combo[ref_frame[0]][ref_frame[1] + 1]; +} + +static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x, + BLOCK_SIZE bsize, int mode_index) { + const AV1_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame; + const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode; + const CurrentFrame *const current_frame = &cm->current_frame; + const MACROBLOCKD *const xd = &x->e_mbd; + const MB_MODE_INFO *const mbmi = xd->mi[0]; + const unsigned char segment_id = mbmi->segment_id; + const int comp_pred = ref_frame[1] > INTRA_FRAME; + + if (comp_pred) { + if (frame_is_intra_only(cm)) return 1; + + if (current_frame->reference_mode == SINGLE_REFERENCE) return 1; + + // Skip compound inter modes if ARF is not available. + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[1]])) + return 1; + + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. 
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1; + + if (!is_comp_ref_allowed(bsize)) return 1; + } + + if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) { + // Mode must be compatible + if (!is_interintra_allowed_mode(this_mode)) return 1; + if (!is_interintra_allowed_bsize(bsize)) return 1; + } + + return 0; +} + +static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x, + BLOCK_SIZE bsize, int mib_size, + int mi_row, int mi_col) { + const int sb_size_mask = mib_size - 1; + const int mi_row_in_sb = mi_row & sb_size_mask; + const int mi_col_in_sb = mi_col & sb_size_mask; + const int mi_w = mi_size_wide[bsize]; + const int mi_h = mi_size_high[bsize]; + int picked_ref_frames_mask = 0; + for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_h; ++i) { + for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_w; ++j) { + picked_ref_frames_mask |= x->picked_ref_frames_mask[i * 32 + j]; + } + } + return picked_ref_frames_mask; +} + // Case 1: return 0, means don't skip this mode // Case 2: return 1, means skip this mode completely // Case 3: return 2, means skip compound only, but still try single motion modes static int inter_mode_search_order_independent_skip( - const AV1_COMP *cpi, const PICK_MODE_CONTEXT *ctx, const MACROBLOCK *x, - BLOCK_SIZE bsize, int mode_index, int mi_row, int mi_col, - uint32_t *mode_skip_mask, uint16_t *ref_frame_skip_mask, - InterModeSearchState *search_state) { + const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index, + int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask, + InterModeSearchState *search_state, int skip_ref_frame_mask) { const SPEED_FEATURES *const sf = &cpi->sf; const AV1_COMMON *const cm = &cpi->common; - const struct segmentation *const seg = &cm->seg; const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info; const CurrentFrame *const current_frame = &cm->current_frame; const MACROBLOCKD *const xd = &x->e_mbd; const MB_MODE_INFO *const mbmi = 
xd->mi[0]; - const unsigned char segment_id = mbmi->segment_id; const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame; const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode; + const int comp_pred = ref_frame[1] > INTRA_FRAME; int skip_motion_mode = 0; - if (mode_skip_mask[ref_frame[0]] & (1 << this_mode)) { - return 1; - } - - if ((ref_frame_skip_mask[0] & (1 << ref_frame[0])) && - (ref_frame_skip_mask[1] & (1 << AOMMAX(0, ref_frame[1])))) { + if (mask_says_skip(mode_skip_mask, ref_frame, this_mode)) { return 1; } @@ -11053,14 +11864,14 @@ static int inter_mode_search_order_independent_skip( if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) { const int ref_type = av1_ref_frame_type(ref_frame); - int skip_ref = ctx->skip_ref_frame_mask & (1 << ref_type); + int skip_ref = skip_ref_frame_mask & (1 << ref_type); if (ref_type <= ALTREF_FRAME && skip_ref) { // Since the compound ref modes depends on the motion estimation result of // two single ref modes( best mv of single ref modes as the start point ) // If current single ref mode is marked skip, we need to check if it will // be used in compound ref modes. 
for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) { - if (!(ctx->skip_ref_frame_mask & (1 << r))) { + if (!(skip_ref_frame_mask & (1 << r))) { const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES]; if (rf[0] == ref_type || rf[1] == ref_type) { // Found a not skipped compound ref mode which contains current @@ -11077,8 +11888,7 @@ static int inter_mode_search_order_independent_skip( if (skip_ref) return 1; } - if (cpi->sf.mode_pruning_based_on_two_pass_partition_search && - !x->cb_partition_scan) { + if (cpi->two_pass_partition_search && !x->cb_partition_scan) { const int mi_width = mi_size_wide[bsize]; const int mi_height = mi_size_high[bsize]; int found = 0; @@ -11101,12 +11911,6 @@ static int inter_mode_search_order_independent_skip( if (!found) return 1; } - if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) { - // Mode must by compatible - if (!is_interintra_allowed_mode(this_mode)) return 1; - if (!is_interintra_allowed_bsize(bsize)) return 1; - } - // This is only used in motion vector unit test. if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME) return 1; @@ -11121,22 +11925,6 @@ static int inter_mode_search_order_independent_skip( x->source_variance < skip_intra_var_thresh) return 1; } - } else { - if (!is_comp_ref_allowed(bsize) && ref_frame[1] > INTRA_FRAME) return 1; - } - - const int comp_pred = ref_frame[1] > INTRA_FRAME; - if (comp_pred) { - if (!cpi->allow_comp_inter_inter) return 1; - - if (current_frame->reference_mode == SINGLE_REFERENCE) return 1; - - // Skip compound inter modes if ARF is not available. - if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]])) return 1; - - // Do not allow compound prediction if the segment level reference frame - // feature is in use as in this case there can only be one reference. 
- if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1; } if (sf->selective_ref_frame) { @@ -11176,8 +11964,7 @@ static int inter_mode_search_order_independent_skip( if ((sf->selective_ref_frame >= 2) && comp_pred && !cpi->all_one_sided_refs) { unsigned int ref_offsets[2]; for (int i = 0; i < 2; ++i) { - const RefCntBuffer *const buf = - cm->current_frame.frame_refs[ref_frame[i] - LAST_FRAME].buf; + const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame[i]); assert(buf != NULL); ref_offsets[i] = buf->order_hint; } @@ -11192,12 +11979,57 @@ static int inter_mode_search_order_independent_skip( return 1; } + if (sf->selective_ref_frame >= 4 && comp_pred) { + // Check if one of the reference is ALTREF2_FRAME and BWDREF_FRAME is a + // valid reference. + if ((ref_frame[0] == ALTREF2_FRAME || ref_frame[1] == ALTREF2_FRAME) && + (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) { + // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references. + if ((get_relative_dist( + order_hint_info, + cm->cur_frame->ref_order_hints[ALTREF2_FRAME - LAST_FRAME], + current_frame->order_hint) > 0) && + (get_relative_dist( + order_hint_info, + cm->cur_frame->ref_order_hints[BWDREF_FRAME - LAST_FRAME], + current_frame->order_hint) > 0)) { + // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer + // reference to the current frame than ALTREF2_FRAME + if (get_relative_dist( + order_hint_info, + cm->cur_frame->ref_order_hints[ALTREF2_FRAME - LAST_FRAME], + cm->cur_frame->ref_order_hints[BWDREF_FRAME - LAST_FRAME]) >= + 0) { + const RefCntBuffer *const buf_arf2 = + get_ref_frame_buf(cm, ALTREF2_FRAME); + assert(buf_arf2 != NULL); + const RefCntBuffer *const buf_bwd = + get_ref_frame_buf(cm, BWDREF_FRAME); + assert(buf_bwd != NULL); + (void)buf_arf2; + (void)buf_bwd; + return 1; + } + } + } + } + if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) { return 1; } if (skip_motion_mode) { return 2; } + + if 
(!cpi->oxcf.enable_global_motion && + (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) { + return 1; + } + + if (!cpi->oxcf.enable_onesided_comp && comp_pred && cpi->all_one_sided_refs) { + return 1; + } + return 0; } @@ -11233,6 +12065,7 @@ static int64_t handle_intra_mode(InterModeSearchState *search_state, assert(mbmi->ref_frame[0] == INTRA_FRAME); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const int try_palette = + cpi->oxcf.enable_palette && av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]]; const int intra_cost_penalty = av1_get_intra_cost_penalty( @@ -11255,14 +12088,14 @@ static int64_t handle_intra_mode(InterModeSearchState *search_state, TX_SIZE uv_tx; int is_directional_mode = av1_is_directional_mode(mbmi->mode); - if (is_directional_mode && av1_use_angle_delta(bsize)) { + if (is_directional_mode && av1_use_angle_delta(bsize) && + cpi->oxcf.enable_angle_delta) { int rate_dummy; int64_t model_rd = INT64_MAX; if (sf->intra_angle_estimation && !search_state->angle_stats_ready) { const int src_stride = x->plane[0].src.stride; const uint8_t *src = x->plane[0].src.buf; - angle_estimation(src, src_stride, rows, cols, bsize, - xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, + angle_estimation(src, src_stride, rows, cols, bsize, is_cur_buf_hbd(xd), search_state->directional_mode_skip_mask); search_state->angle_stats_ready = 1; } @@ -11795,6 +12628,16 @@ static void release_compound_type_rd_buffers( av1_zero(*bufs); // Set all pointers to NULL for safety. } +// Enables do_tx_search on a per-mode basis. +int do_tx_search_mode(int do_tx_search_global, int midx, int adaptive) { + if (!adaptive || do_tx_search_global) { + return do_tx_search_global; + } + // A value of 2 indicates it is being turned on conditionally + // for the mode. Turn it on for the first 7 modes. + return midx < 7 ? 
2 : 0; +} + void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, int mi_row, int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize, @@ -11805,6 +12648,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = xd->mi[0]; const int try_palette = + cpi->oxcf.enable_palette && av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type); PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info; const struct segmentation *const seg = &cm->seg; @@ -11815,16 +12659,8 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, unsigned int ref_costs_single[REF_FRAMES]; unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)]; - int *mode_map = tile_data->mode_map[bsize]; - uint32_t mode_skip_mask[REF_FRAMES]; - uint16_t ref_frame_skip_mask[2]; + mode_skip_mask_t mode_skip_mask; uint8_t motion_mode_skip_mask = 0; // second pass of single ref modes -#if CONFIG_ONE_PASS_SVM - int temp_y_eob = 0, temp_y_eob_0 = 0, temp_y_eob_1 = 0, temp_y_eob_2 = 0, - temp_y_eob_3 = 0; - int64_t temp_y_rd = 0, temp_y_rd_0 = 0, temp_y_rd_1 = 0, temp_y_rd_2 = 0, - temp_y_rd_3 = 0; -#endif InterModeSearchState search_state; init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize, @@ -11847,23 +12683,42 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, av1_invalid_rd_stats(rd_cost); + // Ref frames that are selected by square partition blocks. + int picked_ref_frames_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions && + mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) { + // prune_ref_frame_for_rect_partitions = 1 implies prune only extended + // partition blocks. prune_ref_frame_for_rect_partitions >=2 + // implies prune for vert, horiz and extended partition blocks. 
+ if ((mbmi->partition != PARTITION_VERT && + mbmi->partition != PARTITION_HORZ) || + cpi->sf.prune_ref_frame_for_rect_partitions >= 2) { + picked_ref_frames_mask = fetch_picked_ref_frames_mask( + x, bsize, cm->seq_params.mib_size, mi_row, mi_col); + } + } + + // Skip ref frames that never selected by square blocks. + const int skip_ref_frame_mask = + picked_ref_frames_mask ? ~picked_ref_frames_mask : 0; + // init params, set frame modes, speed features - set_params_rd_pick_inter_mode( - cpi, x, &args, bsize, mi_row, mi_col, ref_frame_skip_mask, mode_skip_mask, - ctx->skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb); + set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col, + &mode_skip_mask, skip_ref_frame_mask, + ref_costs_single, ref_costs_comp, yv12_mb); -#if CONFIG_COLLECT_INTER_MODE_RD_STATS int64_t best_est_rd = INT64_MAX; // TODO(angiebird): Turn this on when this speed feature is well tested -#if 1 const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize]; - const int do_tx_search = !md->ready; -#else - const int do_tx_search = 1; -#endif + // If do_tx_search_global is 0, only estimated RD should be computed. + // If do_tx_search_global is 1, all modes have TX search performed. + // If do_tx_search_global is 2, some modes will have TX search performed. 
+ const int do_tx_search_global = + !((cpi->sf.inter_mode_rd_model_estimation == 1 && md->ready) || + (cpi->sf.inter_mode_rd_model_estimation == 2 && + x->source_variance < 512)); InterModesInfo *inter_modes_info = x->inter_modes_info; inter_modes_info->num = 0; -#endif int intra_mode_num = 0; int intra_mode_idx_ls[MAX_MODES]; @@ -11876,8 +12731,9 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, alloc_compound_type_rd_buffers(cm, &rd_buffers); for (int midx = 0; midx < MAX_MODES; ++midx) { - int mode_index = mode_map[midx]; - const MODE_DEFINITION *mode_order = &av1_mode_order[mode_index]; + const int do_tx_search = do_tx_search_mode( + do_tx_search_global, midx, sf->inter_mode_rd_model_estimation_adaptive); + const MODE_DEFINITION *mode_order = &av1_mode_order[midx]; this_mode = mode_order->mode; const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0]; const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1]; @@ -11899,8 +12755,8 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, if (args.single_ref_first_pass) { // clear stats for (int k = 0; k < MAX_REF_MV_SERCH; ++k) { - x->simple_rd_state[mode_index][k].rd_stats.rdcost = INT64_MAX; - x->simple_rd_state[mode_index][k].early_skipped = 0; + x->simple_rd_state[midx][k].rd_stats.rdcost = INT64_MAX; + x->simple_rd_state[midx][k].early_skipped = 0; } } else { if (motion_mode_skip_mask & (1 << ref_frame)) { @@ -11923,14 +12779,16 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, int skippable = 0; int this_skip2 = 0; - init_mbmi(mbmi, mode_index, cm); + init_mbmi(mbmi, midx, cm); x->skip = 0; set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + if (inter_mode_compatible_skip(cpi, x, bsize, midx)) continue; + const int ret = inter_mode_search_order_independent_skip( - cpi, ctx, x, bsize, mode_index, mi_row, mi_col, mode_skip_mask, - ref_frame_skip_mask, &search_state); + cpi, x, bsize, midx, mi_row, mi_col, &mode_skip_mask, 
&search_state, + skip_ref_frame_mask); if (ret == 1) continue; args.skip_motion_mode = (ret == 2); @@ -11940,8 +12798,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, } } - if (search_state.best_rd < search_state.mode_threshold[mode_index]) - continue; + if (search_state.best_rd < search_state.mode_threshold[midx]) continue; if (sf->prune_comp_search_by_single_result > 0 && comp_pred) { if (compound_skip_by_single_states(cpi, &search_state, this_mode, @@ -11967,7 +12824,12 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, } if (ref_frame == INTRA_FRAME) { - if (sf->adaptive_mode_search) + if ((!cpi->oxcf.enable_smooth_intra || sf->disable_smooth_intra) && + (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || + mbmi->mode == SMOOTH_V_PRED)) + continue; + if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue; + if (sf->adaptive_mode_search > 1) if ((x->source_variance << num_pels_log2_lookup[bsize]) > search_state.best_pred_sse) continue; @@ -11995,7 +12857,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, } if (ref_frame == INTRA_FRAME) { - intra_mode_idx_ls[intra_mode_num++] = mode_index; + intra_mode_idx_ls[intra_mode_num++] = midx; continue; } else { mbmi->angle_delta[PLANE_TYPE_Y] = 0; @@ -12014,30 +12876,25 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, args.single_newmv_valid = search_state.single_newmv_valid; args.single_comp_cost = real_compmode_cost; args.ref_frame_cost = ref_frame_cost; - if (mode_index < MAX_SINGLE_REF_MODES) { - args.simple_rd_state = x->simple_rd_state[mode_index]; + if (midx < MAX_SINGLE_REF_MODES) { + args.simple_rd_state = x->simple_rd_state[midx]; } -#if CONFIG_COLLECT_INTER_MODE_RD_STATS + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_inter_mode_time); +#endif this_rd = handle_inter_mode( - cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip, - mi_row, mi_col, &args, 
ref_best_rd, tmp_buf, &rd_buffers, tile_data, - &best_est_rd, do_tx_search, inter_modes_info); -#else - this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y, - &rd_stats_uv, &disable_skip, mi_row, mi_col, - &args, ref_best_rd, tmp_buf, &rd_buffers); + cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, + &disable_skip, mi_row, mi_col, &args, ref_best_rd, tmp_buf, + &rd_buffers, &best_est_rd, do_tx_search, inter_modes_info); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_inter_mode_time); #endif rate2 = rd_stats.rate; skippable = rd_stats.skip; distortion2 = rd_stats.dist; rate_y = rd_stats_y.rate; rate_uv = rd_stats_uv.rate; -#if CONFIG_ONE_PASS_SVM - av1_unpack_reg_stat(&rd_stats_y, &temp_y_eob, &temp_y_eob_0, - &temp_y_eob_1, &temp_y_eob_2, &temp_y_eob_3, - &temp_y_rd, &temp_y_rd_0, &temp_y_rd_1, - &temp_y_rd_2, &temp_y_rd_3); -#endif } if (sf->prune_comp_search_by_single_result > 0 && @@ -12063,7 +12920,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, } if (!mode_excluded) { // Note index of best mode so far - search_state.best_mode_index = mode_index; + search_state.best_mode_index = midx; if (ref_frame == INTRA_FRAME) { /* required for left and above block mv */ @@ -12079,7 +12936,6 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, search_state.best_mbmode = *mbmi; search_state.best_skip2 = this_skip2; search_state.best_mode_skippable = skippable; -#if CONFIG_COLLECT_INTER_MODE_RD_STATS if (do_tx_search) { // When do_tx_search == 0, handle_inter_mode won't provide correct // rate_y and rate_uv because txfm_search process is replaced by @@ -12090,24 +12946,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, rate_y + x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable]; search_state.best_rate_uv = rate_uv; - -#if CONFIG_ONE_PASS_SVM - av1_set_reg_stat(rd_cost, temp_y_eob, temp_y_eob_0, temp_y_eob_1, - temp_y_eob_2, temp_y_eob_3, temp_y_rd, 
temp_y_rd_0, - temp_y_rd_1, temp_y_rd_2, temp_y_rd_3); -#endif } -#else // CONFIG_COLLECT_INTER_MODE_RD_STATS - search_state.best_rate_y = - rate_y + - x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable]; - search_state.best_rate_uv = rate_uv; -#if CONFIG_ONE_PASS_SVM - av1_set_reg_stat(rd_cost, temp_y_eob, temp_y_eob_0, temp_y_eob_1, - temp_y_eob_2, temp_y_eob_3, temp_y_rd, temp_y_rd_0, - temp_y_rd_1, temp_y_rd_2, temp_y_rd_3); -#endif -#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS memcpy(ctx->blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } @@ -12148,51 +12987,67 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, release_compound_type_rd_buffers(&rd_buffers); -#if CONFIG_COLLECT_INTER_MODE_RD_STATS - if (!do_tx_search) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, do_tx_search_time); +#endif + if (do_tx_search_global != 1) { inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr); search_state.best_rd = INT64_MAX; int64_t top_est_rd = - inter_modes_info->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]; + inter_modes_info->num > 0 + ? inter_modes_info + ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx] + : INT64_MAX; for (int j = 0; j < inter_modes_info->num; ++j) { const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx; *mbmi = inter_modes_info->mbmi_arr[data_idx]; int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx]; - if (curr_est_rd * 0.9 > top_est_rd) { - continue; - } - const int mode_rate = inter_modes_info->mode_rate_arr[data_idx]; - - x->skip = 0; - set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); - - // Select prediction reference frames. 
- const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; - for (i = 0; i < num_planes; i++) { - xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; - if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; - } + if (curr_est_rd * 0.80 > top_est_rd) break; RD_STATS rd_stats; RD_STATS rd_stats_y; RD_STATS rd_stats_uv; - av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize); - if (mbmi->motion_mode == OBMC_CAUSAL) - av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); - - if (!txfm_search(cpi, x, bsize, mi_row, mi_col, &rd_stats, &rd_stats_y, - &rd_stats_uv, mode_rate, search_state.best_rd)) { - continue; + bool true_rd = inter_modes_info->true_rd_arr[data_idx]; + if (true_rd) { + rd_stats = inter_modes_info->rd_cost_arr[data_idx]; + rd_stats_y = inter_modes_info->rd_cost_y_arr[data_idx]; + rd_stats_uv = inter_modes_info->rd_cost_uv_arr[data_idx]; + memcpy(x->blk_skip, inter_modes_info->blk_skip_arr[data_idx], + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); } else { - const int skip_ctx = av1_get_skip_context(xd); - inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse, - rd_stats.dist, - rd_stats_y.rate + rd_stats_uv.rate + - x->skip_cost[skip_ctx][mbmi->skip]); + const int mode_rate = inter_modes_info->mode_rate_arr[data_idx]; + + x->skip = 0; + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. 
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (is_comp_pred) + xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + if (mbmi->motion_mode == OBMC_CAUSAL) + av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); + + if (!txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, &rd_stats, + &rd_stats_y, &rd_stats_uv, mode_rate, + search_state.best_rd)) { + continue; + } else if (cpi->sf.inter_mode_rd_model_estimation == 1) { + const int skip_ctx = av1_get_skip_context(xd); + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse, + rd_stats.dist, + rd_stats_y.rate + rd_stats_uv.rate + + x->skip_cost[skip_ctx][mbmi->skip]); + } + rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); } - rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); if (rd_stats.rdcost < search_state.best_rd) { search_state.best_rd = rd_stats.rdcost; @@ -12211,14 +13066,16 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, search_state.best_rate_uv = rd_stats_uv.rate; memcpy(ctx->blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); -#if CONFIG_ONE_PASS_SVM - av1_copy_reg_stat(rd_cost, &rd_stats_y); -#endif } } } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, do_tx_search_time); #endif +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_intra_mode_time); +#endif for (int j = 0; j < intra_mode_num; ++j) { const int mode_index = intra_mode_idx_ls[j]; const MV_REFERENCE_FRAME ref_frame = @@ -12256,11 +13113,11 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, search_state.best_rate_uv = intra_rd_stats_uv.rate; memcpy(ctx->blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); -#if CONFIG_ONE_PASS_SVM - av1_copy_reg_stat(rd_cost, &intra_rd_stats_y); -#endif } } 
+#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_intra_mode_time); +#endif // In effect only when speed >= 2. sf_refine_fast_tx_type_search( @@ -12273,7 +13130,6 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi, ref_costs_single, &search_state); } - search_state.best_mbmode.skip_mode = 0; if (cm->current_frame.skip_mode_info.skip_mode_flag && !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && @@ -12351,6 +13207,496 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, } } +// TODO(kyslov): now this is very similar to av1_rd_pick_inter_mode_sb except: +// it only checks non-compound mode and +// it doesn't check palette mode +// it doesn't refine tx search +// this function is likely to be heavily modified with nonrd mode +// decision +void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data, + MACROBLOCK *x, int mi_row, int mi_col, + RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far) { + AV1_COMMON *const cm = &cpi->common; + const int num_planes = av1_num_planes(cm); + const SPEED_FEATURES *const sf = &cpi->sf; + MACROBLOCKD *const xd = &x->e_mbd; + MB_MODE_INFO *const mbmi = xd->mi[0]; + const struct segmentation *const seg = &cm->seg; + PREDICTION_MODE this_mode; + unsigned char segment_id = mbmi->segment_id; + int i; + struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]; + unsigned int ref_costs_single[REF_FRAMES]; + unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES]; + int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)]; + mode_skip_mask_t mode_skip_mask; + uint8_t motion_mode_skip_mask = 0; // second pass of single ref modes + + InterModeSearchState search_state; + init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize, + best_rd_so_far); + INTERINTRA_MODE interintra_modes[REF_FRAMES] = { + INTERINTRA_MODES, INTERINTRA_MODES, 
INTERINTRA_MODES, INTERINTRA_MODES, + INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES + }; + HandleInterModeArgs args = { + { NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }, + { NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 }, + NULL, NULL, + NULL, search_state.modelled_rd, + { { 0 } }, INT_MAX, + INT_MAX, search_state.simple_rd, + 0, interintra_modes, + 1, NULL + }; + for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX; + + av1_invalid_rd_stats(rd_cost); + + // Ref frames that are selected by square partition blocks. + int picked_ref_frames_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions && + mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) { + // Don't enable for vert and horz partition blocks if current frame + // will be used as bwd or arf2. + if ((!cpi->refresh_bwd_ref_frame && !cpi->refresh_alt2_ref_frame) || + (mbmi->partition != PARTITION_VERT && + mbmi->partition != PARTITION_HORZ)) { + picked_ref_frames_mask = fetch_picked_ref_frames_mask( + x, bsize, cm->seq_params.mib_size, mi_row, mi_col); + } + } + + // Skip ref frames that never selected by square blocks. + const int skip_ref_frame_mask = + picked_ref_frames_mask ? ~picked_ref_frames_mask : 0; + + // init params, set frame modes, speed features + set_params_nonrd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col, + &mode_skip_mask, skip_ref_frame_mask, + ref_costs_single, ref_costs_comp, yv12_mb); + + int64_t best_est_rd = INT64_MAX; + InterModesInfo *inter_modes_info = x->inter_modes_info; + inter_modes_info->num = 0; + + int intra_mode_num = 0; + int intra_mode_idx_ls[MAX_MODES]; + int reach_first_comp_mode = 0; + + // Temporary buffers used by handle_inter_mode(). 
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_obmc_bufs[0]); + + CompoundTypeRdBuffers rd_buffers; + alloc_compound_type_rd_buffers(cm, &rd_buffers); + + for (int midx = 0; midx < MAX_MODES; ++midx) { + const MODE_DEFINITION *mode_order = &av1_mode_order[midx]; + this_mode = mode_order->mode; + const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0]; + const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1]; + const int comp_pred = second_ref_frame > INTRA_FRAME; + + if (second_ref_frame != NONE_FRAME) continue; + + // When single ref motion search ends: + // 1st pass: To evaluate single ref RD results and rewind to the beginning; + // 2nd pass: To continue with compound ref search. + if (sf->prune_single_motion_modes_by_simple_trans) { + if (comp_pred && args.single_ref_first_pass) { + args.single_ref_first_pass = 0; + // Reach the first comp ref mode + // Reset midx to start the 2nd pass for single ref motion search + midx = -1; + motion_mode_skip_mask = analyze_simple_trans_states(cpi, x); + continue; + } + if (!comp_pred && ref_frame != INTRA_FRAME) { // single ref mode + if (args.single_ref_first_pass) { + // clear stats + for (int k = 0; k < MAX_REF_MV_SERCH; ++k) { + x->simple_rd_state[midx][k].rd_stats.rdcost = INT64_MAX; + x->simple_rd_state[midx][k].early_skipped = 0; + } + } else { + if (motion_mode_skip_mask & (1 << ref_frame)) { + continue; + } + } + } + } + + // Reach the first compound prediction mode + if (sf->prune_comp_search_by_single_result > 0 && comp_pred && + reach_first_comp_mode == 0) { + analyze_single_states(cpi, &search_state); + reach_first_comp_mode = 1; + } + int64_t this_rd = INT64_MAX; + int disable_skip = 0; + int rate2 = 0; + int64_t distortion2 = 0; + int skippable = 0; + int this_skip2 = 0; + + init_mbmi(mbmi, midx, cm); + + x->skip = 0; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); + + if (inter_mode_compatible_skip(cpi, x, bsize, midx)) continue; + + const int ret = 
inter_mode_search_order_independent_skip( + cpi, x, bsize, midx, mi_row, mi_col, &mode_skip_mask, &search_state, + skip_ref_frame_mask); + if (ret == 1) continue; + args.skip_motion_mode = (ret == 2); + + if (sf->drop_ref && comp_pred) { + if (sf_check_is_drop_ref(mode_order, &search_state)) { + continue; + } + } + + if (search_state.best_rd < search_state.mode_threshold[midx]) continue; + + if (sf->prune_comp_search_by_single_result > 0 && comp_pred) { + if (compound_skip_by_single_states(cpi, &search_state, this_mode, + ref_frame, second_ref_frame, x)) + continue; + } + + const int ref_frame_cost = comp_pred + ? ref_costs_comp[ref_frame][second_ref_frame] + : ref_costs_single[ref_frame]; + const int compmode_cost = + is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0; + const int real_compmode_cost = + cm->current_frame.reference_mode == REFERENCE_MODE_SELECT + ? compmode_cost + : 0; + + if (comp_pred) { + if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) && + search_state.best_mode_index >= 0 && + search_state.best_mbmode.ref_frame[0] == INTRA_FRAME) + continue; + } + + if (ref_frame == INTRA_FRAME) { + if (!cpi->oxcf.enable_smooth_intra && + (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED || + mbmi->mode == SMOOTH_V_PRED)) + continue; + if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue; + if (sf->adaptive_mode_search > 1) + if ((x->source_variance << num_pels_log2_lookup[bsize]) > + search_state.best_pred_sse) + continue; + + if (this_mode != DC_PRED) { + // Only search the oblique modes if the best so far is + // one of the neighboring directional modes + if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && + (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) { + if (search_state.best_mode_index >= 0 && + search_state.best_mbmode.ref_frame[0] > INTRA_FRAME) + continue; + } + if (sf->mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) { + if (conditional_skipintra(this_mode, 
search_state.best_intra_mode)) + continue; + } + } + } + + // Select prediction reference frames. + for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } + + if (ref_frame == INTRA_FRAME) { + intra_mode_idx_ls[intra_mode_num++] = midx; + continue; + } else { + mbmi->angle_delta[PLANE_TYPE_Y] = 0; + mbmi->angle_delta[PLANE_TYPE_UV] = 0; + mbmi->filter_intra_mode_info.use_filter_intra = 0; + mbmi->ref_mv_idx = 0; + int64_t ref_best_rd = search_state.best_rd; + { + RD_STATS rd_stats, rd_stats_y, rd_stats_uv; + av1_init_rd_stats(&rd_stats); + rd_stats.rate = rate2; + + // Point to variables that are maintained between loop iterations + args.single_newmv = search_state.single_newmv; + args.single_newmv_rate = search_state.single_newmv_rate; + args.single_newmv_valid = search_state.single_newmv_valid; + args.single_comp_cost = real_compmode_cost; + args.ref_frame_cost = ref_frame_cost; + if (midx < MAX_SINGLE_REF_MODES) { + args.simple_rd_state = x->simple_rd_state[midx]; + } + this_rd = handle_inter_mode( + cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, + &disable_skip, mi_row, mi_col, &args, ref_best_rd, tmp_buf, + &rd_buffers, &best_est_rd, 0, inter_modes_info); + rate2 = rd_stats.rate; + skippable = rd_stats.skip; + distortion2 = rd_stats.dist; + } + + if (sf->prune_comp_search_by_single_result > 0 && + is_inter_singleref_mode(this_mode) && args.single_ref_first_pass) { + collect_single_states(x, &search_state, mbmi); + } + + if (this_rd == INT64_MAX) continue; + + this_skip2 = mbmi->skip; + this_rd = RDCOST(x->rdmult, rate2, distortion2); + } + + // Did this mode help.. i.e. 
is it the new best mode + if (this_rd < search_state.best_rd || x->skip) { + int mode_excluded = 0; + if (comp_pred) { + mode_excluded = cm->current_frame.reference_mode == SINGLE_REFERENCE; + } + if (!mode_excluded) { + // Note index of best mode so far + search_state.best_mode_index = midx; + + if (ref_frame == INTRA_FRAME) { + /* required for left and above block mv */ + mbmi->mv[0].as_int = 0; + } else { + search_state.best_pred_sse = x->pred_sse[ref_frame]; + } + + rd_cost->rate = rate2; + rd_cost->dist = distortion2; + rd_cost->rdcost = this_rd; + search_state.best_rd = this_rd; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = this_skip2; + search_state.best_mode_skippable = skippable; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + } + } + + /* keep record of best compound/single-only prediction */ + if (!disable_skip && ref_frame != INTRA_FRAME) { + int64_t single_rd, hybrid_rd, single_rate, hybrid_rate; + + if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) { + single_rate = rate2 - compmode_cost; + hybrid_rate = rate2; + } else { + single_rate = rate2; + hybrid_rate = rate2 + compmode_cost; + } + + single_rd = RDCOST(x->rdmult, single_rate, distortion2); + hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2); + + if (!comp_pred) { + if (single_rd < search_state.best_pred_rd[SINGLE_REFERENCE]) + search_state.best_pred_rd[SINGLE_REFERENCE] = single_rd; + } else { + if (single_rd < search_state.best_pred_rd[COMPOUND_REFERENCE]) + search_state.best_pred_rd[COMPOUND_REFERENCE] = single_rd; + } + if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT]) + search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd; + } + if (sf->drop_ref && second_ref_frame == NONE_FRAME) { + // Collect data from single ref mode, and analyze data. 
+ sf_drop_ref_analyze(&search_state, mode_order, distortion2); + } + + if (x->skip && !comp_pred) break; + } + + release_compound_type_rd_buffers(&rd_buffers); + + inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr); + search_state.best_rd = INT64_MAX; + + if (inter_modes_info->num > 0) { + const int data_idx = inter_modes_info->rd_idx_pair_arr[0].idx; + *mbmi = inter_modes_info->mbmi_arr[data_idx]; + const int mode_rate = inter_modes_info->mode_rate_arr[data_idx]; + + x->skip = 0; + set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]); + + // Select prediction reference frames. + const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i]; + if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + } + + RD_STATS rd_stats; + RD_STATS rd_stats_y; + RD_STATS rd_stats_uv; + + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0, + av1_num_planes(cm) - 1); + if (mbmi->motion_mode == OBMC_CAUSAL) + av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col); + + if (txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, &rd_stats, + &rd_stats_y, &rd_stats_uv, mode_rate, + search_state.best_rd)) { + if (cpi->sf.inter_mode_rd_model_estimation == 1) { + const int skip_ctx = av1_get_skip_context(xd); + inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse, + rd_stats.dist, + rd_stats_y.rate + rd_stats_uv.rate + + x->skip_cost[skip_ctx][mbmi->skip]); + } + rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist); + + if (rd_stats.rdcost < search_state.best_rd) { + search_state.best_rd = rd_stats.rdcost; + // Note index of best mode so far + const int mode_index = get_prediction_mode_idx( + mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]); + search_state.best_mode_index = mode_index; + *rd_cost = rd_stats; + search_state.best_rd = rd_stats.rdcost; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 
= mbmi->skip; + search_state.best_mode_skippable = rd_stats.skip; + search_state.best_rate_y = + rd_stats_y.rate + + x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip]; + search_state.best_rate_uv = rd_stats_uv.rate; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + } + } + } + + for (int j = 0; j < intra_mode_num; ++j) { + const int mode_index = intra_mode_idx_ls[j]; + const MV_REFERENCE_FRAME ref_frame = + av1_mode_order[mode_index].ref_frame[0]; + assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME); + assert(ref_frame == INTRA_FRAME); + if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break; + init_mbmi(mbmi, mode_index, cm); + x->skip = 0; + set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME); + + // Select prediction reference frames. + for (i = 0; i < num_planes; i++) { + xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + } + + RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv; + + const int ref_frame_cost = ref_costs_single[ref_frame]; + intra_rd_stats.rdcost = handle_intra_mode( + &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0, + &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv); + if (intra_rd_stats.rdcost < search_state.best_rd) { + search_state.best_rd = intra_rd_stats.rdcost; + // Note index of best mode so far + search_state.best_mode_index = mode_index; + *rd_cost = intra_rd_stats; + search_state.best_rd = intra_rd_stats.rdcost; + search_state.best_mbmode = *mbmi; + search_state.best_skip2 = 0; + search_state.best_mode_skippable = intra_rd_stats.skip; + search_state.best_rate_y = + intra_rd_stats_y.rate + + x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip]; + search_state.best_rate_uv = intra_rd_stats_uv.rate; + memcpy(ctx->blk_skip, x->blk_skip, + sizeof(x->blk_skip[0]) * ctx->num_4x4_blk); + } + } + + search_state.best_mbmode.skip_mode = 0; + if (cm->current_frame.skip_mode_info.skip_mode_flag && + !segfeature_active(seg, 
segment_id, SEG_LVL_REF_FRAME) && + is_comp_ref_allowed(bsize)) { + rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, mi_row, mi_col, + yv12_mb); + } + + // Make sure that the ref_mv_idx is only nonzero when we're + // using a mode which can support ref_mv_idx + if (search_state.best_mbmode.ref_mv_idx != 0 && + !(search_state.best_mbmode.mode == NEWMV || + search_state.best_mbmode.mode == NEW_NEWMV || + have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) { + search_state.best_mbmode.ref_mv_idx = 0; + } + + if (search_state.best_mode_index < 0 || + search_state.best_rd >= best_rd_so_far) { + rd_cost->rate = INT_MAX; + rd_cost->rdcost = INT64_MAX; + return; + } + + assert( + (cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == + av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 0)) || + !is_inter_block(&search_state.best_mbmode)); + assert( + (cm->interp_filter == SWITCHABLE) || + (cm->interp_filter == + av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 1)) || + !is_inter_block(&search_state.best_mbmode)); + + if (!cpi->rc.is_src_frame_alt_ref) + av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact, + sf->adaptive_rd_thresh, bsize, + search_state.best_mode_index); + + // macroblock modes + *mbmi = search_state.best_mbmode; + x->skip |= search_state.best_skip2; + + // Note: this section is needed since the mode may have been forced to + // GLOBALMV by the all-zero mode handling of ref-mv. 
+ if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) { + // Correct the interp filters for GLOBALMV + if (is_nontrans_global_motion(xd, xd->mi[0])) { + assert(mbmi->interp_filters == + av1_broadcast_interp_filter( + av1_unswitchable_filter(cm->interp_filter))); + } + } + + for (i = 0; i < REFERENCE_MODES; ++i) { + if (search_state.best_pred_rd[i] == INT64_MAX) + search_state.best_pred_diff[i] = INT_MIN; + else + search_state.best_pred_diff[i] = + search_state.best_rd - search_state.best_pred_rd[i]; + } + + x->skip |= search_state.best_mode_skippable; + + assert(search_state.best_mode_index >= 0); + + store_coding_context(x, ctx, search_state.best_mode_index, + search_state.best_pred_diff, + search_state.best_mode_skippable); +} + void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, int mi_row, int mi_col, @@ -12494,7 +13840,7 @@ static INLINE void calc_target_weighted_pred_above( int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE); int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE); const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE; - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; + const int is_hbd = is_cur_buf_hbd(xd); if (!is_hbd) { for (int row = 0; row < ctxt->overlap; ++row) { @@ -12540,7 +13886,7 @@ static INLINE void calc_target_weighted_pred_left( int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw); int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw); const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride); - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
1 : 0; + const int is_hbd = is_cur_buf_hbd(xd); if (!is_hbd) { for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) { @@ -12622,7 +13968,7 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x, int32_t *mask_buf = x->mask_buf; int32_t *wsrc_buf = x->wsrc_buf; - const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0; + const int is_hbd = is_cur_buf_hbd(xd); const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA; // plane 0 should not be subsampled @@ -12741,12 +14087,14 @@ void gaussian_blur(const uint8_t *src, int src_stride, int w, int h, } } -static uint16_t edge_probability(const uint8_t *input, int w, int h, +static EdgeInfo edge_probability(const uint8_t *input, int w, int h, bool high_bd, int bd) { // The probability of an edge in the whole image is the same as the highest // probability of an edge for any individual pixel. Use Sobel as the metric // for finding an edge. uint16_t highest = 0; + uint16_t highest_x = 0; + uint16_t highest_y = 0; // Ignore the 1 pixel border around the image for the computation. for (int j = 1; j < h - 1; ++j) { for (int i = 1; i < w - 1; ++i) { @@ -12756,18 +14104,22 @@ static uint16_t edge_probability(const uint8_t *input, int w, int h, int16_t g_y = g.y >> (bd - 8); uint16_t magnitude = (uint16_t)sqrt(g_x * g_x + g_y * g_y); highest = AOMMAX(highest, magnitude); + highest_x = AOMMAX(highest_x, g_x); + highest_y = AOMMAX(highest_y, g_y); } } - return highest; + EdgeInfo ei = { .magnitude = highest, .x = highest_x, .y = highest_y }; + return ei; } /* Uses most of the Canny edge detection algorithm to find if there are any * edges in the image. 
*/ -uint16_t av1_edge_exists(const uint8_t *src, int src_stride, int w, int h, +EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h, bool high_bd, int bd) { if (w < 3 || h < 3) { - return 0; + EdgeInfo n = { .magnitude = 0, .x = 0, .y = 0 }; + return n; } uint8_t *blurred; if (high_bd) { @@ -12780,7 +14132,7 @@ uint16_t av1_edge_exists(const uint8_t *src, int src_stride, int w, int h, // want a probability of an edge existing in the buffer, which is determined // by the strongest edge in it -- we don't need to eliminate the weaker // edges. Use Sobel for the edge detection. - uint16_t prob = edge_probability(blurred, w, h, high_bd, bd); + EdgeInfo prob = edge_probability(blurred, w, h, high_bd, bd); if (high_bd) { aom_free(CONVERT_TO_SHORTPTR(blurred)); } else { diff --git a/libaom/av1/encoder/rdopt.h b/libaom/av1/encoder/rdopt.h index 5ff2df3..7ba1b18 100644 --- a/libaom/av1/encoder/rdopt.h +++ b/libaom/av1/encoder/rdopt.h @@ -123,18 +123,33 @@ void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); +void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi, + struct TileDataEnc *tile_data, + struct macroblock *x, int mi_row, int mi_col, + struct RD_STATS *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + int64_t best_rd_so_far); + void av1_rd_pick_inter_mode_sb_seg_skip( const struct AV1_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); +// The best edge strength seen in the block, as well as the best x and y +// components of edge strength seen. +typedef struct { + uint16_t magnitude; + uint16_t x; + uint16_t y; +} EdgeInfo; + /** Returns an integer indicating the strength of the edge. 
* 0 means no edge found, 556 is the strength of a solid black/white edge, * and the number may range higher if the signal is even stronger (e.g., on a * corner). high_bd is a bool indicating the source should be treated * as a 16-bit array. bd is the bit depth. */ -uint16_t av1_edge_exists(const uint8_t *src, int src_stride, int w, int h, +EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h, bool high_bd, int bd); /** Applies a Gaussian blur with sigma = 1.3. Used by av1_edge_exists and @@ -151,10 +166,8 @@ typedef struct { sobel_xy sobel(const uint8_t *input, int stride, int i, int j, bool high_bd); -#if CONFIG_COLLECT_INTER_MODE_RD_STATS void av1_inter_mode_data_init(struct TileDataEnc *tile_data); void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult); -#endif #ifdef __cplusplus } // extern "C" diff --git a/libaom/av1/encoder/reconinter_enc.c b/libaom/av1/encoder/reconinter_enc.c index 1100222..4b477ce 100644 --- a/libaom/av1/encoder/reconinter_enc.c +++ b/libaom/av1/encoder/reconinter_enc.c @@ -138,27 +138,28 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, assert(bw < 8 || bh < 8); ConvolveParams conv_params = get_conv_params_no_round( 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd); - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; struct buf_2d *const dst_buf = &pd->dst; uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x; ref = 0; - const RefBuffer *ref_buf = - &cm->current_frame - .frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME]; + const RefCntBuffer *ref_buf = + get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]); + const struct scale_factors *ref_scale_factors = + get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]); - pd->pre[ref].buf0 = (plane == 1) ? ref_buf->buf->buf.u_buffer - : ref_buf->buf->buf.v_buffer; + pd->pre[ref].buf0 = + (plane == 1) ? 
ref_buf->buf.u_buffer : ref_buf->buf.v_buffer; pd->pre[ref].buf = - pd->pre[ref].buf0 + - scaled_buffer_offset(pre_x, pre_y, ref_buf->buf->buf.uv_stride, - &ref_buf->sf); - pd->pre[ref].width = ref_buf->buf->buf.uv_crop_width; - pd->pre[ref].height = ref_buf->buf->buf.uv_crop_height; - pd->pre[ref].stride = ref_buf->buf->buf.uv_stride; + pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y, + ref_buf->buf.uv_stride, + ref_scale_factors); + pd->pre[ref].width = ref_buf->buf.uv_crop_width; + pd->pre[ref].height = ref_buf->buf.uv_crop_height; + pd->pre[ref].stride = ref_buf->buf.uv_stride; const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &ref_buf->sf; + is_intrabc ? &cm->sf_identity : ref_scale_factors; struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref]; const MV mv = this_mbmi->mv[ref].as_mv; @@ -195,15 +196,15 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, { ConvolveParams conv_params = get_conv_params_no_round( 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd); - av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset, - &conv_params.bck_offset, - &conv_params.use_jnt_comp_avg, is_compound); + av1_dist_wtd_comp_weight_assign( + cm, mi, 0, &conv_params.fwd_offset, &conv_params.bck_offset, + &conv_params.use_dist_wtd_comp_avg, is_compound); struct buf_2d *const dst_buf = &pd->dst; uint8_t *const dst = dst_buf->buf; for (ref = 0; ref < 1 + is_compound; ++ref) { const struct scale_factors *const sf = - is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf; + is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref]; struct buf_2d *const pre_buf = is_intrabc ? 
dst_buf : &pd->pre[ref]; const MV mv = mi->mv[ref].as_mv; @@ -236,46 +237,19 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd, } } -static void build_inter_predictors_for_planes(const AV1_COMMON *cm, - MACROBLOCKD *xd, BLOCK_SIZE bsize, - int mi_row, int mi_col, - int plane_from, int plane_to) { - int plane; +static void build_inter_predictors_for_plane(const AV1_COMMON *cm, + MACROBLOCKD *xd, int mi_row, + int mi_col, const BUFFER_SET *ctx, + BLOCK_SIZE bsize, int plane_idx) { + const struct macroblockd_plane *pd = &xd->plane[plane_idx]; + if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, + pd->subsampling_y)) + return; + const int mi_x = mi_col * MI_SIZE; const int mi_y = mi_row * MI_SIZE; - for (plane = plane_from; plane <= plane_to; ++plane) { - const struct macroblockd_plane *pd = &xd->plane[plane]; - const int bw = pd->width; - const int bh = pd->height; - - if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x, - pd->subsampling_y)) - continue; - - build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y); - } -} - -void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize) { - av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, 0); -} - -void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize) { - for (int plane_idx = 1; plane_idx < MAX_MB_PLANE; plane_idx++) { - av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, - plane_idx); - } -} - -void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize, int plane_idx) { - build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, plane_idx, - plane_idx); + build_inter_predictors(cm, xd, plane_idx, xd->mi[0], 0, pd->width, pd->height, + mi_x, mi_y); if 
(is_interintra_pred(xd->mi[0])) { BUFFER_SET default_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } }; @@ -290,13 +264,14 @@ void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, } } -void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize) { - const int num_planes = av1_num_planes(cm); - av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize); - if (num_planes > 1) - av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize); +void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + const BUFFER_SET *ctx, BLOCK_SIZE bsize, + int plane_from, int plane_to) { + for (int plane_idx = plane_from; plane_idx <= plane_to; ++plane_idx) { + build_inter_predictors_for_plane(cm, xd, mi_row, mi_col, ctx, bsize, + plane_idx); + } } // TODO(sarahparker): @@ -309,7 +284,7 @@ void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col, int p_row, int plane, int ref, - enum mv_precision precision, int x, int y, + mv_precision precision, int x, int y, const MACROBLOCKD *xd, int can_use_previous) { const int is_q4 = precision == MV_PRECISION_Q4; const MV mv_q4 = { is_q4 ? 
src_mv->row : src_mv->row * 2, @@ -452,7 +427,7 @@ void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE }; - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(xd)) { int len = sizeof(uint16_t); dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]); dst_buf1[1] = @@ -493,7 +468,7 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane, struct macroblockd_plane *const pd = &xd->plane[plane]; const MB_MODE_INFO *mi = xd->mi[0]; - const struct scale_factors *const sf = &xd->block_refs[ref]->sf; + const struct scale_factors *const sf = xd->block_ref_scale_factors[ref]; struct buf_2d *const pre_buf = &pd->pre[ref]; uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x; const MV mv = mi->mv[ref].as_mv; @@ -575,37 +550,41 @@ static void build_wedge_inter_predictor_from_buf( uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; mbmi->interinter_comp.seg_mask = xd->seg_mask; const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp; + const int is_hbd = is_cur_buf_hbd(xd); if (is_compound && is_masked_compound_type(comp_data->type)) { if (!plane && comp_data->type == COMPOUND_DIFFWTD) { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + if (is_hbd) { av1_build_compound_diffwtd_mask_highbd( comp_data->seg_mask, comp_data->mask_type, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd); - else + } else { av1_build_compound_diffwtd_mask( comp_data->seg_mask, comp_data->mask_type, ext_dst0, ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w); + } } - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + if (is_hbd) { build_masked_compound_highbd( dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data, 
mbmi->sb_type, h, w, xd->bd); - else + } else { build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0, ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type, h, w); + } } else { - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + if (is_hbd) { aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0, dst, dst_buf->stride, NULL, 0, NULL, 0, w, h, xd->bd); - else + } else { aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL, 0, NULL, 0, w, h); + } } } diff --git a/libaom/av1/encoder/reconinter_enc.h b/libaom/av1/encoder/reconinter_enc.h index 10d5e8c..5687168 100644 --- a/libaom/av1/encoder/reconinter_enc.h +++ b/libaom/av1/encoder/reconinter_enc.h @@ -23,21 +23,10 @@ extern "C" { #endif -void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize); - -void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize); - -void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize, int plane_idx); - -void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, BUFFER_SET *ctx, - BLOCK_SIZE bsize); +void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + const BUFFER_SET *ctx, BLOCK_SIZE bsize, + int plane_from, int plane_to); void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const MV *src_mv, @@ -46,7 +35,7 @@ void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, InterpFilters interp_filters, const WarpTypesAllowed *warp_types, int p_col, int p_row, int plane, int ref, - enum mv_precision precision, int x, int y, + mv_precision precision, int x, int y, const MACROBLOCKD *xd, int can_use_previous); // Detect if the block have sub-pixel level 
motion vectors diff --git a/libaom/av1/encoder/speed_features.c b/libaom/av1/encoder/speed_features.c index fd0368e..5dfc585 100644 --- a/libaom/av1/encoder/speed_features.c +++ b/libaom/av1/encoder/speed_features.c @@ -17,13 +17,9 @@ #include "aom_dsp/aom_dsp_common.h" -// Setting this to 1 will disable trellis optimization completely. -// Setting this to 2 will disable trellis optimization within the -// transform search. Trellis optimization will still be applied -// in the final encode. -#define DISABLE_TRELLISQ_SEARCH 0 - #define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method +// Max speed setting for tx domain evaluation +#define MAX_TX_DOMAIN_EVAL_SPEED 5 static MESH_PATTERN good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, @@ -50,6 +46,22 @@ static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = { static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100, 25, 25, 10 }; +// Threshold values to be used for pruning the txfm_domain_distortion +// based on block MSE +// TODO(any): Experiment the threshold logic based on variance metric +static unsigned int tx_domain_dist_thresholds[MAX_TX_DOMAIN_EVAL_SPEED + 1] = { + UINT_MAX, 162754, 22026, 22026, 22026, 0 +}; +// Threshold values to be used for disabling coeff RD-optimization +// based on block MSE +// TODO(any): Experiment the threshold logic based on variance metric +static unsigned int coeff_opt_dist_thresholds[5] = { UINT_MAX, 162754, 162754, + 22026, 22026 }; +// scaling values to be used for gating wedge/compound segment based on best +// approximate rd +static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 }; +static int comp_type_rd_threshold_div[3] = { 3, 16, 16 }; + // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality static int frame_is_boosted(const AV1_COMP *cpi) { @@ -62,7 +74,7 @@ static int 
frame_is_boosted(const AV1_COMP *cpi) { // partly on the screen area that over which they propogate. Propogation is // limited by transform block size but the screen area take up by a given block // size will be larger for a small image format stretched to full screen. -static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) { +static BLOCK_SIZE set_partition_min_limit(const AV1_COMMON *const cm) { unsigned int screen_area = (cm->width * cm->height); // Select block size based on image format size. @@ -78,24 +90,21 @@ static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) { } } -// Do we have an internal image edge (e.g. formatting bars). -static int has_internal_image_edge(const AV1_COMP *cpi) { - return (cpi->oxcf.pass == 2) && - ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) || - (cpi->twopass.this_frame_stats.inactive_zone_cols > 0)); -} - -static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, - SPEED_FEATURES *sf, - int speed) { - AV1_COMMON *const cm = &cpi->common; +static void set_good_speed_feature_framesize_dependent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720; const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480; if (is_480p_or_larger) { sf->use_square_partition_only_threshold = BLOCK_128X128; + if (is_720p_or_larger) + sf->auto_max_partition_based_on_simple_motion = ADAPT_PRED; + else + sf->auto_max_partition_based_on_simple_motion = RELAXED_PRED; } else { sf->use_square_partition_only_threshold = BLOCK_64X64; + sf->auto_max_partition_based_on_simple_motion = DIRECT_PRED; } // TODO(huisu@google.com): train models for 720P and above. 
@@ -107,6 +116,11 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 } + if (is_720p_or_larger && speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL_START && + speed < CONFIG_2PASS_PARTITION_SEARCH_LVL_END) { + sf->two_pass_partition_search = 1; + } + if (speed >= 1) { if (is_720p_or_larger) { sf->use_square_partition_only_threshold = BLOCK_128X128; @@ -122,18 +136,28 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32 sf->ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128 + + sf->firstpass_simple_motion_search_early_term = 1; } } if (speed >= 2) { if (is_720p_or_larger) { - sf->disable_split_mask = - cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; + sf->use_square_partition_only_threshold = BLOCK_64X64; + } else if (is_480p_or_larger) { + sf->use_square_partition_only_threshold = BLOCK_32X32; + } else { + // TODO(chiyotsai@google.com): Setting the threshold to BLOCK_16X16 incurs + // a large loss (about 0.584%). Try increasing the threshold on boosted + // frame and see if it improves the performance. 
+ sf->use_square_partition_only_threshold = BLOCK_32X32; + } + + if (is_720p_or_larger) { sf->adaptive_pred_interp_filter = 0; sf->partition_search_breakout_dist_thr = (1 << 24); sf->partition_search_breakout_rate_thr = 120; } else { - sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; sf->partition_search_breakout_dist_thr = (1 << 22); sf->partition_search_breakout_rate_thr = 100; } @@ -142,24 +166,15 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, if (speed >= 3) { if (is_720p_or_larger) { - sf->disable_split_mask = DISABLE_ALL_SPLIT; sf->partition_search_breakout_dist_thr = (1 << 25); sf->partition_search_breakout_rate_thr = 200; } else { sf->max_intra_bsize = BLOCK_32X32; - sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; sf->partition_search_breakout_dist_thr = (1 << 23); sf->partition_search_breakout_rate_thr = 120; } - } - - // If this is a two pass clip that fits the criteria for animated or - // graphics content then reset disable_split_mask for speeds 2+. - // Also if the image edge is internal to the coded area. 
- if ((speed >= 2) && (cpi->oxcf.pass == 2) && - ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) || - (has_internal_image_edge(cpi)))) { - sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; + sf->use_first_partition_pass_interintra_stats = + sf->two_pass_partition_search; } if (speed >= 4) { @@ -168,15 +183,15 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi, } else { sf->partition_search_breakout_dist_thr = (1 << 24); } - sf->disable_split_mask = DISABLE_ALL_SPLIT; } } -static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, - SPEED_FEATURES *sf, - int speed) { - AV1_COMMON *const cm = &cpi->common; +static void set_good_speed_features_framesize_independent( + const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) { + const AV1_COMMON *const cm = &cpi->common; const int boosted = frame_is_boosted(cpi); + const int is_boosted_arf2_bwd_type = + boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame; // Speed 0 for all speed features that give neutral coding performance change. 
sf->reduce_inter_modes = 1; @@ -184,16 +199,22 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->ml_prune_rect_partition = 1; sf->ml_prune_ab_partition = 1; sf->ml_prune_4_partition = 1; + sf->simple_motion_search_prune_rect = 1; sf->adaptive_txb_search_level = 1; - sf->use_jnt_comp_flag = JNT_COMP_SKIP_MV_SEARCH; + sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH; sf->model_based_prune_tx_search_level = 1; sf->model_based_post_interp_filter_breakout = 1; + sf->model_based_motion_mode_rd_breakout = 1; + + // TODO(debargha): Test, tweak and turn on either 1 or 2 sf->inter_mode_rd_model_estimation = 1; + sf->inter_mode_rd_model_estimation_adaptive = 0; + + sf->two_loop_comp_search = 0; sf->prune_ref_frame_for_rect_partitions = - !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame); - sf->prune_ref_mode_for_partitions = sf->prune_ref_frame_for_rect_partitions; + boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2); sf->less_rectangular_check_level = 1; - sf->gm_search_type = GM_REDUCED_REF_SEARCH; + sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3; sf->gm_disable_recode = 1; sf->use_fast_interpolation_filter_search = 1; sf->intra_tx_size_search_init_depth_sqr = 1; @@ -202,28 +223,250 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->prune_wedge_pred_diff_based = 1; sf->disable_wedge_search_var_thresh = 0; sf->disable_wedge_search_edge_thresh = 0; + sf->prune_motion_mode_level = 1; + sf->cb_pred_filter_search = 0; + sf->use_nonrd_pick_mode = 0; + sf->use_real_time_ref_set = 0; if (speed >= 1) { sf->gm_erroradv_type = GM_ERRORADV_TR_1; sf->selective_ref_frame = 2; + sf->intra_tx_size_search_init_depth_rect = 1; + sf->tx_size_search_lgr_block = 1; + + sf->prune_ext_partition_types_search_level = 2; + sf->skip_repeat_interpolation_filter_search = 1; + sf->tx_type_search.skip_tx_search = 1; + sf->tx_type_search.ml_tx_split_thresh = 40; + sf->model_based_prune_tx_search_level = 0; + 
sf->adaptive_txb_search_level = 2; + sf->use_intra_txb_hash = 1; + sf->optimize_b_precheck = 1; + sf->dual_sgr_penalty_level = 1; + sf->use_accurate_subpel_search = USE_4_TAPS; + sf->reuse_inter_intra_mode = 1; + sf->prune_comp_search_by_single_result = 1; + sf->skip_repeated_newmv = 1; + sf->obmc_full_pixel_search_level = 1; + // TODO(anyone): Following speed feature will be further explored to + // identify the appropriate tradeoff between encoder performance and its + // speed. + sf->prune_single_motion_modes_by_simple_trans = 1; + + sf->simple_motion_search_split_only = 1; + sf->simple_motion_search_early_term_none = 1; + + sf->disable_wedge_search_var_thresh = 0; + sf->disable_wedge_search_edge_thresh = 0; + sf->disable_interinter_wedge_newmv_search = boosted ? 0 : 1; + sf->prune_comp_type_by_comp_avg = 1; + sf->prune_motion_mode_level = 2; + sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2; + sf->cb_pred_filter_search = 1; + sf->use_transform_domain_distortion = boosted ? 0 : 1; + sf->perform_coeff_opt = boosted ? 
0 : 1; + sf->use_inter_txb_hash = 0; + } + + if (speed >= 2) { + sf->gm_erroradv_type = GM_ERRORADV_TR_2; + + sf->selective_ref_frame = 3; sf->inter_tx_size_search_init_depth_rect = 1; sf->inter_tx_size_search_init_depth_sqr = 1; + + sf->fast_cdef_search = 1; + + sf->adaptive_rd_thresh = 1; + sf->mv.auto_mv_step_size = 1; + sf->mv.subpel_iters_per_step = 1; + sf->disable_filter_search_var_thresh = 100; + sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; + + sf->partition_search_breakout_rate_thr = 80; + sf->allow_partition_search_skip = 1; + sf->disable_wedge_search_var_thresh = 100; + sf->disable_wedge_search_edge_thresh = 0; + sf->disable_interinter_wedge_newmv_search = 1; + sf->fast_wedge_sign_estimate = 1; + sf->disable_dual_filter = 1; + sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; + sf->prune_comp_type_by_comp_avg = 2; + // TODO(Sachin): Enable/Enhance this speed feature for speed 2 & 3 + sf->cb_pred_filter_search = 0; + sf->adaptive_interp_filter_search = 1; + sf->perform_coeff_opt = boosted ? 0 : 2; + } + + if (speed >= 3) { + sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL; + sf->less_rectangular_check_level = 2; + sf->adaptive_pred_interp_filter = 1; + // adaptive_motion_search breaks encoder multi-thread tests. + // The values in x->pred_mv[] differ for single and multi-thread cases. + // See aomedia:1778. + // sf->adaptive_motion_search = 1; + sf->recode_loop = ALLOW_RECODE_KFARFGF; + sf->use_transform_domain_distortion = boosted ? 1 : 2; + sf->use_accurate_subpel_search = USE_2_TAPS; + sf->adaptive_rd_thresh = 2; + if (cpi->oxcf.enable_smooth_interintra) { + sf->disable_smooth_interintra = + (boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame) + ? 0 + : 1; + } + sf->tx_type_search.prune_mode = PRUNE_2D_FAST; + sf->gm_search_type = GM_DISABLE_SEARCH; + sf->prune_comp_search_by_single_result = 2; + sf->prune_motion_mode_level = boosted ? 
2 : 3; + sf->prune_warp_using_wmtype = 1; + // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine + // it with cpi->sf.disable_wedge_search_var_thresh. + sf->disable_wedge_interintra_search = 1; + // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2 + // and clean-up the speed feature + sf->perform_best_rd_based_gating_for_chroma = 1; + sf->prune_ref_frame_for_rect_partitions = + frame_is_intra_only(&cpi->common) ? 0 : (boosted ? 1 : 2); + sf->perform_coeff_opt = is_boosted_arf2_bwd_type ? 2 : 3; + sf->prune_comp_type_by_model_rd = boosted ? 0 : 1; + // TODO(Venkat): Clean-up frame type dependency for + // simple_motion_search_split_only in partition search function and set the + // speed feature accordingly + // TODO(Venkat): Evaluate this speed feature for speed 1 & 2 + sf->simple_motion_search_split_only = + cm->allow_screen_content_tools ? 1 : 2; + sf->disable_smooth_intra = + !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key != 1); + } + + if (speed >= 4) { + sf->use_intra_txb_hash = 0; + sf->tx_type_search.fast_intra_tx_type_search = 1; + sf->disable_loop_restoration_chroma = + (boosted || cm->allow_screen_content_tools) ? 0 : 1; + sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED; + sf->adaptive_pred_interp_filter = 0; + sf->cb_pred_filter_search = 1; + sf->adaptive_mode_search = 1; + sf->alt_ref_search_fp = 1; + sf->skip_sharp_interp_filter_search = 1; + sf->perform_coeff_opt = is_boosted_arf2_bwd_type ? 2 : 4; + sf->adaptive_txb_search_level = boosted ? 
2 : 3; + } + + if (speed >= 5) { + sf->recode_loop = ALLOW_RECODE_KFMAXBW; + sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; + sf->tx_size_search_method = USE_LARGESTALL; + sf->mv.search_method = BIGDIA; + sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; + sf->adaptive_rd_thresh = 4; + sf->mode_search_skip_flags = + (cm->current_frame.frame_type == KEY_FRAME) + ? 0 + : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | + FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR | + FLAG_EARLY_TERMINATE; + sf->disable_filter_search_var_thresh = 200; + sf->use_fast_coef_costing = 1; + sf->partition_search_breakout_rate_thr = 300; + sf->use_transform_domain_distortion = 2; + } + + if (speed >= 6) { + int i; + sf->optimize_coefficients = NO_TRELLIS_OPT; + sf->mv.search_method = HEX; + sf->disable_filter_search_var_thresh = 500; + for (i = 0; i < TX_SIZES; ++i) { + sf->intra_y_mode_mask[i] = INTRA_DC; + sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL; + } + sf->partition_search_breakout_rate_thr = 500; + sf->mv.reduce_first_step_size = 1; + sf->simple_model_rd_from_var = 1; + } + if (speed >= 7) { + sf->default_max_partition_size = BLOCK_32X32; + sf->default_min_partition_size = BLOCK_8X8; + sf->intra_y_mode_mask[TX_64X64] = INTRA_DC; + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; + sf->frame_parameter_update = 0; + sf->mv.search_method = FAST_HEX; + sf->partition_search_type = REFERENCE_PARTITION; + sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH; + // TODO(any): evaluate adaptive_mode_search=1 for speed 7 & 8 + sf->adaptive_mode_search = 2; + } + if (speed >= 8) { + sf->mv.search_method = FAST_DIAMOND; + sf->mv.subpel_force_stop = HALF_PEL; + sf->lpf_pick = LPF_PICK_FROM_Q; + } +} + +// 
TODO(kyslov): now this is very similar to +// set_good_speed_features_framesize_independent +// except it sets non-rd flag on speed8. This function will likely +// be modified in the future with RT-specific speed features +static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi, + SPEED_FEATURES *sf, + int speed) { + AV1_COMMON *const cm = &cpi->common; + const int boosted = frame_is_boosted(cpi); + + // Speed 0 for all speed features that give neutral coding performance change. + sf->reduce_inter_modes = 1; + sf->prune_ext_partition_types_search_level = 1; + sf->ml_prune_rect_partition = 1; + sf->ml_prune_ab_partition = 1; + sf->ml_prune_4_partition = 1; + sf->adaptive_txb_search_level = 1; + sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH; + sf->model_based_prune_tx_search_level = 1; + sf->model_based_post_interp_filter_breakout = 1; + sf->model_based_motion_mode_rd_breakout = 1; + + // TODO(debargha): Test, tweak and turn on either 1 or 2 + sf->inter_mode_rd_model_estimation = 0; + sf->inter_mode_rd_model_estimation_adaptive = 0; + sf->two_loop_comp_search = 0; + + sf->prune_ref_frame_for_rect_partitions = !boosted; + sf->less_rectangular_check_level = 1; + sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3; + sf->gm_disable_recode = 1; + sf->use_fast_interpolation_filter_search = 1; + sf->intra_tx_size_search_init_depth_sqr = 1; + sf->intra_angle_estimation = 1; + sf->selective_ref_frame = 1; + sf->prune_wedge_pred_diff_based = 1; + sf->disable_wedge_search_var_thresh = 0; + sf->disable_wedge_search_edge_thresh = 0; + sf->prune_motion_mode_level = 1; + sf->cb_pred_filter_search = 0; + sf->use_nonrd_pick_mode = 0; + sf->use_real_time_ref_set = 0; + + if (speed >= 1) { + sf->gm_erroradv_type = GM_ERRORADV_TR_1; + sf->selective_ref_frame = 2; + sf->intra_tx_size_search_init_depth_rect = 1; sf->tx_size_search_lgr_block = 1; - if (speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL) { - sf->two_pass_partition_search = 1; - 
sf->mode_pruning_based_on_two_pass_partition_search = 1; - } sf->prune_ext_partition_types_search_level = 2; sf->skip_repeat_interpolation_filter_search = 1; sf->tx_type_search.skip_tx_search = 1; sf->tx_type_search.ml_tx_split_thresh = 40; sf->model_based_prune_tx_search_level = 0; - sf->model_based_post_interp_filter_breakout = 0; - // TODO(angiebird): Re-evaluate the impact of inter_mode_rd_model_estimation - // on speed 1 - sf->inter_mode_rd_model_estimation = 0; sf->adaptive_txb_search_level = 2; sf->use_intra_txb_hash = 1; sf->optimize_b_precheck = 1; @@ -238,15 +481,23 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, // speed. sf->prune_single_motion_modes_by_simple_trans = 1; - sf->full_pixel_motion_search_based_split = 1; + sf->simple_motion_search_prune_rect = 1; + sf->disable_wedge_search_var_thresh = 0; sf->disable_wedge_search_edge_thresh = 0; + sf->prune_comp_type_by_comp_avg = 1; + sf->prune_motion_mode_level = 2; + sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2; + sf->cb_pred_filter_search = 1; + sf->use_transform_domain_distortion = boosted ? 0 : 1; } if (speed >= 2) { sf->gm_erroradv_type = GM_ERRORADV_TR_2; sf->selective_ref_frame = 3; + sf->inter_tx_size_search_init_depth_rect = 1; + sf->inter_tx_size_search_init_depth_sqr = 1; sf->fast_cdef_search = 1; sf->adaptive_rd_thresh = 1; @@ -256,18 +507,19 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL; sf->partition_search_breakout_rate_thr = 80; - // Note: This speed feature is disable as it seems to be worse in - // compression/quality and is also slower. 
- // sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->allow_partition_search_skip = 1; sf->disable_wedge_search_var_thresh = 100; sf->disable_wedge_search_edge_thresh = 0; sf->fast_wedge_sign_estimate = 1; sf->disable_dual_filter = 1; - sf->use_jnt_comp_flag = JNT_COMP_DISABLED; + sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED; + sf->prune_comp_type_by_comp_avg = 2; + sf->cb_pred_filter_search = 0; + sf->adaptive_interp_filter_search = 1; } if (speed >= 3) { + sf->selective_ref_frame = 4; sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL; sf->less_rectangular_check_level = 2; sf->adaptive_pred_interp_filter = 1; @@ -282,22 +534,23 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->tx_type_search.prune_mode = PRUNE_2D_FAST; sf->gm_search_type = GM_DISABLE_SEARCH; sf->prune_comp_search_by_single_result = 2; + sf->prune_motion_mode_level = boosted ? 2 : 3; + sf->prune_warp_using_wmtype = 1; + // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine + // it with cpi->sf.disable_wedge_search_var_thresh. + sf->disable_wedge_interintra_search = 1; } if (speed >= 4) { sf->use_intra_txb_hash = 0; - sf->use_inter_txb_hash = 0; sf->use_mb_rd_hash = 0; sf->tx_type_search.fast_intra_tx_type_search = 1; sf->tx_type_search.fast_inter_tx_type_search = 1; - sf->use_square_partition_only_threshold = - boosted ? BLOCK_128X128 : BLOCK_4X4; sf->tx_size_search_method = frame_is_intra_only(cm) ? 
USE_FULL_RD : USE_LARGESTALL; sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 1; - sf->cb_partition_search = !boosted; sf->alt_ref_search_fp = 1; sf->skip_sharp_interp_filter_search = 1; } @@ -310,7 +563,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL; sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL; - sf->use_square_partition_only_threshold = BLOCK_4X4; sf->tx_size_search_method = USE_LARGESTALL; sf->mv.search_method = BIGDIA; sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; @@ -352,30 +604,25 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi, } if (speed >= 8) { sf->mv.search_method = FAST_DIAMOND; - sf->mv.subpel_force_stop = 2; - sf->lpf_pick = LPF_PICK_MINIMAL_LPF; + sf->lpf_pick = LPF_PICK_FROM_Q; + sf->default_max_partition_size = BLOCK_128X128; + sf->default_min_partition_size = BLOCK_8X8; + sf->partition_search_type = VAR_BASED_PARTITION; + sf->use_real_time_ref_set = 1; + // Can't use LARGEST TX mode with pre-calculated partition + // and disabled TX64 + if (!cpi->oxcf.enable_tx64) sf->tx_size_search_method = USE_FAST_RD; + sf->use_nonrd_pick_mode = 1; + sf->inter_mode_rd_model_estimation = 2; } } -void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) { +void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) { SPEED_FEATURES *const sf = &cpi->sf; const AV1EncoderConfig *const oxcf = &cpi->oxcf; - RD_OPT *const rd = &cpi->rd; - int i; if (oxcf->mode == GOOD) { - set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); - } - - if (sf->disable_split_mask == DISABLE_ALL_SPLIT) { - sf->adaptive_pred_interp_filter = 0; - } - - // Check for masked out split cases. 
- for (i = 0; i < MAX_REFS; ++i) { - if (sf->disable_split_mask & (1 << i)) { - rd->thresh_mult_sub8x8[i] = INT_MAX; - } + set_good_speed_feature_framesize_dependent(cpi, sf, speed); } // This is only used in motion vector unit test. @@ -385,7 +632,7 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) { cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv; } -void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { +void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) { AV1_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; MACROBLOCK *const x = &cpi->td.mb; @@ -398,25 +645,33 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->recode_loop = ALLOW_RECODE; sf->mv.subpel_search_method = SUBPEL_TREE; sf->mv.subpel_iters_per_step = 2; - sf->mv.subpel_force_stop = 0; -#if DISABLE_TRELLISQ_SEARCH == 2 - sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf) - ? FINAL_PASS_TRELLIS_OPT - : NO_TRELLIS_OPT; -#elif DISABLE_TRELLISQ_SEARCH == 1 - sf->optimize_coefficients = NO_TRELLIS_OPT; -#else - if (is_lossless_requested(&cpi->oxcf)) + sf->mv.subpel_force_stop = EIGHTH_PEL; + if (cpi->oxcf.disable_trellis_quant == 3) { + sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf) + ? NO_ESTIMATE_YRD_TRELLIS_OPT + : NO_TRELLIS_OPT; + } else if (cpi->oxcf.disable_trellis_quant == 2) { + sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf) + ? 
FINAL_PASS_TRELLIS_OPT + : NO_TRELLIS_OPT; + } else if (cpi->oxcf.disable_trellis_quant == 0) { + if (is_lossless_requested(&cpi->oxcf)) + sf->optimize_coefficients = NO_TRELLIS_OPT; + else + sf->optimize_coefficients = FULL_TRELLIS_OPT; + } else if (cpi->oxcf.disable_trellis_quant == 1) { sf->optimize_coefficients = NO_TRELLIS_OPT; - else - sf->optimize_coefficients = FULL_TRELLIS_OPT; -#endif // DISABLE_TRELLISQ_SEARCH + } else { + assert(0 && "Invalid disable_trellis_quant value"); + } sf->gm_erroradv_type = GM_ERRORADV_TR_0; sf->mv.reduce_first_step_size = 0; sf->mv.auto_mv_step_size = 0; sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->adaptive_rd_thresh = 0; - sf->tx_size_search_method = USE_FULL_RD; + // TODO(sarahparker) Pair this with a speed setting once experiments are done + sf->trellis_eob_fast = 0; + sf->tx_size_search_method = cpi->oxcf.tx_size_search_method; sf->inter_tx_size_search_init_depth_sqr = 0; sf->inter_tx_size_search_init_depth_rect = 0; sf->intra_tx_size_search_init_depth_rect = 0; @@ -424,12 +679,12 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->tx_size_search_lgr_block = 0; sf->model_based_prune_tx_search_level = 0; sf->model_based_post_interp_filter_breakout = 0; + sf->model_based_motion_mode_rd_breakout = 0; sf->reduce_inter_modes = 0; sf->selective_ref_gm = 1; sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; - sf->cb_partition_search = 0; sf->alt_ref_search_fp = 0; sf->partition_search_type = SEARCH_PARTITION; sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE; @@ -442,19 +697,20 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->less_rectangular_check_level = 0; sf->use_square_partition_only_threshold = BLOCK_128X128; sf->prune_ref_frame_for_rect_partitions = 0; - sf->prune_ref_mode_for_partitions = 0; - sf->auto_min_max_partition_size = NOT_IN_USE; + sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE; + 
sf->auto_min_partition_based_on_simple_motion = 0; sf->rd_auto_partition_min_limit = BLOCK_4X4; sf->default_max_partition_size = BLOCK_LARGEST; sf->default_min_partition_size = BLOCK_4X4; sf->adjust_partitioning_from_last_frame = 0; - sf->disable_split_mask = 0; sf->mode_search_skip_flags = 0; sf->disable_filter_search_var_thresh = 0; sf->allow_partition_search_skip = 0; sf->use_accurate_subpel_search = USE_8_TAPS; sf->disable_wedge_search_edge_thresh = 0; + sf->use_first_partition_pass_interintra_stats = 0; sf->disable_wedge_search_var_thresh = 0; + sf->disable_loop_restoration_chroma = 0; sf->fast_wedge_sign_estimate = 0; sf->prune_wedge_pred_diff_based = 0; sf->drop_ref = 0; @@ -462,17 +718,19 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { sf->txb_split_cap = 1; sf->adaptive_txb_search_level = 0; sf->two_pass_partition_search = 0; - sf->mode_pruning_based_on_two_pass_partition_search = 0; + sf->firstpass_simple_motion_search_early_term = 0; sf->use_intra_txb_hash = 0; sf->use_inter_txb_hash = 1; sf->use_mb_rd_hash = 1; sf->optimize_b_precheck = 0; - sf->jnt_comp_fast_tx_search = 0; - sf->use_jnt_comp_flag = JNT_COMP_ENABLED; + sf->two_loop_comp_search = 1; + sf->second_loop_comp_fast_tx_search = 0; + sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED; sf->reuse_inter_intra_mode = 0; sf->intra_angle_estimation = 0; sf->skip_obmc_in_uniform_mv_field = 0; sf->skip_wm_in_uniform_mv_field = 0; + sf->adaptive_interp_filter_search = 0; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; @@ -497,7 +755,9 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { for (i = 0; i < PARTITION_BLOCK_SIZES; ++i) { sf->ml_partition_search_breakout_thresh[i] = -1; // -1 means not enabled. 
} - sf->full_pixel_motion_search_based_split = 0; + sf->simple_motion_search_split_only = 0; + sf->simple_motion_search_prune_rect = 0; + sf->simple_motion_search_early_term_none = 0; // Set this at the appropriate speed levels sf->use_transform_domain_distortion = 0; @@ -514,12 +774,29 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { // Set decoder side speed feature to use less dual sgr modes sf->dual_sgr_penalty_level = 0; + // TODO(angiebird, debargha): Re-evaluate the impact of + // inter_mode_rd_model_estimation in conjunction with + // model_based_motion_mode_rd_breakout sf->inter_mode_rd_model_estimation = 0; + sf->inter_mode_rd_model_estimation_adaptive = 0; + sf->obmc_full_pixel_search_level = 0; sf->skip_sharp_interp_filter_search = 0; + sf->prune_comp_type_by_comp_avg = 0; + sf->disable_interinter_wedge_newmv_search = 0; + sf->disable_smooth_interintra = 0; + sf->prune_motion_mode_level = 0; + sf->prune_warp_using_wmtype = 0; + sf->disable_wedge_interintra_search = 0; + sf->perform_coeff_opt = 0; + sf->prune_comp_type_by_model_rd = 0; + sf->disable_smooth_intra = 0; + sf->perform_best_rd_based_gating_for_chroma = 0; if (oxcf->mode == GOOD) - set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed); + set_good_speed_features_framesize_independent(cpi, sf, speed); + else if (oxcf->mode == REALTIME) + set_rt_speed_features_framesize_independent(cpi, sf, speed); if (!cpi->seq_params_locked) { cpi->common.seq_params.enable_dual_filter &= !sf->disable_dual_filter; @@ -534,39 +811,44 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { cpi->diamond_search_sad = av1_diamond_search_sad; sf->allow_exhaustive_searches = 1; - int speed = (oxcf->speed > MAX_MESH_SPEED) ? 
MAX_MESH_SPEED : oxcf->speed; + + const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED); if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) sf->exhaustive_searches_thresh = (1 << 24); else sf->exhaustive_searches_thresh = (1 << 25); - sf->max_exaustive_pct = good_quality_max_mesh_pct[speed]; - if (speed > 0) + sf->max_exaustive_pct = good_quality_max_mesh_pct[mesh_speed]; + if (mesh_speed > 0) sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1; for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_speed][i].range; sf->mesh_patterns[i].interval = - good_quality_mesh_patterns[speed][i].interval; + good_quality_mesh_patterns[mesh_speed][i].interval; } if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) && (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION || cpi->oxcf.content == AOM_CONTENT_SCREEN)) { for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = intrabc_mesh_patterns[speed][i].range; - sf->mesh_patterns[i].interval = intrabc_mesh_patterns[speed][i].interval; + sf->mesh_patterns[i].range = intrabc_mesh_patterns[mesh_speed][i].range; + sf->mesh_patterns[i].interval = + intrabc_mesh_patterns[mesh_speed][i].interval; } - sf->max_exaustive_pct = intrabc_max_mesh_pct[speed]; + sf->max_exaustive_pct = intrabc_max_mesh_pct[mesh_speed]; } // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. if (oxcf->pass == 1) sf->optimize_coefficients = NO_TRELLIS_OPT; - // No recode for 1 pass. + // No recode or trellis for 1 pass. 
if (oxcf->pass == 0) { sf->recode_loop = DISALLOW_RECODE; sf->optimize_coefficients = NO_TRELLIS_OPT; } + // FIXME: trellis not very efficient for quantization matrices + if (oxcf->using_qm) sf->optimize_coefficients = NO_TRELLIS_OPT; if (sf->mv.subpel_search_method == SUBPEL_TREE) { cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree; @@ -578,12 +860,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore; } - cpi->optimize_speed_feature = - oxcf->pass != 1 ? sf->optimize_coefficients : NO_TRELLIS_OPT; - // FIXME: trellis not very efficient for quantisation matrices - if (cm->using_qmatrix) cpi->optimize_speed_feature = NO_TRELLIS_OPT; - if (oxcf->disable_trellis_quant) cpi->optimize_speed_feature = NO_TRELLIS_OPT; - x->min_partition_size = sf->default_min_partition_size; x->max_partition_size = sf->default_max_partition_size; @@ -592,6 +868,17 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv; else if (cpi->oxcf.motion_vector_unit_test == 2) cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv; + cpi->max_comp_type_rd_threshold_mul = + comp_type_rd_threshold_mul[sf->prune_comp_type_by_comp_avg]; + cpi->max_comp_type_rd_threshold_div = + comp_type_rd_threshold_div[sf->prune_comp_type_by_comp_avg]; + const int tx_domain_speed = AOMMIN(speed, MAX_TX_DOMAIN_EVAL_SPEED); + cpi->tx_domain_dist_threshold = tx_domain_dist_thresholds[tx_domain_speed]; + + // assert ensures that coeff_opt_dist_thresholds is accessed correctly + assert(cpi->sf.perform_coeff_opt >= 0 && cpi->sf.perform_coeff_opt < 5); + cpi->coeff_opt_dist_threshold = + coeff_opt_dist_thresholds[cpi->sf.perform_coeff_opt]; #if CONFIG_DIST_8X8 if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0; @@ -600,6 +887,9 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) { #endif // CONFIG_DIST_8X8 
if (cpi->oxcf.row_mt == 1 && (cpi->oxcf.max_threads > 1)) { sf->adaptive_rd_thresh = 0; - sf->inter_mode_rd_model_estimation = 0; + if (sf->inter_mode_rd_model_estimation == 1) { + sf->inter_mode_rd_model_estimation = 0; + sf->inter_mode_rd_model_estimation_adaptive = 0; + } } } diff --git a/libaom/av1/encoder/speed_features.h b/libaom/av1/encoder/speed_features.h index f71dcbf..a321192 100644 --- a/libaom/av1/encoder/speed_features.h +++ b/libaom/av1/encoder/speed_features.h @@ -73,7 +73,7 @@ enum { (1 << THR_ALTR) | (1 << THR_GOLD) }; -typedef enum { +enum { TXFM_CODING_SF = 1, INTER_PRED_SF = 2, INTRA_PRED_SF = 4, @@ -82,9 +82,9 @@ typedef enum { RD_SKIP_SF = 32, RESERVE_2_SF = 64, RESERVE_3_SF = 128, -} DEV_SPEED_FEATURES; +} UENUM1BYTE(DEV_SPEED_FEATURES); -typedef enum { +enum { DIAMOND = 0, NSTEP = 1, HEX = 2, @@ -92,9 +92,9 @@ typedef enum { SQUARE = 4, FAST_HEX = 5, FAST_DIAMOND = 6 -} SEARCH_METHODS; +} UENUM1BYTE(SEARCH_METHODS); -typedef enum { +enum { // No recode. DISALLOW_RECODE = 0, // Allow recode for KF and exceeding maximum frame bandwidth. @@ -103,28 +103,23 @@ typedef enum { ALLOW_RECODE_KFARFGF = 2, // Allow recode for all frames based on bitrate constraints. ALLOW_RECODE = 3, -} RECODE_LOOP_TYPE; +} UENUM1BYTE(RECODE_LOOP_TYPE); -typedef enum { +enum { SUBPEL_TREE = 0, SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively SUBPEL_TREE_PRUNED_EVENMORE = 3, // Prunes 1/2- and 1/4-pel searches // Other methods to come -} SUBPEL_SEARCH_METHODS; +} UENUM1BYTE(SUBPEL_SEARCH_METHODS); -typedef enum { +enum { USE_FULL_RD = 0, USE_FAST_RD, USE_LARGESTALL, -} TX_SIZE_SEARCH_METHOD; - -typedef enum { - NOT_IN_USE = 0, - RELAXED_NEIGHBORING_MIN_MAX = 1 -} AUTO_MIN_MAX_MODE; +} UENUM1BYTE(TX_SIZE_SEARCH_METHOD); -typedef enum { +enum { // Try the full image with different values. LPF_PICK_FROM_FULL_IMAGE, // Try a small portion of the image with different values. 
@@ -133,9 +128,9 @@ typedef enum { LPF_PICK_FROM_Q, // Pick 0 to disable LPF if LPF was enabled last frame LPF_PICK_MINIMAL_LPF -} LPF_PICK_METHOD; +} UENUM1BYTE(LPF_PICK_METHOD); -typedef enum { +enum { // Terminate search early based on distortion so far compared to // qp step, distortion in the neighborhood of the frame, etc. FLAG_EARLY_TERMINATE = 1 << 0, @@ -152,9 +147,9 @@ typedef enum { // Skips intra modes other than DC_PRED if the source variance is small FLAG_SKIP_INTRA_LOWVAR = 1 << 5, -} MODE_SEARCH_SKIP_LOGIC; +} UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC); -typedef enum { +enum { NO_PRUNE = 0, // eliminates one tx type in vertical and horizontal direction PRUNE_ONE = 1, @@ -165,7 +160,7 @@ typedef enum { PRUNE_2D_ACCURATE = 3, // similar, but applies much more aggressive pruning to get better speed-up PRUNE_2D_FAST = 4, -} TX_TYPE_PRUNE_MODE; +} UENUM1BYTE(TX_TYPE_PRUNE_MODE); typedef struct { TX_TYPE_PRUNE_MODE prune_mode; @@ -184,15 +179,31 @@ typedef struct { int skip_tx_search; } TX_TYPE_SEARCH; -typedef enum { +enum { // Search partitions using RD criterion SEARCH_PARTITION, // Always use a fixed size partition FIXED_PARTITION, - REFERENCE_PARTITION -} PARTITION_SEARCH_TYPE; + REFERENCE_PARTITION, + + VAR_BASED_PARTITION +} UENUM1BYTE(PARTITION_SEARCH_TYPE); + +enum { + EIGHTH_PEL, + QUARTER_PEL, + HALF_PEL, + FULL_PEL +} UENUM1BYTE(SUBPEL_FORCE_STOP); + +enum { + NOT_IN_USE, + DIRECT_PRED, + RELAXED_PRED, + ADAPT_PRED +} UENUM1BYTE(MAX_PART_PRED_MODE); typedef struct MV_SPEED_FEATURES { // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). @@ -215,8 +226,8 @@ typedef struct MV_SPEED_FEATURES { // Maximum number of steps in logarithmic subpel search before giving up. int subpel_iters_per_step; - // Control when to stop subpel search - int subpel_force_stop; + // When to stop subpel search. 
+ SUBPEL_FORCE_STOP subpel_force_stop; } MV_SPEED_FEATURES; #define MAX_MESH_STEP 4 @@ -226,35 +237,43 @@ typedef struct MESH_PATTERN { int interval; } MESH_PATTERN; -typedef enum { +enum { GM_FULL_SEARCH, - GM_REDUCED_REF_SEARCH, + GM_REDUCED_REF_SEARCH_SKIP_L2_L3, + GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2, GM_DISABLE_SEARCH -} GM_SEARCH_TYPE; +} UENUM1BYTE(GM_SEARCH_TYPE); -typedef enum { +enum { GM_ERRORADV_TR_0, GM_ERRORADV_TR_1, GM_ERRORADV_TR_2, GM_ERRORADV_TR_TYPES, -} GM_ERRORADV_TYPE; +} UENUM1BYTE(GM_ERRORADV_TYPE); -typedef enum { - NO_TRELLIS_OPT, // No trellis optimization - FULL_TRELLIS_OPT, // Trellis optimization in all stages - FINAL_PASS_TRELLIS_OPT // Trellis optimization in only the final encode pass -} TRELLIS_OPT_TYPE; +enum { + NO_TRELLIS_OPT, // No trellis optimization + FULL_TRELLIS_OPT, // Trellis optimization in all stages + FINAL_PASS_TRELLIS_OPT, // Trellis optimization in only the final encode pass + NO_ESTIMATE_YRD_TRELLIS_OPT // Disable trellis in estimate_yrd_for_sb +} UENUM1BYTE(TRELLIS_OPT_TYPE); -typedef enum { +enum { FULL_TXFM_RD, LOW_TXFM_RD, -} TXFM_RD_MODEL; +} UENUM1BYTE(TXFM_RD_MODEL); + +enum { + DIST_WTD_COMP_ENABLED, + DIST_WTD_COMP_SKIP_MV_SEARCH, + DIST_WTD_COMP_DISABLED, +} UENUM1BYTE(DIST_WTD_COMP_FLAG); typedef enum { - JNT_COMP_ENABLED, - JNT_COMP_SKIP_MV_SEARCH, - JNT_COMP_DISABLED, -} JNT_COMP_FLAG; + FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP_REGULAR, + FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH, + FLAG_SKIP_EIGHTTAP_SHARP = 1 << MULTITAP_SHARP, +} INTERP_FILTER_MASK; typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -335,11 +354,16 @@ typedef struct SPEED_FEATURES { // 1: use model based rd breakout int model_based_post_interp_filter_breakout; + // Model based breakout in motion_mode_rd + // 0: no breakout + // 1: use model based rd breakout + int model_based_motion_mode_rd_breakout; + // Used if partition_search_type = FIXED_SIZE_PARTITION BLOCK_SIZE always_this_block_size; // Drop less likely to be picked 
reference frames in the RD search. - // Has four levels for now: 0, 1, 2 and 3, where higher levels prune more + // Has five levels for now: 0, 1, 2, 3 and 4, where higher levels prune more // aggressively than lower ones. (0 means no pruning). int selective_ref_frame; @@ -351,6 +375,10 @@ typedef struct SPEED_FEATURES { // Use a ML model to prune horz and vert partitions int ml_prune_rect_partition; + // Disable/Enable interintra motion mode based on stats collected during + // first_partition_search_pass + int use_first_partition_pass_interintra_stats; + // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions. int ml_prune_ab_partition; @@ -359,12 +387,13 @@ typedef struct SPEED_FEATURES { int fast_cdef_search; - // 2-pass coding block partition search + // 2-pass coding block partition search, and also use the mode decisions made + // in the initial partition search to prune mode candidates, e.g. ref frames. int two_pass_partition_search; - // Use the mode decisions made in the initial partition search to prune mode - // candidates, e.g. ref frames. - int mode_pruning_based_on_two_pass_partition_search; + // Terminate early in firstpass of two_pass partition search for faster + // firstpass. + int firstpass_simple_motion_search_early_term; // Skip rectangular partition test when partition type none gives better // rd than partition type split. Can take values 0 - 2, 0 referring to no @@ -375,14 +404,17 @@ typedef struct SPEED_FEATURES { BLOCK_SIZE use_square_partition_only_threshold; // Prune reference frames for rectangular partitions. + // 0 implies no pruning + // 1 implies prune for extended partition + // 2 implies prune horiz, vert and extended partition int prune_ref_frame_for_rect_partitions; - // Prune ref/mode choices for partitions. 
- int prune_ref_mode_for_partitions; + // Sets min and max square partition levels for this superblock based on + // motion vector and prediction error distribution produced from 16x16 + // simple motion search + MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion; + int auto_min_partition_based_on_simple_motion; - // Sets min and max partition sizes for this superblock based on the - // same superblock in last encoded frame, and the left and above neighbor. - AUTO_MIN_MAX_MODE auto_min_max_partition_size; // Ensures the rd based auto partition search will always // go down at least to the specified level. BLOCK_SIZE rd_auto_partition_min_limit; @@ -396,11 +428,6 @@ typedef struct SPEED_FEATURES { // frame's partitioning. Only used if use_lastframe_partitioning is set. int adjust_partitioning_from_last_frame; - // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable - // it always, to allow it for only Last frame and Intra, disable it for all - // inter modes or to enable it always. - int disable_split_mask; - // TODO(jingning): combine the related motion search speed features // This allows us to use motion search at other sizes as a starting // point for this motion search and limits the search range around it. @@ -427,8 +454,6 @@ typedef struct SPEED_FEATURES { // Adaptive prediction mode search int adaptive_mode_search; - int cb_partition_search; - int alt_ref_search_fp; // Implements various heuristics to skip searching modes @@ -541,18 +566,26 @@ typedef struct SPEED_FEATURES { // Calculate RD cost before doing optimize_b, and skip if the cost is large. int optimize_b_precheck; - // Use model rd instead of transform search in jnt_comp - int jnt_comp_fast_tx_search; + // Use two-loop compound search + int two_loop_comp_search; + + // Use model rd instead of transform search in second loop of compound search + int second_loop_comp_fast_tx_search; // Decide when and how to use joint_comp. 
- JNT_COMP_FLAG use_jnt_comp_flag; + DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag; // Decoder side speed feature to add penalty for use of dual-sgr filters. // Takes values 0 - 10, 0 indicating no penalty and each additional level // adding a penalty of 1% int dual_sgr_penalty_level; - // Dynamically estimate final rd from prediction error and mode cost + // 2-pass inter mode model estimation where the preliminary pass skips + // transform search and uses a model to estimate rd, while the final pass + // computes the full transform search. Two types of models are supported: + // 0: not used + // 1: used with online dynamic rd model + // 2: used with static rd model int inter_mode_rd_model_estimation; // Skip some ref frames in compound motion search by single motion search @@ -581,24 +614,95 @@ typedef struct SPEED_FEATURES { // Prune intra mode candidates based on source block gradient stats. int intra_angle_estimation; - // Performs full pixel motion search before none_partition to decide if we - // want to split directly without trying other partition types. - int full_pixel_motion_search_based_split; - // Skip obmc or warped motion mode when neighborhood motion field is // identical int skip_obmc_in_uniform_mv_field; int skip_wm_in_uniform_mv_field; + // Enable/disable ME for interinter wedge search. 
+ int disable_interinter_wedge_newmv_search; + + // Enable/disable smooth inter-intra mode + int disable_smooth_interintra; + // skip sharp_filter evaluation based on regular and smooth filter rd for // dual_filter=0 case int skip_sharp_interp_filter_search; + + // prune wedge and compound segment approximate rd evaluation based on + // compound average rd/ref_best_rd + int prune_comp_type_by_comp_avg; + + // Prune/gate motion mode evaluation based on token based rd + // during transform search for inter blocks + // Values are 0 (not used) , 1 - 3 with progressively increasing + // aggressiveness + int prune_motion_mode_level; + + // Gate warp evaluation for motions of type IDENTITY, + // TRANSLATION and AFFINE(based on number of warp neighbors) + int prune_warp_using_wmtype; + + // Perform simple_motion_search on each possible subblock and use it to prune + // PARTITION_HORZ and PARTITION_VERT. + int simple_motion_search_prune_rect; + + // Perform simple motion search before none_partition to decide if we + // want to split directly without trying other partition types. + int simple_motion_search_split_only; + + // Use features from simple_motion_search to terminate prediction block + // partition after PARTITION_NONE + int simple_motion_search_early_term_none; + + int cb_pred_filter_search; + + // adaptive interp_filter search to allow skip of certain filter types. + int adaptive_interp_filter_search; + + // mask for skip evaluation of certain interp_filter type. + INTERP_FILTER_MASK interp_filter_search_mask; + + // Flag used to control the ref_best_rd based gating for chroma + int perform_best_rd_based_gating_for_chroma; + + // Enable/disable interintra wedge search. + int disable_wedge_interintra_search; + + // Disable loop restoration for Chroma plane + int disable_loop_restoration_chroma; + + // Flag used to control the extent of coeff R-D optimization + int perform_coeff_opt; + + // Flag used to control the speed of the eob selection in trellis. 
+ int trellis_eob_fast; + + // This flag controls the use of non-RD mode decision. + int use_nonrd_pick_mode; + + // prune wedge and compound segment approximate rd evaluation based on + // compound average modeled rd + int prune_comp_type_by_model_rd; + + // Enable/disable smooth intra modes. + int disable_smooth_intra; + + // use reduced ref set for real-time mode + int use_real_time_ref_set; + + // Perform a full TX search on some modes while using the + // inter-mode RD model for others. Only enabled when + // inter_mode_rd_model_estimation != 0 + int inter_mode_rd_model_estimation_adaptive; } SPEED_FEATURES; struct AV1_COMP; -void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi); -void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi); +void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi, + int speed); +void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi, + int speed); #ifdef __cplusplus } // extern "C" diff --git a/libaom/av1/encoder/temporal_filter.c b/libaom/av1/encoder/temporal_filter.c index ace585e..ba883d7 100644 --- a/libaom/av1/encoder/temporal_filter.c +++ b/libaom/av1/encoder/temporal_filter.c @@ -37,13 +37,22 @@ #define EDGE_THRESHOLD 50 #define SQRT_PI_BY_2 1.25331413732 +static unsigned int index_mult[14] = { + 0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124 +}; + +static int64_t highbd_index_mult[14] = { 0U, 0U, 0U, + 0U, 3221225472U, 2576980378U, + 2147483648U, 1840700270U, 1610612736U, + 1431655766U, 1288490189U, 1171354718U, + 0U, 991146300U }; + static void temporal_filter_predictors_mb_c( MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col, uint8_t *pred, struct scale_factors *scale, int x, int y, - int can_use_previous, int num_planes) { - const MV mv = { mv_row, mv_col }; - enum mv_precision mv_precision_uv; + int can_use_previous, int 
num_planes, MV *blk_mvs, int use_32x32) { + mv_precision mv_precision_uv; int uv_stride; // TODO(angiebird): change plane setting accordingly ConvolveParams conv_params = get_conv_params(0, 0, xd->bd); @@ -52,33 +61,146 @@ static void temporal_filter_predictors_mb_c( WarpTypesAllowed warp_types; memset(&warp_types, 0, sizeof(WarpTypesAllowed)); - if (uv_block_width == 8) { + const int ssx = (uv_block_width == (BW >> 1)) ? 1 : 0; + if (ssx) { uv_stride = (stride + 1) >> 1; mv_precision_uv = MV_PRECISION_Q4; } else { uv_stride = stride; mv_precision_uv = MV_PRECISION_Q3; } - av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, - &conv_params, interp_filters, &warp_types, x, y, 0, - 0, MV_PRECISION_Q3, x, y, xd, can_use_previous); + if (use_32x32) { + assert(mv_row >= INT16_MIN && mv_row <= INT16_MAX && mv_col >= INT16_MIN && + mv_col <= INT16_MAX); + const MV mv = { (int16_t)mv_row, (int16_t)mv_col }; + + av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], BW, &mv, scale, BW, + BH, &conv_params, interp_filters, &warp_types, x, + y, 0, 0, MV_PRECISION_Q3, x, y, xd, + can_use_previous); + if (num_planes > 1) { + av1_build_inter_predictor( + u_mb_ptr, uv_stride, &pred[BLK_PELS], uv_block_width, &mv, scale, + uv_block_width, uv_block_height, &conv_params, interp_filters, + &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous); + av1_build_inter_predictor( + v_mb_ptr, uv_stride, &pred[(BLK_PELS << 1)], uv_block_width, &mv, + scale, uv_block_width, uv_block_height, &conv_params, interp_filters, + &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous); + } + + return; + } + + // While use_32x32 = 0, construct the 32x32 predictor using 4 16x16 + // predictors. 
+ int i, j, k = 0, ys = (BH >> 1), xs = (BW >> 1); + // Y predictor + for (i = 0; i < BH; i += ys) { + for (j = 0; j < BW; j += xs) { + const MV mv = blk_mvs[k]; + const int y_offset = i * stride + j; + const int p_offset = i * BW + j; + + av1_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset], + BW, &mv, scale, xs, ys, &conv_params, + interp_filters, &warp_types, x, y, 0, 0, + MV_PRECISION_Q3, x, y, xd, can_use_previous); + k++; + } + } + + // U and V predictors if (num_planes > 1) { - av1_build_inter_predictor( - u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale, - uv_block_width, uv_block_height, &conv_params, interp_filters, - &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous); - - av1_build_inter_predictor( - v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale, - uv_block_width, uv_block_height, &conv_params, interp_filters, - &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous); + ys = (uv_block_height >> 1); + xs = (uv_block_width >> 1); + k = 0; + + for (i = 0; i < uv_block_height; i += ys) { + for (j = 0; j < uv_block_width; j += xs) { + const MV mv = blk_mvs[k]; + const int uv_offset = i * uv_stride + j; + const int p_offset = i * uv_block_width + j; + + av1_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride, + &pred[BLK_PELS + p_offset], uv_block_width, + &mv, scale, xs, ys, &conv_params, + interp_filters, &warp_types, x, y, 1, 0, + mv_precision_uv, x, y, xd, can_use_previous); + av1_build_inter_predictor( + v_mb_ptr + uv_offset, uv_stride, &pred[(BLK_PELS << 1) + p_offset], + uv_block_width, &mv, scale, xs, ys, &conv_params, interp_filters, + &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, + can_use_previous); + k++; + } + } } } -static INLINE int64_t mod_index(int64_t sum_dist, int index, int rounding, - int strength, int filter_weight) { - int64_t mod = (sum_dist * 3) / index; +static void apply_temporal_filter_self(const uint8_t *pred, int buf_stride, + unsigned int 
block_width, + unsigned int block_height, + int filter_weight, uint32_t *accumulator, + uint16_t *count) { + const int modifier = filter_weight * 16; + unsigned int i, j, k = 0; + assert(filter_weight == 2); + + for (i = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int pixel_value = pred[i * buf_stride + j]; + count[k] += modifier; + accumulator[k] += modifier * pixel_value; + ++k; + } + } +} + +static void highbd_apply_temporal_filter_self( + const uint8_t *pred8, int buf_stride, unsigned int block_width, + unsigned int block_height, int filter_weight, uint32_t *accumulator, + uint16_t *count) { + const int modifier = filter_weight * 16; + const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + unsigned int i, j, k = 0; + assert(filter_weight == 2); + + for (i = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int pixel_value = pred[i * buf_stride + j]; + count[k] += modifier; + accumulator[k] += modifier * pixel_value; + ++k; + } + } +} + +static INLINE int mod_index(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + assert(index >= 0 && index <= 13); + assert(index_mult[index] != 0); + + int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16; + mod += rounding; + mod >>= strength; + + mod = AOMMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +static INLINE int highbd_mod_index(int64_t sum_dist, int index, int rounding, + int strength, int filter_weight) { + assert(index >= 0 && index <= 13); + assert(highbd_index_mult[index] != 0); + + int mod = + (int)((AOMMIN(sum_dist, INT32_MAX) * highbd_index_mult[index]) >> 32); mod += rounding; mod >>= strength; @@ -106,12 +228,35 @@ static INLINE void calculate_squared_errors(const uint8_t *s, int s_stride, } } -static void apply_temporal_filter( +static INLINE int get_filter_weight(unsigned int i, unsigned int j, + unsigned int block_height, + unsigned int block_width, const int *blk_fw, + int 
use_32x32) { + if (use_32x32) + // blk_fw[0] ~ blk_fw[3] are the same. + return blk_fw[0]; + + int filter_weight = 0; + if (i < block_height / 2) { + if (j < block_width / 2) + filter_weight = blk_fw[0]; + else + filter_weight = blk_fw[1]; + } else { + if (j < block_width / 2) + filter_weight = blk_fw[2]; + else + filter_weight = blk_fw[3]; + } + return filter_weight; +} + +void av1_apply_temporal_filter_c( const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred, int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1, int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred, int uv_buf_stride, unsigned int block_width, unsigned int block_height, - int ss_x, int ss_y, int strength, int filter_weight, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) { unsigned int i, j, k, m; @@ -119,20 +264,17 @@ static void apply_temporal_filter( const int rounding = (1 << strength) >> 1; const unsigned int uv_block_width = block_width >> ss_x; const unsigned int uv_block_height = block_height >> ss_y; - DECLARE_ALIGNED(16, uint16_t, y_diff_sse[256]); - DECLARE_ALIGNED(16, uint16_t, u_diff_sse[256]); - DECLARE_ALIGNED(16, uint16_t, v_diff_sse[256]); + DECLARE_ALIGNED(16, uint16_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, v_diff_sse[BLK_PELS]); int idx = 0, idy; - assert(filter_weight >= 0); - assert(filter_weight <= 2); - - memset(y_diff_sse, 0, 256 * sizeof(uint16_t)); - memset(u_diff_sse, 0, 256 * sizeof(uint16_t)); - memset(v_diff_sse, 0, 256 * sizeof(uint16_t)); + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); - // Calculate diff^2 for each pixel of the 16x16 block. + // Calculate diff^2 for each pixel of the block. 
// TODO(yunqing): the following code needs to be optimized. calculate_squared_errors(y_frame1, y_stride, y_pred, y_buf_stride, y_diff_sse, block_width, block_height); @@ -144,6 +286,8 @@ static void apply_temporal_filter( for (i = 0, k = 0, m = 0; i < block_height; i++) { for (j = 0; j < block_width; j++) { const int pixel_value = y_pred[i * y_buf_stride + j]; + int filter_weight = + get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); // non-local mean approach int y_index = 0; @@ -249,22 +393,22 @@ static INLINE void highbd_calculate_squared_errors( } } -static void highbd_apply_temporal_filter( +void av1_highbd_apply_temporal_filter_c( const uint8_t *yf, int y_stride, const uint8_t *yp, int y_buf_stride, const uint8_t *uf, const uint8_t *vf, int uv_stride, const uint8_t *up, const uint8_t *vp, int uv_buf_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, - int filter_weight, uint32_t *y_accumulator, uint16_t *y_count, - uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, - uint16_t *v_count) { + const int *blk_fw, int use_32x32, uint32_t *y_accumulator, + uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, + uint32_t *v_accumulator, uint16_t *v_count) { unsigned int i, j, k, m; int64_t modifier; const int rounding = (1 << strength) >> 1; const unsigned int uv_block_width = block_width >> ss_x; const unsigned int uv_block_height = block_height >> ss_y; - DECLARE_ALIGNED(16, uint32_t, y_diff_sse[256]); - DECLARE_ALIGNED(16, uint32_t, u_diff_sse[256]); - DECLARE_ALIGNED(16, uint32_t, v_diff_sse[256]); + DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]); const uint16_t *y_frame1 = CONVERT_TO_SHORTPTR(yf); const uint16_t *u_frame1 = CONVERT_TO_SHORTPTR(uf); @@ -274,14 +418,11 @@ static void highbd_apply_temporal_filter( const uint16_t *v_pred = CONVERT_TO_SHORTPTR(vp); 
int idx = 0, idy; - assert(filter_weight >= 0); - assert(filter_weight <= 2); - - memset(y_diff_sse, 0, 256 * sizeof(uint32_t)); - memset(u_diff_sse, 0, 256 * sizeof(uint32_t)); - memset(v_diff_sse, 0, 256 * sizeof(uint32_t)); + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); - // Calculate diff^2 for each pixel of the 16x16 block. + // Calculate diff^2 for each pixel of the block. // TODO(yunqing): the following code needs to be optimized. highbd_calculate_squared_errors(y_frame1, y_stride, y_pred, y_buf_stride, y_diff_sse, block_width, block_height); @@ -293,6 +434,8 @@ static void highbd_apply_temporal_filter( for (i = 0, k = 0, m = 0; i < block_height; i++) { for (j = 0; j < block_width; j++) { const int pixel_value = y_pred[i * y_buf_stride + j]; + int filter_weight = + get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); // non-local mean approach int y_index = 0; @@ -321,11 +464,11 @@ static void highbd_apply_temporal_filter( y_index += 2; - modifier = - mod_index(modifier, y_index, rounding, strength, filter_weight); + const int final_y_mod = highbd_mod_index(modifier, y_index, rounding, + strength, filter_weight); - y_count[k] += modifier; - y_accumulator[k] += modifier * pixel_value; + y_count[k] += final_y_mod; + y_accumulator[k] += final_y_mod * pixel_value; ++k; @@ -367,13 +510,15 @@ static void highbd_apply_temporal_filter( u_mod += y_diff; v_mod += y_diff; - u_mod = mod_index(u_mod, cr_index, rounding, strength, filter_weight); - v_mod = mod_index(v_mod, cr_index, rounding, strength, filter_weight); + const int final_u_mod = highbd_mod_index(u_mod, cr_index, rounding, + strength, filter_weight); + const int final_v_mod = highbd_mod_index(v_mod, cr_index, rounding, + strength, filter_weight); - u_count[m] += u_mod; - u_accumulator[m] += u_mod * u_pixel_value; - v_count[m] += v_mod; - v_accumulator[m] += v_mod * 
v_pixel_value; + u_count[m] += final_u_mod; + u_accumulator[m] += final_u_mod * u_pixel_value; + v_count[m] += final_v_mod; + v_accumulator[m] += final_v_mod * v_pixel_value; ++m; } // Complete YUV pixel @@ -385,8 +530,8 @@ static void highbd_apply_temporal_filter( void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, - uint16_t *count) { + const int *blk_fw, int use_32x32, + unsigned int *accumulator, uint16_t *count) { unsigned int i, j, k; int modifier; int byte = 0; @@ -395,6 +540,8 @@ void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, for (i = 0, k = 0; i < block_height; i++) { for (j = 0; j < block_width; j++, k++) { int pixel_value = *frame2; + int filter_weight = + get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); // non-local mean approach int diff_sse[9] = { 0 }; @@ -447,7 +594,7 @@ void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, void av1_highbd_temporal_filter_apply_c( uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8, unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, unsigned int *accumulator, uint16_t *count) { + int *blk_fw, int use_32x32, unsigned int *accumulator, uint16_t *count) { uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); unsigned int i, j, k; @@ -458,6 +605,8 @@ void av1_highbd_temporal_filter_apply_c( for (i = 0, k = 0; i < block_height; i++) { for (j = 0; j < block_width; j++, k++) { int pixel_value = *frame2; + int filter_weight = + get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); // non-local mean approach int diff_sse[9] = { 0 }; @@ -509,8 +658,8 @@ void av1_highbd_temporal_filter_apply_c( static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, uint8_t *arf_frame_buf, uint8_t *frame_ptr_buf, - int 
stride, int x_pos, - int y_pos) { + int stride, int x_pos, int y_pos, + MV *blk_mvs, int *blk_bestsme) { MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; @@ -543,9 +692,12 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - av1_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param, - NSTEP, 1, sadpb, cond_cost_list(cpi, cost_list), - &best_ref_mv1, 0, 0, x_pos, y_pos, 0); + // av1_full_pixel_search() parameters: best_ref_mv1_full is the start mv, and + // best_ref_mv1 is for mv rate calculation. The search result is stored in + // x->best_mv. + av1_full_pixel_search(cpi, x, TF_BLOCK, &best_ref_mv1_full, step_param, NSTEP, + 1, sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1, + 0, 0, x_pos, y_pos, 0, &cpi->ss_cfg[SS_CFG_LOOKAHEAD]); x->mv_limits = tmp_mv_limits; // Ignore mv costing by sending NULL pointer instead of cost array @@ -559,19 +711,64 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, x->best_mv.as_mv.row *= 8; x->best_mv.as_mv.col *= 8; - bestsme = cpi->fn_ptr[BLOCK_16X16].vf(y + offset, y_stride, src_address, - src_stride, &sse); - } else { - bestsme = cpi->find_fractional_mv_step( - x, &cpi->common, 0, 0, &best_ref_mv1, - cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, - NULL, 0, 0, 16, 16, USE_8_TAPS, 1); + bestsme = cpi->fn_ptr[TF_BLOCK].vf(y + offset, y_stride, src_address, + src_stride, &sse); + + x->e_mbd.mi[0]->mv[0] = x->best_mv; + + // Restore input state + x->plane[0].src = src; + xd->plane[0].pre[0] = pre; + + return bestsme; } + // find_fractional_mv_step parameters: best_ref_mv1 is for mv rate cost + // calculation. The start full mv and the search result are stored in + // x->best_mv. 
mi_row and mi_col are only needed for "av1_is_scaled(sf)=1" + // case. + bestsme = cpi->find_fractional_mv_step( + x, &cpi->common, 0, 0, &best_ref_mv1, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[TF_BLOCK], 0, mv_sf->subpel_iters_per_step, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL, + 0, 0, BW, BH, USE_8_TAPS, 1); + x->e_mbd.mi[0]->mv[0] = x->best_mv; + // DO motion search on 4 16x16 sub_blocks. + int i, j, k = 0; + best_ref_mv1.row = x->e_mbd.mi[0]->mv[0].as_mv.row; + best_ref_mv1.col = x->e_mbd.mi[0]->mv[0].as_mv.col; + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + for (i = 0; i < BH; i += SUB_BH) { + for (j = 0; j < BW; j += SUB_BW) { + // Setup frame pointers + x->plane[0].src.buf = arf_frame_buf + i * stride + j; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = frame_ptr_buf + i * stride + j; + xd->plane[0].pre[0].stride = stride; + + av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + av1_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full, + step_param, NSTEP, 1, sadpb, + cond_cost_list(cpi, cost_list), &best_ref_mv1, 0, 0, + x_pos, y_pos, 0, &cpi->ss_cfg[SS_CFG_LOOKAHEAD]); + x->mv_limits = tmp_mv_limits; + + blk_bestsme[k] = cpi->find_fractional_mv_step( + x, &cpi->common, 0, 0, &best_ref_mv1, + cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[TF_SUB_BLOCK], 0, mv_sf->subpel_iters_per_step, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, + NULL, 0, 0, SUB_BW, SUB_BH, USE_8_TAPS, 1); + + blk_mvs[k] = x->best_mv.as_mv; + k++; + } + } + // Restore input state x->plane[0].src = src; xd->plane[0].pre[0] = pre; @@ -582,39 +779,42 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi, static void temporal_filter_iterate_c(AV1_COMP *cpi, YV12_BUFFER_CONFIG **frames, int frame_count, int alt_ref_index, - int strength, RefBuffer *ref_buf) { + int strength, + struct scale_factors 
*ref_scale_factors) { const AV1_COMMON *cm = &cpi->common; const int num_planes = av1_num_planes(cm); int byte; int frame; int mb_col, mb_row; - unsigned int filter_weight; - int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4; - int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4; + int mb_cols = (frames[alt_ref_index]->y_crop_width + BW - 1) >> BW_LOG2; + int mb_rows = (frames[alt_ref_index]->y_crop_height + BH - 1) >> BH_LOG2; int mb_y_offset = 0; + int mb_y_src_offset = 0; int mb_uv_offset = 0; - DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]); - DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]); + int mb_uv_src_offset = 0; + DECLARE_ALIGNED(16, unsigned int, accumulator[BLK_PELS * 3]); + DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]); MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; uint8_t *dst1, *dst2; - DECLARE_ALIGNED(32, uint16_t, predictor16[16 * 16 * 3]); - DECLARE_ALIGNED(32, uint8_t, predictor8[16 * 16 * 3]); + DECLARE_ALIGNED(32, uint16_t, predictor16[BLK_PELS * 3]); + DECLARE_ALIGNED(32, uint8_t, predictor8[BLK_PELS * 3]); uint8_t *predictor; - const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; - const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; + const int mb_uv_height = BH >> mbd->plane[1].subsampling_y; + const int mb_uv_width = BW >> mbd->plane[1].subsampling_x; // Save input state uint8_t *input_buffer[MAX_MB_PLANE]; int i; - if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + const int is_hbd = is_cur_buf_hbd(mbd); + if (is_hbd) { predictor = CONVERT_TO_BYTEPTR(predictor16); } else { predictor = predictor8; } - mbd->block_refs[0] = ref_buf; - mbd->block_refs[1] = ref_buf; + mbd->block_ref_scale_factors[0] = ref_scale_factors; + mbd->block_ref_scale_factors[1] = ref_scale_factors; for (i = 0; i < num_planes; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; @@ -631,108 +831,173 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, // To keep the mv in 
play for both Y and UV planes the max that it // can be on a border is therefore 16 - (2*AOM_INTERP_EXTEND+1). cpi->td.mb.mv_limits.row_min = - -((mb_row * 16) + (17 - 2 * AOM_INTERP_EXTEND)); + -((mb_row * BH) + (17 - 2 * AOM_INTERP_EXTEND)); cpi->td.mb.mv_limits.row_max = - ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * AOM_INTERP_EXTEND); + ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * AOM_INTERP_EXTEND); for (mb_col = 0; mb_col < mb_cols; mb_col++) { int j, k; int stride; - memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0])); - memset(count, 0, 16 * 16 * 3 * sizeof(count[0])); + memset(accumulator, 0, BLK_PELS * 3 * sizeof(accumulator[0])); + memset(count, 0, BLK_PELS * 3 * sizeof(count[0])); cpi->td.mb.mv_limits.col_min = - -((mb_col * 16) + (17 - 2 * AOM_INTERP_EXTEND)); + -((mb_col * BW) + (17 - 2 * AOM_INTERP_EXTEND)); cpi->td.mb.mv_limits.col_max = - ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * AOM_INTERP_EXTEND); + ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * AOM_INTERP_EXTEND); for (frame = 0; frame < frame_count; frame++) { - const int thresh_low = 10000; - const int thresh_high = 20000; + // MVs for 4 16x16 sub blocks. + MV blk_mvs[4]; + // Filter weights for 4 16x16 sub blocks. 
+ int blk_fw[4] = { 0, 0, 0, 0 }; + int use_32x32 = 0; if (frames[frame] == NULL) continue; mbd->mi[0]->mv[0].as_mv.row = 0; mbd->mi[0]->mv[0].as_mv.col = 0; mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION; + blk_mvs[0] = kZeroMv; + blk_mvs[1] = kZeroMv; + blk_mvs[2] = kZeroMv; + blk_mvs[3] = kZeroMv; if (frame == alt_ref_index) { - filter_weight = 2; + blk_fw[0] = blk_fw[1] = blk_fw[2] = blk_fw[3] = 2; + use_32x32 = 1; } else { + int thresh_low = 10000; + int thresh_high = 20000; + int blk_bestsme[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + // Find best match in this frame by MC int err = temporal_filter_find_matching_mb_c( - cpi, frames[alt_ref_index]->y_buffer + mb_y_offset, - frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride, - mb_col * 16, mb_row * 16); - - // Assign higher weight to matching MB if it's error - // score is lower. If not applying MC default behavior - // is to weight all MBs equal. - filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0; + cpi, frames[alt_ref_index]->y_buffer + mb_y_src_offset, + frames[frame]->y_buffer + mb_y_src_offset, + frames[frame]->y_stride, mb_col * BW, mb_row * BH, blk_mvs, + blk_bestsme); + + int err16 = + blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3]; + int max_err = INT_MIN, min_err = INT_MAX; + for (k = 0; k < 4; k++) { + if (min_err > blk_bestsme[k]) min_err = blk_bestsme[k]; + if (max_err < blk_bestsme[k]) max_err = blk_bestsme[k]; + } + + if (((err * 15 < (err16 << 4)) && max_err - min_err < 12000) || + ((err * 14 < (err16 << 4)) && max_err - min_err < 6000)) { + use_32x32 = 1; + // Assign higher weight to matching MB if it's error + // score is lower. If not applying MC default behavior + // is to weight all MBs equal. + blk_fw[0] = err < (thresh_low << THR_SHIFT) + ? 2 + : err < (thresh_high << THR_SHIFT) ? 1 : 0; + blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0]; + } else { + use_32x32 = 0; + for (k = 0; k < 4; k++) + blk_fw[k] = blk_bestsme[k] < thresh_low + ? 
2 + : blk_bestsme[k] < thresh_high ? 1 : 0; + } } - if (filter_weight != 0) { + if (blk_fw[0] || blk_fw[1] || blk_fw[2] || blk_fw[3]) { // Construct the predictors temporal_filter_predictors_mb_c( - mbd, frames[frame]->y_buffer + mb_y_offset, - frames[frame]->u_buffer + mb_uv_offset, - frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, - mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row, - mbd->mi[0]->mv[0].as_mv.col, predictor, &ref_buf->sf, mb_col * 16, - mb_row * 16, cm->allow_warped_motion, num_planes); + mbd, frames[frame]->y_buffer + mb_y_src_offset, + frames[frame]->u_buffer + mb_uv_src_offset, + frames[frame]->v_buffer + mb_uv_src_offset, + frames[frame]->y_stride, mb_uv_width, mb_uv_height, + mbd->mi[0]->mv[0].as_mv.row, mbd->mi[0]->mv[0].as_mv.col, + predictor, ref_scale_factors, mb_col * BW, mb_row * BH, + cm->allow_warped_motion, num_planes, blk_mvs, use_32x32); // Apply the filter (YUV) - if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - int adj_strength = strength + 2 * (mbd->bd - 8); - - if (num_planes <= 1) { - // Single plane case - av1_highbd_temporal_filter_apply_c( - f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, - adj_strength, filter_weight, accumulator, count); - } else { - // Process 3 planes together. - highbd_apply_temporal_filter( - f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, - f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, - f->uv_stride, predictor + 256, predictor + 512, mb_uv_width, - 16, 16, mbd->plane[1].subsampling_x, - mbd->plane[1].subsampling_y, adj_strength, filter_weight, - accumulator, count, accumulator + 256, count + 256, - accumulator + 512, count + 512); + if (frame == alt_ref_index) { + uint8_t *pred = predictor; + uint32_t *accum = accumulator; + uint16_t *cnt = count; + int plane; + + // All 4 blk_fws are equal to 2. + for (plane = 0; plane < num_planes; ++plane) { + const int pred_stride = plane ? mb_uv_width : BW; + const unsigned int w = plane ? 
mb_uv_width : BW; + const unsigned int h = plane ? mb_uv_height : BH; + + if (is_hbd) { + highbd_apply_temporal_filter_self(pred, pred_stride, w, h, + blk_fw[0], accum, cnt); + } else { + apply_temporal_filter_self(pred, pred_stride, w, h, blk_fw[0], + accum, cnt); + } + + pred += BLK_PELS; + accum += BLK_PELS; + cnt += BLK_PELS; } } else { - if (num_planes <= 1) { - // Single plane case - av1_temporal_filter_apply_c( - f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, - strength, filter_weight, accumulator, count); + if (is_hbd) { + const int adj_strength = strength + 2 * (mbd->bd - 8); + + if (num_planes <= 1) { + // Single plane case + av1_highbd_temporal_filter_apply_c( + f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW, + BH, adj_strength, blk_fw, use_32x32, accumulator, count); + } else { + // Process 3 planes together. + av1_highbd_apply_temporal_filter( + f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_src_offset, + f->v_buffer + mb_uv_src_offset, f->uv_stride, + predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, adj_strength, blk_fw, + use_32x32, accumulator, count, accumulator + BLK_PELS, + count + BLK_PELS, accumulator + (BLK_PELS << 1), + count + (BLK_PELS << 1)); + } } else { - // Process 3 planes together. - apply_temporal_filter( - f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, - f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, - f->uv_stride, predictor + 256, predictor + 512, mb_uv_width, - 16, 16, mbd->plane[1].subsampling_x, - mbd->plane[1].subsampling_y, strength, filter_weight, - accumulator, count, accumulator + 256, count + 256, - accumulator + 512, count + 512); + if (num_planes <= 1) { + // Single plane case + av1_temporal_filter_apply_c( + f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW, + BH, strength, blk_fw, use_32x32, accumulator, count); + } else { + // Process 3 planes together. 
+ av1_apply_temporal_filter( + f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_src_offset, + f->v_buffer + mb_uv_src_offset, f->uv_stride, + predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32, + accumulator, count, accumulator + BLK_PELS, + count + BLK_PELS, accumulator + (BLK_PELS << 1), + count + (BLK_PELS << 1)); + } } } } } // Normalize filter output to produce AltRef frame - if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_hbd) { uint16_t *dst1_16; uint16_t *dst2_16; dst1 = cpi->alt_ref_buffer.y_buffer; dst1_16 = CONVERT_TO_SHORTPTR(dst1); stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { dst1_16[byte] = (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); @@ -740,7 +1005,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, byte++; } - byte += stride - 16; + byte += stride - BW; } if (num_planes > 1) { dst1 = cpi->alt_ref_buffer.u_buffer; @@ -749,9 +1014,9 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, dst2_16 = CONVERT_TO_SHORTPTR(dst2); stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; + int m = k + BLK_PELS; // U dst1_16[byte] = (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); @@ -768,24 +1033,24 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, dst1 = cpi->alt_ref_buffer.y_buffer; stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { dst1[byte] = (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), 
count[k]); // move to next pixel byte++; } - byte += stride - 16; + byte += stride - BW; } if (num_planes > 1) { dst1 = cpi->alt_ref_buffer.u_buffer; dst2 = cpi->alt_ref_buffer.v_buffer; stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; + int m = k + BLK_PELS; // U dst1[byte] = (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]); @@ -799,11 +1064,16 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi, } } } - mb_y_offset += 16; + mb_y_offset += BW; + mb_y_src_offset += BW; mb_uv_offset += mb_uv_width; + mb_uv_src_offset += mb_uv_width; } - mb_y_offset += 16 * (f->y_stride - mb_cols); - mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols; + mb_y_offset += BH * cpi->alt_ref_buffer.y_stride - BW * mb_cols; + mb_y_src_offset += BH * f->y_stride - BW * mb_cols; + mb_uv_src_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols; + mb_uv_offset += + mb_uv_height * cpi->alt_ref_buffer.uv_stride - mb_uv_width * mb_cols; } // Restore input state @@ -920,7 +1190,7 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost, MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; struct lookahead_entry *buf = av1_lookahead_peek(cpi->lookahead, distance); double noiselevel; - if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + if (is_cur_buf_hbd(mbd)) { noiselevel = highbd_estimate_noise( buf->img.y_buffer, buf->img.y_crop_width, buf->img.y_crop_height, buf->img.y_stride, mbd->bd, EDGE_THRESHOLD); @@ -974,8 +1244,7 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) { int strength; int frames_to_blur_backward; int frames_to_blur_forward; - RefBuffer ref_buf; - ref_buf.buf = NULL; + struct scale_factors sf; YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; const GF_GROUP *const gf_group = &cpi->twopass.gf_group; @@ -984,9 +1253,8 @@ void 
av1_temporal_filter(AV1_COMP *cpi, int distance) { // Apply context specific adjustments to the arnr filter parameters. if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) { // TODO(weitinglin): Currently, we enforce the filtering strength on - // extra ARFs' to be zeros. We should investigate in which - // case it is more beneficial to use non-zero strength - // filtering. + // internal ARFs to be zeros. We should investigate in which case it is more + // beneficial to use non-zero strength filtering. strength = 0; frames_to_blur = 1; } else { @@ -1020,7 +1288,7 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) { // supported. // ARF is produced at the native frame size and resized when coded. av1_setup_scale_factors_for_frame( - &ref_buf.sf, frames[0]->y_crop_width, frames[0]->y_crop_height, + &sf, frames[0]->y_crop_width, frames[0]->y_crop_height, frames[0]->y_crop_width, frames[0]->y_crop_height); } @@ -1031,5 +1299,5 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) { av1_initialize_cost_tables(&cpi->common, &cpi->td.mb); temporal_filter_iterate_c(cpi, frames, frames_to_blur, - frames_to_blur_backward, strength, &ref_buf); + frames_to_blur_backward, strength, &sf); } diff --git a/libaom/av1/encoder/temporal_filter.h b/libaom/av1/encoder/temporal_filter.h index 1ff1162..bb26c36 100644 --- a/libaom/av1/encoder/temporal_filter.h +++ b/libaom/av1/encoder/temporal_filter.h @@ -18,6 +18,18 @@ extern "C" { #define ARNR_FILT_QINDEX 128 +// Block size used in temporal filtering +#define TF_BLOCK BLOCK_32X32 +#define BH 32 +#define BH_LOG2 5 +#define BW 32 +#define BW_LOG2 5 +#define BLK_PELS 1024 // Pixels in the block +#define THR_SHIFT 2 +#define TF_SUB_BLOCK BLOCK_16X16 +#define SUB_BH 16 +#define SUB_BW 16 + void av1_temporal_filter(AV1_COMP *cpi, int distance); #ifdef __cplusplus diff --git a/libaom/av1/encoder/tokenize.h b/libaom/av1/encoder/tokenize.h index 63b505f..c80af7b 100644 --- a/libaom/av1/encoder/tokenize.h +++ 
b/libaom/av1/encoder/tokenize.h @@ -38,11 +38,11 @@ struct tokenize_b_args { uint8_t allow_update_cdf; }; -typedef enum { +enum { OUTPUT_ENABLED = 0, DRY_RUN_NORMAL, DRY_RUN_COSTCOEFFS, -} RUN_TYPE; +} UENUM1BYTE(RUN_TYPE); // Note in all the tokenize functions rate if non NULL is incremented // with the coefficient token cost only if dry_run = DRY_RUN_COSTCOEFS, diff --git a/libaom/av1/encoder/tpl_model.c b/libaom/av1/encoder/tpl_model.c new file mode 100644 index 0000000..79afb6d --- /dev/null +++ b/libaom/av1/encoder/tpl_model.c @@ -0,0 +1,592 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdint.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom/aom_codec.h" + +#include "av1/common/onyxc_int.h" +#include "av1/common/reconintra.h" + +#include "av1/encoder/encoder.h" +#include "av1/encoder/reconinter_enc.h" + +typedef struct GF_PICTURE { + YV12_BUFFER_CONFIG *frame; + int ref_frame[7]; +} GF_PICTURE; + +static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, int64_t *recon_error, + int64_t *sse) { + const struct macroblock_plane *const p = &x->plane[plane]; + const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size]; + uint16_t eob; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 
0 : 2; + + av1_quantize_fp_32x32(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX, + p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, dqcoeff, + p->dequant_QTX, &eob, scan_order->scan, + scan_order->iscan); + + *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + *recon_error = AOMMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = AOMMAX(*sse, 1); +} + +static void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + switch (tx_size) { + case TX_8X8: aom_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: aom_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: aom_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} + +static uint32_t motion_compensated_prediction(AV1_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, + int stride, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + AV1_COMMON *cm = &cpi->common; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = NSTEP; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + int distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + av1_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, + search_method, 0, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, INT_MAX, 0, (MI_SIZE * 
mi_col), + (MI_SIZE * mi_row), 0, &cpi->ss_cfg[SS_CFG_SRC]); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + const int pw = block_size_wide[bsize]; + const int ph = block_size_high[bsize]; + bestsme = cpi->find_fractional_mv_step( + x, cm, mi_row, mi_col, &best_ref_mv1, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[bsize], 0, mv_sf->subpel_iters_per_step, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL, + 0, 0, pw, ph, 1, 1); + + return bestsme; +} + +static void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + struct scale_factors *sf, GF_PICTURE *gf_picture, + int frame_idx, int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, + int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, + YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, + TplDepStats *tpl_stats) { + AV1_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + const int pix_num = bw * bh; + int best_rf_idx = -1; + int_mv best_mv; + int64_t best_inter_cost = INT64_MAX; + int64_t inter_cost; + int rf_idx; + const InterpFilters kernel = + av1_make_interp_filters(EIGHTTAP_REGULAR, EIGHTTAP_REGULAR); + + int64_t best_intra_cost = INT64_MAX; + int64_t intra_cost; + PREDICTION_MODE mode; + int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + MB_MODE_INFO mi_above, mi_left; + + memset(tpl_stats, 0, sizeof(*tpl_stats)); + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; + xd->above_mbmi = (mi_row > 0) ? &mi_above : NULL; + xd->left_mbmi = (mi_col > 0) ? 
&mi_left : NULL; + + // Intra prediction search + for (mode = DC_PRED; mode <= PAETH_PRED; ++mode) { + uint8_t *src, *dst; + int src_stride, dst_stride; + + src = xd->cur_buf->y_buffer + mb_y_offset; + src_stride = xd->cur_buf->y_stride; + + dst = &predictor[0]; + dst_stride = bw; + + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + av1_predict_intra_block( + cm, xd, block_size_wide[bsize], block_size_high[bsize], tx_size, mode, + 0, 0, FILTER_INTRA_MODES, src, src_stride, dst, dst_stride, 0, 0, 0); + + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride, xd->bd); + } else { + aom_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride); + } + + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + + intra_cost = aom_satd(coeff, pix_num); + + if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; + } + + // Motion compensated prediction + best_mv.as_int = 0; + + (void)mb_y_offset; + // Motion estimation column boundary + x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND)); + x->mv_limits.col_max = + ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND); + + for (rf_idx = 0; rf_idx < 7; ++rf_idx) { + if (ref_frame[rf_idx] == NULL) continue; + + motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, bsize, mi_row, mi_col); + + // TODO(jingning): Not yet support high bit-depth in the next three + // steps. 
+ ConvolveParams conv_params = get_conv_params(0, 0, xd->bd); + WarpTypesAllowed warp_types; + memset(&warp_types, 0, sizeof(WarpTypesAllowed)); + + av1_build_inter_predictor( + ref_frame[rf_idx]->y_buffer + mb_y_offset, ref_frame[rf_idx]->y_stride, + &predictor[0], bw, &x->best_mv.as_mv, sf, bw, bh, &conv_params, kernel, + &warp_types, mi_col * MI_SIZE, mi_row * MI_SIZE, 0, 0, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE, xd, 0); + if (is_cur_buf_hbd(xd)) { + aom_highbd_subtract_block( + bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); + } else { + aom_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + } + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + + inter_cost = aom_satd(coeff, pix_num); + if (inter_cost < best_inter_cost) { + int64_t recon_error, sse; + + best_rf_idx = rf_idx; + best_inter_cost = inter_cost; + best_mv.as_int = x->best_mv.as_int; + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &recon_error, + &sse); + } + } + best_intra_cost = AOMMAX(best_intra_cost, 1); + best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost); + tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2; + tpl_stats->mc_dep_cost = tpl_stats->intra_cost + tpl_stats->mc_flow; + + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + tpl_stats->mv.as_int = best_mv.as_int; +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << mi_size_wide_log2[bsize]; + int bh = 4 << mi_size_high_log2[bsize]; + + switch 
(block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; + TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; + MV mv = tpl_stats->mv.as_mv; + int mv_row = mv.row >> 3; + int mv_col = mv.col >> 3; + + int ref_pos_row = mi_row * MI_SIZE + mv_row; + int ref_pos_col = mi_col * MI_SIZE + mv_col; + + const int bw = 4 << mi_size_wide_log2[bsize]; + const int bh = 4 << mi_size_high_log2[bsize]; + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + + int64_t mc_flow = tpl_stats->mc_dep_cost - + (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / + 
tpl_stats->intra_cost; + + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *des_stats = + &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + + (ref_mi_col + idx)]; + + des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; + assert(overlap_area >= 0); + } + } + } + } +} + +static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + int idx, idy; + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = + &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; + tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, + BLOCK_4X4); + } + } +} + +static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, int stride, + const TplDepStats *src_stats) { + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + int idx, idy; + + int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width); + int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width); + + TplDepStats *tpl_ptr; + + intra_cost = AOMMAX(1, intra_cost); + inter_cost = AOMMAX(1, inter_cost); + + for (idy = 0; idy < mi_height; ++idy) { + tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col]; + for (idx = 0; idx < mi_width; ++idx) { + tpl_ptr->intra_cost = intra_cost; + tpl_ptr->inter_cost = inter_cost; + tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; + tpl_ptr->ref_frame_index = src_stats->ref_frame_index; + tpl_ptr->mv.as_int = src_stats->mv.as_int; + ++tpl_ptr; + } + } +} + +static void mc_flow_dispenser(AV1_COMP *cpi, GF_PICTURE *gf_picture, + int frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; + YV12_BUFFER_CONFIG 
*ref_frame[7] = { + NULL, NULL, NULL, NULL, NULL, NULL, NULL + }; + + AV1_COMMON *cm = &cpi->common; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + + DECLARE_ALIGNED(32, uint16_t, predictor16[32 * 32 * 3]); + DECLARE_ALIGNED(32, uint8_t, predictor8[32 * 32 * 3]); + uint8_t *predictor; + DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); + + const BLOCK_SIZE bsize = BLOCK_32X32; + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = mi_size_high[bsize]; + const int mi_width = mi_size_wide[bsize]; + + // Setup scaling factor + av1_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); + + if (is_cur_buf_hbd(xd)) + predictor = CONVERT_TO_BYTEPTR(predictor16); + else + predictor = predictor8; + + // Prepare reference frame pointers. If any reference frame slot is + // unavailable, the pointer will be set to Null. + for (idx = 0; idx < 7; ++idx) { + int rf_idx = gf_picture[frame_idx].ref_frame[idx]; + if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + xd->cur_buf = this_frame; + + // Get rd multiplier set up. 
+ rdmult = (int)av1_compute_rd_mult(cpi, tpl_frame->base_qindex); + if (rdmult < 1) rdmult = 1; + set_error_per_bit(x, rdmult); + av1_initialize_me_consts(cpi, x, tpl_frame->base_qindex); + + tpl_frame->is_valid = 1; + + cm->base_qindex = tpl_frame->base_qindex; + av1_frame_init_quantizer(cpi); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + // Motion estimation row boundary + x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND)); + x->mv_limits.row_max = + (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * AOM_INTERP_EXTEND); + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + TplDepStats tpl_stats; + mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, src_diff, coeff, + qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size, + ref_frame, predictor, &tpl_stats); + + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, + tpl_frame->stride, &tpl_stats); + + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, + bsize); + } + } +} + +static void init_gop_frames(AV1_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames, + const EncodeFrameInput *const frame_input) { + AV1_COMMON *cm = &cpi->common; + const SequenceHeader *const seq_params = &cm->seq_params; + int frame_idx = 0; + int i; + int gld_index = -1; + int alt_index = -1; + int lst_index = -1; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int recon_frame_index[INTER_REFS_PER_FRAME + 1] = { -1, -1, -1, -1, + -1, -1, -1, -1 }; + + // TODO(jingning): To be used later for gf frame type parsing. 
+ (void)gf_group; + + for (i = 0; i < FRAME_BUFFERS && frame_idx < INTER_REFS_PER_FRAME + 1; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, &frame_bufs[i]); + if (aom_realloc_frame_buffer( + &frame_bufs[i].buf, cm->width, cm->height, + seq_params->subsampling_x, seq_params->subsampling_y, + seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, + cm->byte_alignment, NULL, NULL, NULL)) + aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + } + } + + for (i = 0; i < INTER_REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + // Initialize Golden reference frame. + gf_picture[0].frame = NULL; + RefCntBuffer *ref_buf = get_ref_frame_buf(cm, GOLDEN_FRAME); + if (ref_buf) gf_picture[0].frame = &ref_buf->buf; + for (i = 0; i < 7; ++i) gf_picture[0].ref_frame[i] = -1; + gld_index = 0; + ++*tpl_group_frames; + + // Initialize ARF frame + gf_picture[1].frame = frame_input->source; + gf_picture[1].ref_frame[0] = gld_index; + gf_picture[1].ref_frame[1] = lst_index; + gf_picture[1].ref_frame[2] = alt_index; + // TODO(yuec) Need o figure out full AV1 reference model + for (i = 3; i < 7; ++i) gf_picture[1].ref_frame[i] = -1; + alt_index = 1; + ++*tpl_group_frames; + + // Initialize P frames + for (frame_idx = 2; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) { + struct lookahead_entry *buf = + av1_lookahead_peek(cpi->lookahead, frame_idx - 2); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1; + + ++*tpl_group_frames; + lst_index = frame_idx; + + if (frame_idx == cpi->rc.baseline_gf_interval + 1) break; + } + + gld_index = 
frame_idx; + lst_index = AOMMAX(0, frame_idx - 1); + alt_index = -1; + ++frame_idx; + + // Extend two frames outside the current gf group. + for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { + struct lookahead_entry *buf = + av1_lookahead_peek(cpi->lookahead, frame_idx - 2); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + } +} + +static void init_tpl_stats(AV1_COMP *cpi) { + int frame_idx; + for (frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + memset(tpl_frame->tpl_stats_ptr, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +void av1_tpl_setup_stats(AV1_COMP *cpi, + const EncodeFrameInput *const frame_input) { + GF_PICTURE gf_picture[MAX_LAG_BUFFERS]; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int tpl_group_frames = 0; + int frame_idx; + + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames, frame_input); + + init_tpl_stats(cpi); + + // Backward propagation from tpl_group_frames to 1. + for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) + mc_flow_dispenser(cpi, gf_picture, frame_idx); +} diff --git a/libaom/av1/encoder/tpl_model.h b/libaom/av1/encoder/tpl_model.h new file mode 100644 index 0000000..f6b33b0 --- /dev/null +++ b/libaom/av1/encoder/tpl_model.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_TPL_MODEL_H_ +#define AOM_AV1_ENCODER_TPL_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_tpl_setup_stats(AV1_COMP *cpi, + const EncodeFrameInput *const frame_input); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_TPL_MODEL_H_ diff --git a/libaom/av1/encoder/var_based_part.c b/libaom/av1/encoder/var_based_part.c new file mode 100644 index 0000000..3cead91 --- /dev/null +++ b/libaom/av1/encoder/var_based_part.c @@ -0,0 +1,778 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include <limits.h> +#include <math.h> +#include <stdbool.h> +#include <stdio.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/binary_codes_writer.h" +#include "aom_ports/mem.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/system_state.h" + +#include "av1/common/reconinter.h" +#include "av1/common/blockd.h" + +#include "av1/encoder/encodeframe.h" +#include "av1/encoder/var_based_part.h" +#include "av1/encoder/reconinter_enc.h" + +extern const uint8_t AV1_VAR_OFFS[]; + +typedef struct { + // TODO(kyslov): consider changing to 64bit + + // This struct is used for computing variance in choose_partitioning(), where + // the max number of samples within a superblock is 32x32 (with 4x4 avg). + // With 8bit bitdepth, uint32_t is enough for sum_square_error (2^8 * 2^8 * 32 + // * 32 = 2^26). For high bitdepth we need to consider changing this to 64 bit + uint32_t sum_square_error; + int32_t sum_error; + int log2_count; + int variance; +} var; + +typedef struct { + var none; + var horz[2]; + var vert[2]; +} partition_variance; + +typedef struct { + partition_variance part_variances; + var split[4]; +} v4x4; + +typedef struct { + partition_variance part_variances; + v4x4 split[4]; +} v8x8; + +typedef struct { + partition_variance part_variances; + v8x8 split[4]; +} v16x16; + +typedef struct { + partition_variance part_variances; + v16x16 split[4]; +} v32x32; + +typedef struct { + partition_variance part_variances; + v32x32 split[4]; +} v64x64; + +typedef struct { + partition_variance part_variances; + v64x64 split[4]; +} v128x128; + +typedef struct { + partition_variance *part_variances; + var *split[4]; +} variance_node; + +static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { + int i; + node->part_variances = NULL; + switch (bsize) { + case BLOCK_128X128: { + v128x128 *vt = (v128x128 *)data; + node->part_variances = 
&vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + case BLOCK_64X64: { + v64x64 *vt = (v64x64 *)data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + case BLOCK_32X32: { + v32x32 *vt = (v32x32 *)data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + case BLOCK_16X16: { + v16x16 *vt = (v16x16 *)data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + case BLOCK_8X8: { + v8x8 *vt = (v8x8 *)data; + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].part_variances.none; + break; + } + default: { + v4x4 *vt = (v4x4 *)data; + assert(bsize == BLOCK_4X4); + node->part_variances = &vt->part_variances; + for (i = 0; i < 4; i++) node->split[i] = &vt->split[i]; + break; + } + } +} + +// Set variance values given sum square error, sum error, count. 
+static void fill_variance(uint32_t s2, int32_t s, int c, var *v) { + v->sum_square_error = s2; + v->sum_error = s; + v->log2_count = c; +} + +static void get_variance(var *v) { + v->variance = + (int)(256 * (v->sum_square_error - + (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> + v->log2_count)) >> + v->log2_count); +} + +static void sum_2_variances(const var *a, const var *b, var *r) { + assert(a->log2_count == b->log2_count); + fill_variance(a->sum_square_error + b->sum_square_error, + a->sum_error + b->sum_error, a->log2_count + 1, r); +} + +static void fill_variance_tree(void *data, BLOCK_SIZE bsize) { + variance_node node; + memset(&node, 0, sizeof(node)); + tree_to_node(data, bsize, &node); + sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]); + sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]); + sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]); + sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]); + sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1], + &node.part_variances->none); +} + +static void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x, + MACROBLOCKD *const xd, int mi_row, int mi_col, + BLOCK_SIZE bsize) { + if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) { + set_mode_info_offsets(cpi, x, xd, mi_row, mi_col); + xd->mi[0]->sb_type = bsize; + } +} + +static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x, + MACROBLOCKD *const xd, + const TileInfo *const tile, void *data, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int64_t threshold, BLOCK_SIZE bsize_min, + int force_split) { + AV1_COMMON *const cm = &cpi->common; + variance_node vt; + const int block_width = mi_size_wide[bsize]; + const int block_height = mi_size_high[bsize]; + + assert(block_height == block_width); + tree_to_node(data, bsize, &vt); + + if (force_split == 1) return 0; + + if (mi_col + block_width > 
tile->mi_col_end || + mi_row + block_height > tile->mi_row_end) + return 0; + + // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if + // variance is below threshold, otherwise split will be selected. + // No check for vert/horiz split as too few samples for variance. + if (bsize == bsize_min) { + // Variance already computed to set the force_split. + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + if (mi_col + block_width / 2 < cm->mi_cols && + mi_row + block_height / 2 < cm->mi_rows && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + return 1; + } + return 0; + } else if (bsize > bsize_min) { + // Variance already computed to set the force_split. + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); + // For key frame: take split for bsize above 32X32 or very high variance. + if (frame_is_intra_only(cm) && + (bsize > BLOCK_32X32 || + vt.part_variances->none.variance > (threshold << 4))) { + return 0; + } + // If variance is low, take the bsize (no split). + if (mi_col + block_width / 2 < cm->mi_cols && + mi_row + block_height / 2 < cm->mi_rows && + vt.part_variances->none.variance < threshold) { + set_block_size(cpi, x, xd, mi_row, mi_col, bsize); + return 1; + } + + // Check vertical split. + if (mi_row + block_height / 2 < cm->mi_rows) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT); + get_variance(&vt.part_variances->vert[0]); + get_variance(&vt.part_variances->vert[1]); + if (vt.part_variances->vert[0].variance < threshold && + vt.part_variances->vert[1].variance < threshold && + get_plane_block_size(subsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y) < BLOCK_INVALID) { + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize); + return 1; + } + } + // Check horizontal split. 
+ if (mi_col + block_width / 2 < cm->mi_cols) { + BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ); + get_variance(&vt.part_variances->horz[0]); + get_variance(&vt.part_variances->horz[1]); + if (vt.part_variances->horz[0].variance < threshold && + vt.part_variances->horz[1].variance < threshold && + get_plane_block_size(subsize, xd->plane[1].subsampling_x, + xd->plane[1].subsampling_y) < BLOCK_INVALID) { + set_block_size(cpi, x, xd, mi_row, mi_col, subsize); + set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize); + return 1; + } + } + + return 0; + } + return 0; +} + +static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d, + int dp, int x16_idx, int y16_idx, v16x16 *vst, + int pixels_wide, int pixels_high, + int is_key_frame) { + int k; + for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + unsigned int sse = 0; + int sum = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { + int s_avg; + int d_avg = 128; + s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp); + if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp); + + sum = s_avg - d_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, &vst->split[k].part_variances.none); + } +} + +static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d, + int dp, int x16_idx, int y16_idx, int pixels_wide, + int pixels_high) { + int k; + int minmax_max = 0; + int minmax_min = 255; + // Loop over the 4 8x8 subblocks. 
+ for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + int min = 0; + int max = 0; + if (x8_idx < pixels_wide && y8_idx < pixels_high) { + aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp, + &min, &max); + if ((max - min) > minmax_max) minmax_max = (max - min); + if ((max - min) < minmax_min) minmax_min = (max - min); + } + } + return (minmax_max - minmax_min); +} + +static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d, + int dp, int x8_idx, int y8_idx, v8x8 *vst, + int pixels_wide, int pixels_high, + int is_key_frame) { + int k; + for (k = 0; k < 4; k++) { + int x4_idx = x8_idx + ((k & 1) << 2); + int y4_idx = y8_idx + ((k >> 1) << 2); + unsigned int sse = 0; + int sum = 0; + if (x4_idx < pixels_wide && y4_idx < pixels_high) { + int s_avg; + int d_avg = 128; + s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp); + if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp); + sum = s_avg - d_avg; + sse = sum * sum; + } + fill_variance(sse, sum, 0, &vst->split[k].part_variances.none); + } +} + +static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, + int width, int height, + int content_state) { + if (speed >= 8) { + if (width <= 640 && height <= 480) + return (5 * threshold_base) >> 2; + else if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) + return (5 * threshold_base) >> 2; + } else if (speed == 7) { + if ((content_state == kLowSadLowSumdiff) || + (content_state == kHighSadLowSumdiff) || + (content_state == kLowVarHighSumdiff)) { + return (5 * threshold_base) >> 2; + } + } + return threshold_base; +} + +// Set the variance split thresholds for following the block sizes: +// 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32, +// 3 - vbp_threshold_16x16. 
4 - vbp_threshold_8x8 (to split to 4x4 partition) is +// currently only used on key frame. +static void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], int q, + int content_state) { + AV1_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + const int threshold_multiplier = is_key_frame ? 40 : 1; + int64_t threshold_base = + (int64_t)(threshold_multiplier * cpi->dequants.y_dequant_QTX[q][1]); + + if (is_key_frame) { + thresholds[0] = threshold_base; + thresholds[1] = threshold_base; + thresholds[2] = threshold_base >> 2; + thresholds[3] = threshold_base >> 2; + thresholds[4] = threshold_base << 2; + } else { + // Increase base variance threshold based on content_state/sum_diff level. + threshold_base = scale_part_thresh_sumdiff( + threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state); + + thresholds[1] = threshold_base; + thresholds[3] = threshold_base << cpi->oxcf.speed; + if (cm->width >= 1280 && cm->height >= 720) + thresholds[3] = thresholds[3] << 1; + if (cm->width <= 352 && cm->height <= 288) { + thresholds[1] = threshold_base >> 3; + thresholds[2] = threshold_base >> 1; + thresholds[3] = threshold_base << 3; + } else if (cm->width < 1280 && cm->height < 720) { + thresholds[2] = (5 * threshold_base) >> 2; + } else if (cm->width < 1920 && cm->height < 1080) { + thresholds[2] = threshold_base << 1; + thresholds[3] <<= 2; + } else { + thresholds[2] = (5 * threshold_base) >> 1; + } + } +} + +void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q, + int content_state) { + AV1_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + const int is_key_frame = frame_is_intra_only(cm); + if (sf->partition_search_type != VAR_BASED_PARTITION) { + return; + } else { + set_vbp_thresholds(cpi, cpi->vbp_thresholds, q, content_state); + // The thresholds below are not changed locally. 
+ if (is_key_frame) { + cpi->vbp_threshold_sad = 0; + cpi->vbp_threshold_copy = 0; + cpi->vbp_bsize_min = BLOCK_8X8; + } else { + if (cm->width <= 352 && cm->height <= 288) + cpi->vbp_threshold_sad = 10; + else + cpi->vbp_threshold_sad = (cpi->dequants.y_dequant_QTX[q][1] << 1) > 1000 + ? (cpi->dequants.y_dequant_QTX[q][1] << 1) + : 1000; + cpi->vbp_bsize_min = BLOCK_16X16; + if (cm->width <= 352 && cm->height <= 288) + cpi->vbp_threshold_copy = 4000; + else if (cm->width <= 640 && cm->height <= 360) + cpi->vbp_threshold_copy = 8000; + else + cpi->vbp_threshold_copy = + (cpi->dequants.y_dequant_QTX[q][1] << 3) > 8000 + ? (cpi->dequants.y_dequant_QTX[q][1] << 3) + : 8000; + } + cpi->vbp_threshold_minmax = 15 + (q >> 3); + } +} + +// This function chooses partitioning based on the variance between source and +// reconstructed last, where variance is computed for down-sampled inputs. +// TODO(kyslov): lot of things. Bring back noise estimation, brush up partition +// selection and most of all - retune the thresholds +int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, + MACROBLOCK *x, int mi_row, int mi_col) { + AV1_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + + int i, j, k, m; + v128x128 *vt; + v16x16 *vt2 = NULL; + unsigned char force_split[85]; + int avg_32x32; + int max_var_32x32 = 0; + int min_var_32x32 = INT_MAX; + int var_32x32; + int var_64x64; + int min_var_64x64 = INT_MAX; + int max_var_64x64 = 0; + int avg_16x16[4]; + int maxvar_16x16[4]; + int minvar_16x16[4]; + int64_t threshold_4x4avg; + int content_state = 0; + uint8_t *s; + const uint8_t *d; + int sp; + int dp; + int compute_minmax_variance = 1; + int is_key_frame = frame_is_intra_only(cm); + int pixels_wide = 128, pixels_high = 128; + assert(cm->seq_params.sb_size == BLOCK_64X64 || + cm->seq_params.sb_size == BLOCK_128X128); + const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64); + const int num_64x64_blocks = is_small_sb ? 
1 : 4; + + CHECK_MEM_ERROR(cm, vt, aom_calloc(1, sizeof(*vt))); + + int64_t thresholds[5] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], + cpi->vbp_thresholds[2], cpi->vbp_thresholds[3], + cpi->vbp_thresholds[4] }; + + const int low_res = (cm->width <= 352 && cm->height <= 288); + int variance4x4downsample[64]; + int segment_id; + const int num_planes = av1_num_planes(cm); + + segment_id = xd->mi[0]->segment_id; + + set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state); + + if (is_small_sb) { + pixels_wide = 64; + pixels_high = 64; + } + + // For non keyframes, disable 4x4 average for low resolution when speed = 8 + threshold_4x4avg = INT64_MAX; + + if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); + if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); + + s = x->plane[0].src.buf; + sp = x->plane[0].src.stride; + + // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks, + // 5-20 for the 16x16 blocks. + force_split[0] = 0; + + if (!is_key_frame) { + // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it + // is!! 
+ MB_MODE_INFO *mi = xd->mi[0]; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME); + + assert(yv12 != NULL); + + av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + get_ref_scale_factors(cm, LAST_FRAME), num_planes); + mi->ref_frame[0] = LAST_FRAME; + mi->ref_frame[1] = NONE_FRAME; + mi->sb_type = cm->seq_params.sb_size; + mi->mv[0].as_int = 0; + mi->interp_filters = av1_make_interp_filters(BILINEAR, BILINEAR); + if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) { + const MV dummy_mv = { 0, 0 }; + av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size, mi_row, + mi_col, &dummy_mv); + } + +// TODO(kyslov): bring the small SAD functionality back +#if 0 + y_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); +#endif + x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, + cm->seq_params.sb_size, AOM_PLANE_Y, + AOM_PLANE_Y); + + d = xd->plane[0].dst.buf; + dp = xd->plane[0].dst.stride; + + // If the y_sad is very small, take 64x64 as partition and exit. + // Don't check on boosted segment for now, as 64x64 is suppressed there. +#if 0 + if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) + { const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64]; const + int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64]; if (mi_col + + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows) + { set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_128X128); + x->variance_low[0] = 1; + return 0; + } + } +#endif + } else { + d = AV1_VAR_OFFS; + dp = 0; + } + + if (low_res && threshold_4x4avg < INT64_MAX) + CHECK_MEM_ERROR(cm, vt2, aom_calloc(64, sizeof(*vt2))); + // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances + // for splits. 
+ for (m = 0; m < num_64x64_blocks; m++) { + const int x64_idx = ((m & 1) << 6); + const int y64_idx = ((m >> 1) << 6); + const int m2 = m << 2; + force_split[m + 1] = 0; + for (i = 0; i < 4; i++) { + const int x32_idx = x64_idx + ((i & 1) << 5); + const int y32_idx = y64_idx + ((i >> 1) << 5); + const int i2 = (m2 + i) << 2; + force_split[5 + m2 + i] = 0; + avg_16x16[i] = 0; + maxvar_16x16[i] = 0; + minvar_16x16[i] = INT_MAX; + for (j = 0; j < 4; j++) { + const int x16_idx = x32_idx + ((j & 1) << 4); + const int y16_idx = y32_idx + ((j >> 1) << 4); + const int split_index = 21 + i2 + j; + v16x16 *vst = &vt->split[m].split[i].split[j]; + force_split[split_index] = 0; + variance4x4downsample[i2 + j] = 0; + if (!is_key_frame) { + fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst, pixels_wide, + pixels_high, is_key_frame); + fill_variance_tree(&vt->split[m].split[i].split[j], BLOCK_16X16); + get_variance(&vt->split[m].split[i].split[j].part_variances.none); + avg_16x16[i] += + vt->split[m].split[i].split[j].part_variances.none.variance; + if (vt->split[m].split[i].split[j].part_variances.none.variance < + minvar_16x16[i]) + minvar_16x16[i] = + vt->split[m].split[i].split[j].part_variances.none.variance; + if (vt->split[m].split[i].split[j].part_variances.none.variance > + maxvar_16x16[i]) + maxvar_16x16[i] = + vt->split[m].split[i].split[j].part_variances.none.variance; + if (vt->split[m].split[i].split[j].part_variances.none.variance > + thresholds[3]) { + // 16X16 variance is above threshold for split, so force split to + // 8x8 for this 16x16 block (this also forces splits for upper + // levels). 
+ force_split[split_index] = 1; + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } else if (compute_minmax_variance && + vt->split[m] + .split[i] + .split[j] + .part_variances.none.variance > thresholds[2] && + !cyclic_refresh_segment_id_boosted(segment_id)) { + // We have some nominal amount of 16x16 variance (based on average), + // compute the minmax over the 8x8 sub-blocks, and if above + // threshold, force split to 8x8 block for this 16x16 block. + int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx, + pixels_wide, pixels_high); + int thresh_minmax = (int)cpi->vbp_threshold_minmax; + if (minmax > thresh_minmax) { + force_split[split_index] = 1; + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } + } + } + if (is_key_frame) { + force_split[split_index] = 0; + // Go down to 4x4 down-sampling for variance. + variance4x4downsample[i2 + j] = 1; + for (k = 0; k < 4; k++) { + int x8_idx = x16_idx + ((k & 1) << 3); + int y8_idx = y16_idx + ((k >> 1) << 3); + v8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k]; + fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2, + pixels_wide, pixels_high, is_key_frame); + } + } + } + } + } + + // Fill the rest of the variance tree by summing split partition values. + for (m = 0; m < num_64x64_blocks; ++m) { + avg_32x32 = 0; + const int m2 = m << 2; + for (i = 0; i < 4; i++) { + const int i2 = (m2 + i) << 2; + for (j = 0; j < 4; j++) { + const int split_index = 21 + i2 + j; + if (variance4x4downsample[i2 + j] == 1) { + v16x16 *vtemp = + (!is_key_frame) ? &vt2[i2 + j] : &vt->split[m].split[i].split[j]; + for (k = 0; k < 4; k++) + fill_variance_tree(&vtemp->split[k], BLOCK_8X8); + fill_variance_tree(vtemp, BLOCK_16X16); + // If variance of this 16x16 block is above the threshold, force block + // to split. This also forces a split on the upper levels. 
+ get_variance(&vtemp->part_variances.none); + if (vtemp->part_variances.none.variance > thresholds[3]) { + force_split[split_index] = 1; + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } + } + } + fill_variance_tree(&vt->split[m].split[i], BLOCK_32X32); + // If variance of this 32x32 block is above the threshold, or if its above + // (some threshold of) the average variance over the sub-16x16 blocks, + // then force this block to split. This also forces a split on the upper + // (64x64) level. + if (!force_split[5 + m2 + i]) { + get_variance(&vt->split[m].split[i].part_variances.none); + var_32x32 = vt->split[m].split[i].part_variances.none.variance; + max_var_32x32 = AOMMAX(var_32x32, max_var_32x32); + min_var_32x32 = AOMMIN(var_32x32, min_var_32x32); + if (vt->split[m].split[i].part_variances.none.variance > + thresholds[2] || + (!is_key_frame && + vt->split[m].split[i].part_variances.none.variance > + (thresholds[2] >> 1) && + vt->split[m].split[i].part_variances.none.variance > + (avg_16x16[i] >> 1))) { + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } else if (!is_key_frame && cm->height <= 360 && + (maxvar_16x16[i] - minvar_16x16[i]) > (thresholds[2] >> 1) && + maxvar_16x16[i] > thresholds[2]) { + force_split[5 + m2 + i] = 1; + force_split[m + 1] = 1; + force_split[0] = 1; + } + avg_32x32 += var_32x32; + } + } + if (!force_split[1 + m]) { + fill_variance_tree(&vt->split[m], BLOCK_64X64); + get_variance(&vt->split[m].part_variances.none); + var_64x64 = vt->split[m].part_variances.none.variance; + max_var_64x64 = AOMMAX(var_64x64, max_var_64x64); + min_var_64x64 = AOMMIN(var_64x64, min_var_64x64); + // If variance of this 64x64 block is above (some threshold of) the + // average variance over the sub-32x32 blocks, then force this block to + // split. Only checking this for noise level >= medium for now. 
+ + if (!is_key_frame && + (max_var_32x32 - min_var_32x32) > 3 * (thresholds[1] >> 3) && + max_var_32x32 > thresholds[1] >> 1) + force_split[1 + m] = 1; + } + if (is_small_sb) force_split[0] = 1; + } + + if (!force_split[0]) { + fill_variance_tree(vt, BLOCK_128X128); + get_variance(&vt->part_variances.none); + if (!is_key_frame && + (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) && + max_var_64x64 > thresholds[0] >> 1) + force_split[0] = 1; + } + + if (!set_vt_partitioning(cpi, x, xd, tile, vt, BLOCK_128X128, mi_row, mi_col, + thresholds[0], BLOCK_16X16, force_split[0])) { + for (m = 0; m < num_64x64_blocks; ++m) { + const int x64_idx = ((m & 1) << 4); + const int y64_idx = ((m >> 1) << 4); + const int m2 = m << 2; + + // Now go through the entire structure, splitting every block size until + // we get to one that's got a variance lower than our threshold. + if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m], BLOCK_64X64, + mi_row + y64_idx, mi_col + x64_idx, + thresholds[1], BLOCK_16X16, + force_split[1 + m])) { + for (i = 0; i < 4; ++i) { + const int x32_idx = ((i & 1) << 3); + const int y32_idx = ((i >> 1) << 3); + const int i2 = (m2 + i) << 2; + if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m].split[i], + BLOCK_32X32, (mi_row + y64_idx + y32_idx), + (mi_col + x64_idx + x32_idx), thresholds[2], + BLOCK_16X16, force_split[5 + m2 + i])) { + for (j = 0; j < 4; ++j) { + const int x16_idx = ((j & 1) << 2); + const int y16_idx = ((j >> 1) << 2); + const int split_index = 21 + i2 + j; + // For inter frames: if variance4x4downsample[] == 1 for this + // 16x16 block, then the variance is based on 4x4 down-sampling, + // so use vt2 in set_vt_partioning(), otherwise use vt. + v16x16 *vtemp = + (!is_key_frame && variance4x4downsample[i2 + j] == 1) + ? 
&vt2[i2 + j] + : &vt->split[m].split[i].split[j]; + if (!set_vt_partitioning(cpi, x, xd, tile, vtemp, BLOCK_16X16, + mi_row + y64_idx + y32_idx + y16_idx, + mi_col + x64_idx + x32_idx + x16_idx, + thresholds[3], BLOCK_8X8, + force_split[split_index])) { + for (k = 0; k < 4; ++k) { + const int x8_idx = (k & 1) << 1; + const int y8_idx = (k >> 1) << 1; + set_block_size( + cpi, x, xd, + (mi_row + y64_idx + y32_idx + y16_idx + y8_idx), + (mi_col + x64_idx + x32_idx + x16_idx + x8_idx), + BLOCK_8X8); + } + } + } + } + } + } + } + } + + if (vt2) aom_free(vt2); + if (vt) aom_free(vt); + return 0; +} diff --git a/libaom/av1/encoder/var_based_part.h b/libaom/av1/encoder/var_based_part.h new file mode 100644 index 0000000..c355224 --- /dev/null +++ b/libaom/av1/encoder/var_based_part.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_ +#define AOM_AV1_ENCODER_VAR_BASED_PART_H_ + +#include <stdio.h> + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "av1/encoder/encoder.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q, + int content_state); + +int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile, + MACROBLOCK *x, int mi_row, int mi_col); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // AOM_AV1_ENCODER_VAR_BASED_PART_H_ diff --git a/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c index 13982cc..9483063 100644 --- a/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c +++ b/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c @@ -1408,12 +1408,6 @@ static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output, output[15] = x1[0]; } -static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) { - const __m256i scale__r = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1)); - const __m256i b = _mm256_madd_epi16(a, scale__r); - return _mm256_srai_epi32(b, NewSqrt2Bits); -} - static INLINE void fidentity16x16_new_avx2(const __m256i *input, __m256i *output, int8_t cos_bit) { (void)cos_bit; @@ -1997,6 +1991,794 @@ static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output, } } +static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0, + __m256i *in1, __m128i *out0, __m128i *out1, + __m128i *out2, __m128i *out3, + const __m256i *__rounding, int8_t *cos_bit) { + __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1); + __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1); + __m256i u0 = _mm256_madd_epi16(t0, *w0); + __m256i u1 = _mm256_madd_epi16(t1, *w0); + __m256i v0 = _mm256_madd_epi16(t0, *w1); + __m256i v1 = _mm256_madd_epi16(t1, *w1); + + __m256i a0 = _mm256_add_epi32(u0, *__rounding); + __m256i a1 = _mm256_add_epi32(u1, 
*__rounding); + __m256i b0 = _mm256_add_epi32(v0, *__rounding); + __m256i b1 = _mm256_add_epi32(v1, *__rounding); + + __m256i c0 = _mm256_srai_epi32(a0, *cos_bit); + __m256i c1 = _mm256_srai_epi32(a1, *cos_bit); + __m256i d0 = _mm256_srai_epi32(b0, *cos_bit); + __m256i d1 = _mm256_srai_epi32(b1, *cos_bit); + + __m256i temp0 = _mm256_packs_epi32(c0, c1); + __m256i temp1 = _mm256_packs_epi32(d0, d1); + + *out0 = _mm256_castsi256_si128(temp0); + *out1 = _mm256_castsi256_si128(temp1); + *out2 = _mm256_extracti128_si256(temp0, 0x01); + *out3 = _mm256_extracti128_si256(temp1, 0x01); +} + +static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]); + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]); + __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]); + __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]); + __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]); + __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]); + __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]); + + // stage 1 + __m256i x1[8]; + x1[0] = _mm256_adds_epi16(input[0], input[7]); + x1[7] = _mm256_subs_epi16(input[0], input[7]); + x1[1] = _mm256_adds_epi16(input[1], input[6]); + x1[6] = _mm256_subs_epi16(input[1], input[6]); + x1[2] = _mm256_adds_epi16(input[2], input[5]); + x1[5] = _mm256_subs_epi16(input[2], input[5]); + x1[3] = _mm256_adds_epi16(input[3], input[4]); + x1[4] = _mm256_subs_epi16(input[3], input[4]); + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_adds_epi16(x1[0], x1[3]); + x2[3] = _mm256_subs_epi16(x1[0], x1[3]); + x2[1] = _mm256_adds_epi16(x1[1], x1[2]); 
+ x2[2] = _mm256_subs_epi16(x1[1], x1[2]); + x2[4] = x1[4]; + btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding, + cos_bit); + x2[5] = x1[5]; + x2[6] = x1[6]; + x2[7] = x1[7]; + + // stage 3 + __m256i x3[8]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding, + cos_bit); + x3[0] = x2[0]; + x3[1] = x2[1]; + btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding, + cos_bit); + x3[2] = x2[2]; + x3[3] = x2[3]; + x3[4] = _mm256_adds_epi16(x2[4], x2[5]); + x3[5] = _mm256_subs_epi16(x2[4], x2[5]); + x3[6] = _mm256_subs_epi16(x2[7], x2[6]); + x3[7] = _mm256_adds_epi16(x2[7], x2[6]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding, + cos_bit); + x4[4] = x3[4]; + x4[7] = x3[7]; + btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding, + cos_bit); + x4[5] = x3[5]; + x4[6] = x3[6]; + // stage 5 + output[0] = x4[0]; + output[1] = x4[4]; + output[2] = x4[2]; + output[3] = x4[6]; + output[4] = x4[1]; + output[5] = x4[5]; + output[6] = x4[3]; + output[7] = x4[7]; +} + +static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + + __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]); + __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]); + __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]); + __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]); + __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]); + __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]); + __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]); + __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]); + __m256i 
cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]); + __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]); + __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]); + __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]); + __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]); + + // stage 1 + __m256i x1[8]; + x1[0] = input[0]; + x1[1] = _mm256_subs_epi16(__zero, input[7]); + x1[2] = _mm256_subs_epi16(__zero, input[3]); + x1[3] = input[4]; + x1[4] = _mm256_subs_epi16(__zero, input[1]); + x1[5] = input[6]; + x1[6] = input[2]; + x1[7] = _mm256_subs_epi16(__zero, input[5]); + + // stage 2 + __m256i x2[8]; + x2[0] = x1[0]; + x2[1] = x1[1]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding, + cos_bit); + x2[2] = x1[2]; + x2[3] = x1[3]; + x2[4] = x1[4]; + x2[5] = x1[5]; + btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding, + cos_bit); + x2[6] = x1[6]; + x2[7] = x1[7]; + + // stage 3 + __m256i x3[8]; + x3[0] = _mm256_adds_epi16(x2[0], x2[2]); + x3[2] = _mm256_subs_epi16(x2[0], x2[2]); + x3[1] = _mm256_adds_epi16(x2[1], x2[3]); + x3[3] = _mm256_subs_epi16(x2[1], x2[3]); + x3[4] = _mm256_adds_epi16(x2[4], x2[6]); + x3[6] = _mm256_subs_epi16(x2[4], x2[6]); + x3[5] = _mm256_adds_epi16(x2[5], x2[7]); + x3[7] = _mm256_subs_epi16(x2[5], x2[7]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[2] = x3[2]; + x4[3] = x3[3]; + btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding, + cos_bit); + x4[4] = x3[4]; + x4[5] = x3[5]; + btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding, + cos_bit); + x4[6] = x3[6]; + x4[7] = x3[7]; + + // stage 5 + __m256i x5[8]; + x5[0] = _mm256_adds_epi16(x4[0], x4[4]); + x5[4] = _mm256_subs_epi16(x4[0], x4[4]); + x5[1] = _mm256_adds_epi16(x4[1], x4[5]); + x5[5] = _mm256_subs_epi16(x4[1], x4[5]); + x5[2] = _mm256_adds_epi16(x4[2], x4[6]); + x5[6] = _mm256_subs_epi16(x4[2], x4[6]); + x5[3] = 
_mm256_adds_epi16(x4[3], x4[7]); + x5[7] = _mm256_subs_epi16(x4[3], x4[7]); + + // stage 6 + __m256i x6[8]; + btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding, + cos_bit); + x6[0] = x5[0]; + x6[1] = x5[1]; + btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding, + cos_bit); + x6[2] = x5[2]; + x6[3] = x5[3]; + btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding, + cos_bit); + x6[4] = x5[4]; + x6[5] = x5[5]; + btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding, + cos_bit); + x6[6] = x5[6]; + x6[7] = x5[7]; + + // stage 7 + output[0] = x6[1]; + output[1] = x6[6]; + output[2] = x6[3]; + output[3] = x6[4]; + output[4] = x6[5]; + output[5] = x6[2]; + output[6] = x6[7]; + output[7] = x6[0]; +} + +static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output, + int8_t cos_bit) { + (void)cos_bit; + + output[0] = _mm256_adds_epi16(input[0], input[0]); + output[1] = _mm256_adds_epi16(input[1], input[1]); + output[2] = _mm256_adds_epi16(input[2], input[2]); + output[3] = _mm256_adds_epi16(input[3], input[3]); + output[4] = _mm256_adds_epi16(input[4], input[4]); + output[5] = _mm256_adds_epi16(input[5], input[5]); + output[6] = _mm256_adds_epi16(input[6], input[6]); + output[7] = _mm256_adds_epi16(input[7], input[7]); +} + +static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); + const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1)); + __m128i temp0, temp1, temp2, temp3; + __m256i in0, in1; + __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]); + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]); + __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]); + 
__m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]); + __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]); + __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]); + __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]); + __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]); + __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]); + __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]); + __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]); + __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]); + __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]); + __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]); + __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]); + __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]); + + __m256i cospi_arr[12]; + + cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32), + cospi_m32_p32, 0x1); + cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p48_p16, 0x1); + cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_m16_p48, 0x1); + cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48), + cospi_m48_m16, 0x1); + cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16), + cospi_m16_p48, 0x1); + cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08), + cospi_p24_p40, 0x1); + cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56), + cospi_m40_p24, 0x1); + cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04), + cospi_p28_p36, 0x1); + cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60), + cospi_m36_p28, 0x1); + cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20), + cospi_p12_p52, 0x1); + 
cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44), + cospi_m52_p12, 0x1); + + __m256i x[8]; + x[0] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1); + x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14], + 0x1); + x[2] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1); + x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12], + 0x1); + x[4] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1); + x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11], + 0x1); + x[6] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1); + x[7] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1); + + // stage 1 + __m256i x1[8]; + x1[0] = _mm256_adds_epi16(x[0], x[1]); + x1[7] = _mm256_subs_epi16(x[0], x[1]); + x1[1] = _mm256_adds_epi16(x[2], x[3]); + x1[6] = _mm256_subs_epi16(x[2], x[3]); + x1[2] = _mm256_adds_epi16(x[4], x[5]); + x1[5] = _mm256_subs_epi16(x[4], x[5]); + x1[3] = _mm256_adds_epi16(x[6], x[7]); + x1[4] = _mm256_subs_epi16(x[6], x[7]); + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_adds_epi16(x1[0], x1[3]); + x2[7] = _mm256_subs_epi16(x1[0], x1[3]); + x2[1] = _mm256_adds_epi16(x1[1], x1[2]); + x2[6] = _mm256_subs_epi16(x1[1], x1[2]); + x2[2] = x1[4]; + x2[3] = x1[7]; + btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1, + &temp2, &temp3, &__rounding_256, &cos_bit); + x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1); + x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); + + // stage 3 + __m256i x3[8]; + x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e); + x3[0] = _mm256_adds_epi16(x2[0], x2[1]); + x3[1] = _mm256_subs_epi16(x2[0], x2[1]); + x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0); + btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]), + 
_mm256_extractf128_si256(x2[7], 0x01), temp0, temp1); + x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1); + x3[3] = _mm256_adds_epi16(x2[2], x2[4]); + x3[4] = _mm256_subs_epi16(x2[2], x2[4]); + x3[5] = _mm256_adds_epi16(x2[3], x2[5]); + x3[6] = _mm256_subs_epi16(x2[3], x2[5]); + + // stage 4 + __m256i x4[8]; + x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0); + x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21); + btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0], + &output[8], &output[4], &output[12], &__rounding_256, &cos_bit); + x4[2] = _mm256_adds_epi16(x3[2], x3[7]); + x4[3] = _mm256_subs_epi16(x3[2], x3[7]); + x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20); + x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20); + in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31); + in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31); + btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + + x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1); + x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1); + + // stage 5 + __m256i x5[4]; + in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31); + in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20); + btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14], + &output[10], &output[6], &__rounding_256, &cos_bit); + x5[0] = _mm256_adds_epi16(x4[4], x4[6]); + x5[1] = _mm256_subs_epi16(x4[4], x4[6]); + x5[2] = _mm256_adds_epi16(x4[5], x4[7]); + x5[3] = _mm256_subs_epi16(x4[5], x4[7]); + + // stage 6 + in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20); + in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31); + btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15], + &output[9], &output[7], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31); + in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20); + 
btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5], + &output[11], &output[13], &output[3], &__rounding_256, &cos_bit); +} + +static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __zero = _mm256_setzero_si256(); + const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1)); + __m256i in0, in1; + __m128i temp0, temp1, temp2, temp3; + + __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]); + __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]); + __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]); + __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]); + __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]); + __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]); + __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]); + __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]); + __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]); + __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]); + __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]); + __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]); + __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]); + __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]); + __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]); + __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]); + __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]); + __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]); + __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]); + __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]); + __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]); + __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]); + __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]); + __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], 
cospi[14]); + __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]); + __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]); + __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]); + + __m256i cospi_arr[20]; + + cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_p32_m32, 0x1); + cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32), + cospi_p32_p32, 0x1); + cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32), + cospi_p32_m32, 0x1); + cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), + cospi_m48_p16, 0x1); + cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), + cospi_p16_p48, 0x1); + cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48), + cospi_m48_p16, 0x1); + cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16), + cospi_p16_p48, 0x1); + cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), + cospi_p40_p24, 0x1); + cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08), + cospi_p24_m40, 0x1); + cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08), + cospi_m24_p40, 0x1); + cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56), + cospi_p40_p24, 0x1); + cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62), + cospi_p10_p54, 0x1); + cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02), + cospi_p54_m10, 0x1); + cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46), + cospi_p26_p38, 0x1); + cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18), + cospi_p38_m26, 0x1); + cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30), + cospi_p42_p22, 0x1); + 
cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34), + cospi_p22_m42, 0x1); + cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14), + cospi_p58_p06, 0x1); + cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50), + cospi_p06_m58, 0x1); + + __m256i x[8]; + x[0] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1); + x[1] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1); + x[2] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1); + x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14], + 0x1); + x[4] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1); + x[5] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1); + x[6] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1); + x[7] = + _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1); + + // stage 1 + __m256i x1[8]; + x1[0] = x[0]; + x1[1] = _mm256_subs_epi16(__zero, x[7]); + x1[2] = x[2]; + x1[3] = _mm256_subs_epi16(__zero, x[5]); + x1[4] = _mm256_subs_epi16(__zero, x[4]); + x1[5] = x[3]; + x1[6] = _mm256_subs_epi16(__zero, x[6]); + x1[7] = x[1]; + + // stage 2 + __m256i x2[8]; + x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0); + x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0); + x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0); + x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0); + in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0); + in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0); + btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21); + in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21); + 
btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 3 + __m256i x3[8]; + x3[0] = _mm256_adds_epi16(x2[0], x2[1]); + x3[1] = _mm256_subs_epi16(x2[0], x2[1]); + x3[2] = _mm256_adds_epi16(x2[3], x2[2]); + x3[3] = _mm256_subs_epi16(x2[3], x2[2]); + x3[4] = _mm256_adds_epi16(x2[4], x2[5]); + x3[5] = _mm256_subs_epi16(x2[4], x2[5]); + x3[6] = _mm256_adds_epi16(x2[7], x2[6]); + x3[7] = _mm256_subs_epi16(x2[7], x2[6]); + + // stage 4 + __m256i x4[8]; + x4[0] = x3[0]; + x4[1] = x3[1]; + x4[4] = x3[4]; + x4[5] = x3[5]; + in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20); + in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31); + btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20); + in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31); + btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 5 + __m256i x5[8]; + x5[0] = _mm256_adds_epi16(x4[0], x4[2]); + x5[1] = _mm256_subs_epi16(x4[0], x4[2]); + x5[2] = _mm256_adds_epi16(x4[1], x4[3]); + x5[3] = _mm256_subs_epi16(x4[1], x4[3]); + x5[4] = _mm256_adds_epi16(x4[4], x4[6]); + x5[5] = _mm256_subs_epi16(x4[4], x4[6]); + x5[6] = _mm256_adds_epi16(x4[5], x4[7]); + x5[7] = _mm256_subs_epi16(x4[5], x4[7]); + + // stage 6 + __m256i x6[8]; + x6[0] = x5[0]; + x6[1] = x5[2]; + x6[2] = x5[1]; + x6[3] = x5[3]; + in0 = 
_mm256_permute2f128_si256(x5[4], x5[6], 0x20); + in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31); + btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2, + &temp3, &__rounding_256, &cos_bit); + x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20); + in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31); + btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1, + &temp2, &temp3, &__rounding_256, &cos_bit); + x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1); + x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1); + + // stage 7 + __m256i x7[8]; + x7[0] = _mm256_adds_epi16(x6[0], x6[4]); + x7[1] = _mm256_subs_epi16(x6[0], x6[4]); + x7[2] = _mm256_adds_epi16(x6[1], x6[5]); + x7[3] = _mm256_subs_epi16(x6[1], x6[5]); + x7[4] = _mm256_adds_epi16(x6[2], x6[6]); + x7[5] = _mm256_subs_epi16(x6[2], x6[6]); + x7[6] = _mm256_adds_epi16(x6[3], x6[7]); + x7[7] = _mm256_subs_epi16(x6[3], x6[7]); + + // stage 8 + in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20); + in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31); + btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15], + &output[0], &output[13], &output[2], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20); + in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31); + btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11], + &output[4], &output[9], &output[6], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20); + in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31); + btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7], + &output[8], &output[5], &output[10], &__rounding_256, &cos_bit); + in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20); + in1 = _mm256_permute2f128_si256(x7[5], x7[7], 
0x31); + btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3], + &output[12], &output[1], &output[14], &__rounding_256, &cos_bit); +} + +static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output, + int8_t cos_bit) { + (void)cos_bit; + const __m256i one = _mm256_set1_epi16(1); + __m256i temp; + for (int i = 0; i < 16; i += 2) { + temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]), + input[i + 1], 0x1); + const __m256i a_lo = _mm256_unpacklo_epi16(temp, one); + const __m256i a_hi = _mm256_unpackhi_epi16(temp, one); + const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2); + const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2); + temp = _mm256_packs_epi32(b_lo, b_hi); + output[i] = _mm256_castsi256_si128(temp); + output[i + 1] = _mm256_extractf128_si256(temp, 0x1); + } +} + +static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = { + fdct8x8_new_avx2, // DCT_DCT + fdct8x8_new_avx2, // ADST_DCT + fadst8x8_new_avx2, // DCT_ADST + fadst8x8_new_avx2, // ADST_ADST + fdct8x8_new_avx2, // FLIPADST_DCT + fadst8x8_new_avx2, // DCT_FLIPADST + fadst8x8_new_avx2, // FLIPADST_FLIPADST + fadst8x8_new_avx2, // ADST_FLIPADST + fadst8x8_new_avx2, // FLIPADST_ADST + fidentity8x8_new_avx2, // IDTX + fidentity8x8_new_avx2, // V_DCT + fdct8x8_new_avx2, // H_DCT + fidentity8x8_new_avx2, // V_ADST + fadst8x8_new_avx2, // H_ADST + fidentity8x8_new_avx2, // V_FLIPADST + fadst8x8_new_avx2 // H_FLIPADST +}; + +static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = { + fdct8x16_new_avx2, // DCT_DCT + fadst8x16_new_avx2, // ADST_DCT + fdct8x16_new_avx2, // DCT_ADST + fadst8x16_new_avx2, // ADST_ADST + fadst8x16_new_avx2, // FLIPADST_DCT + fdct8x16_new_avx2, // DCT_FLIPADST + fadst8x16_new_avx2, // FLIPADST_FLIPADST + fadst8x16_new_avx2, // ADST_FLIPADST + fadst8x16_new_avx2, // FLIPADST_ADST + fidentity8x16_new_avx2, // IDTX + fdct8x16_new_avx2, // V_DCT + fidentity8x16_new_avx2, // H_DCT + fadst8x16_new_avx2, // V_ADST + 
fidentity8x16_new_avx2, // H_ADST + fadst8x16_new_avx2, // V_FLIPADST + fidentity8x16_new_avx2 // H_FLIPADST +}; + +static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = { + fdct8x8_new_avx2, // DCT_DCT + fadst8x8_new_avx2, // ADST_DCT + fdct8x8_new_avx2, // DCT_ADST + fadst8x8_new_avx2, // ADST_ADST + fadst8x8_new_avx2, // FLIPADST_DCT + fdct8x8_new_avx2, // DCT_FLIPADST + fadst8x8_new_avx2, // FLIPADST_FLIPADST + fadst8x8_new_avx2, // ADST_FLIPADST + fadst8x8_new_avx2, // FLIPADST_ADST + fidentity8x8_new_avx2, // IDTX + fdct8x8_new_avx2, // V_DCT + fidentity8x8_new_avx2, // H_DCT + fadst8x8_new_avx2, // V_ADST + fidentity8x8_new_avx2, // H_ADST + fadst8x8_new_avx2, // V_FLIPADST + fidentity8x8_new_avx2, // H_FLIPADST +}; + +static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = { + fdct8x16_new_avx2, // DCT_DCT + fdct8x16_new_avx2, // ADST_DCT + fadst8x16_new_avx2, // DCT_ADST + fadst8x16_new_avx2, // ADST_ADST + fdct8x16_new_avx2, // FLIPADST_DCT + fadst8x16_new_avx2, // DCT_FLIPADST + fadst8x16_new_avx2, // FLIPADST_FLIPADST + fadst8x16_new_avx2, // ADST_FLIPADST + fadst8x16_new_avx2, // FLIPADST_ADST + fidentity8x16_new_avx2, // IDTX + fidentity8x16_new_avx2, // V_DCT + fdct8x16_new_avx2, // H_DCT + fidentity8x16_new_avx2, // V_ADST + fadst8x16_new_avx2, // H_ADST + fidentity8x16_new_avx2, // V_FLIPADST + fadst8x16_new_avx2 // H_FLIPADST +}; + +static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + __m256i buf2[8]; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 8; + const int height = 16; + const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = 
row_txfm8x16_arr[tx_type]; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input, stride, buf0, height); + } else { + load_buffer_16bit_to_16bit(input, stride, buf0, height); + } + round_shift_16bit(buf0, height, shift[0]); + col_txfm(buf0, buf0, cos_bit_col); + round_shift_16bit(buf0, height, shift[1]); + transpose_16bit_8x8(buf0, buf1); + transpose_16bit_8x8(buf0 + 8, buf1 + 8); + + __m128i *bufl, *bufu; + if (lr_flip) { + bufl = buf0; + bufu = buf0 + 8; + flip_buf_sse2(buf1 + width * 0, bufl, width); + flip_buf_sse2(buf1 + width * 1, bufu, width); + } else { + bufl = buf1 + width * 0; + bufu = buf1 + width * 1; + } + pack_reg(bufl, bufu, buf2); + row_txfm(buf2, buf2, cos_bit_row); + round_shift_16bit_w16_avx2(buf2, width, shift[2]); + transpose_16bit_16x8_avx2(buf2, buf2); + store_rect_buffer_16bit_to_32bit_w8_avx2(buf2, output, width, 8); +} + +static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m128i buf0[16], buf1[16]; + __m256i buf2[8]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = 16; + const int height = 8; + const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type]; + const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type]; + __m128i *buf; + int ud_flip, lr_flip; + + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + if (ud_flip) { + load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height); + load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height); + } else { + load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height); + load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height); + } + pack_reg(buf0, &buf0[8], buf2); + 
round_shift_16bit_w16_avx2(buf2, height, shift[0]); + col_txfm(buf2, buf2, cos_bit_col); + round_shift_16bit_w16_avx2(buf2, height, shift[1]); + transpose_16bit_16x8_avx2(buf2, buf2); + extract_reg(buf2, buf1); + + if (lr_flip) { + buf = buf0; + flip_buf_sse2(buf1, buf, width); + } else { + buf = buf1; + } + row_txfm(buf, buf, cos_bit_row); + round_shift_16bit(buf, width, shift[2]); + transpose_16bit_8x8(buf, buf); + store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height); + transpose_16bit_8x8(buf + 8, buf + 8); + store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height); +} + static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform @@ -2005,8 +2787,8 @@ static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform - av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform - av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform + lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform + lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform diff --git a/libaom/av1/encoder/x86/corner_match_avx2.c b/libaom/av1/encoder/x86/corner_match_avx2.c new file mode 100644 index 0000000..7a3b999 --- /dev/null +++ b/libaom/av1/encoder/x86/corner_match_avx2.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <math.h> + +#include <immintrin.h> +#include "config/av1_rtcd.h" + +#include "aom_ports/mem.h" +#include "av1/encoder/corner_match.h" + +DECLARE_ALIGNED(16, static const uint8_t, byte_mask[16]) = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0 +}; +#if MATCH_SZ != 13 +#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +#endif + +/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the +correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows +of each image, centered at (x1, y1) and (x2, y2) respectively. +*/ +double compute_cross_correlation_avx2(unsigned char *im1, int stride1, int x1, + int y1, unsigned char *im2, int stride2, + int x2, int y2) { + int i, stride1_i = 0, stride2_i = 0; + __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1; + const __m128i mask = _mm_load_si128((__m128i *)byte_mask); + const __m256i zero = _mm256_setzero_si256(); + __m128i v1, v2; + + sum_vec = zero; + sumsq2_vec = zero; + cross_vec = zero; + + im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); + im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); + + for (i = 0; i < MATCH_SZ; ++i) { + v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[stride1_i]), mask); + v1_1 = _mm256_cvtepu8_epi16(v1); + v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[stride2_i]), mask); + v2_1 = _mm256_cvtepu8_epi16(v2); + + v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1); + sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1)); + + sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero)); + cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1)); + stride1_i += stride1; + stride2_i += stride2; + } + __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8); + sum_vec = 
_mm256_add_epi32(sum_vec, sum_vec1); + int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec)); + int sum2_acc = _mm256_extract_epi32(sum_vec, 4); + + __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec); + __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec); + temp1 = _mm256_add_epi32(unp_low, unp_hig); + + __m128i low_sumsq = _mm256_castsi256_si128(temp1); + low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1)); + low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32)); + int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq); + int cross_acc = _mm_extract_epi32(low_sumsq, 2); + + int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc; + int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc; + return cov / sqrt((double)var2); +} diff --git a/libaom/av1/encoder/x86/encodetxb_avx2.c b/libaom/av1/encoder/x86/encodetxb_avx2.c index 7642f57..2621301 100644 --- a/libaom/av1/encoder/x86/encodetxb_avx2.c +++ b/libaom/av1/encoder/x86/encodetxb_avx2.c @@ -26,14 +26,6 @@ void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width, const int stride = width + TX_PAD_HOR; const __m256i y_zeros = _mm256_setzero_si256(); - const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride; - uint8_t *pre_buf = levels - TX_PAD_TOP * stride; - uint8_t *pre_buf_end = pre_buf + pre_len; - do { - yy_storeu_256(pre_buf, y_zeros); - pre_buf += 32; - } while (pre_buf < pre_buf_end); - const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride; uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31)); diff --git a/libaom/av1/encoder/x86/encodetxb_sse4.c b/libaom/av1/encoder/x86/encodetxb_sse4.c index 5e0687c..34c9e4f 100644 --- a/libaom/av1/encoder/x86/encodetxb_sse4.c +++ b/libaom/av1/encoder/x86/encodetxb_sse4.c @@ -23,14 +23,6 @@ void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width, const int stride = 
width + TX_PAD_HOR; const __m128i zeros = _mm_setzero_si128(); - const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride; - uint8_t *pre_buf = levels - TX_PAD_TOP * stride; - uint8_t *pre_buf_end = pre_buf + pre_len; - do { - _mm_storeu_si128((__m128i *)(pre_buf), zeros); - pre_buf += 16; - } while (pre_buf < pre_buf_end); - const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); uint8_t *bottom_buf = levels + stride * height; uint8_t *bottom_buf_end = bottom_buf + bottom_len; diff --git a/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c new file mode 100644 index 0000000..719734c --- /dev/null +++ b/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <immintrin.h> +#include <stdio.h> +#include "aom/aom_integer.h" +#include "av1/common/common.h" + +int64_t av1_highbd_block_error_avx2(tran_low_t *coeff, tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, + int bps) { + int i; + int64_t temp1[8]; + int64_t error = 0, sqcoeff = 0; + const int shift = 2 * (bps - 8); + const int rounding = shift > 0 ? 
1 << (shift - 1) : 0; + + for (i = 0; i < block_size; i += 16) { + __m256i mm256_coeff = _mm256_loadu_si256((__m256i *)(coeff + i)); + __m256i mm256_coeff2 = _mm256_loadu_si256((__m256i *)(coeff + i + 8)); + __m256i mm256_dqcoeff = _mm256_loadu_si256((__m256i *)(dqcoeff + i)); + __m256i mm256_dqcoeff2 = _mm256_loadu_si256((__m256i *)(dqcoeff + i + 8)); + + __m256i diff1 = _mm256_sub_epi32(mm256_coeff, mm256_dqcoeff); + __m256i diff2 = _mm256_sub_epi32(mm256_coeff2, mm256_dqcoeff2); + __m256i diff1h = _mm256_srli_epi64(diff1, 32); + __m256i diff2h = _mm256_srli_epi64(diff2, 32); + __m256i res = _mm256_mul_epi32(diff1, diff1); + __m256i res1 = _mm256_mul_epi32(diff1h, diff1h); + __m256i res2 = _mm256_mul_epi32(diff2, diff2); + __m256i res3 = _mm256_mul_epi32(diff2h, diff2h); + __m256i res_diff = _mm256_add_epi64(_mm256_add_epi64(res, res1), + _mm256_add_epi64(res2, res3)); + __m256i mm256_coeffh = _mm256_srli_epi64(mm256_coeff, 32); + __m256i mm256_coeffh2 = _mm256_srli_epi64(mm256_coeff2, 32); + res = _mm256_mul_epi32(mm256_coeff, mm256_coeff); + res1 = _mm256_mul_epi32(mm256_coeffh, mm256_coeffh); + res2 = _mm256_mul_epi32(mm256_coeff2, mm256_coeff2); + res3 = _mm256_mul_epi32(mm256_coeffh2, mm256_coeffh2); + __m256i res_sqcoeff = _mm256_add_epi64(_mm256_add_epi64(res, res1), + _mm256_add_epi64(res2, res3)); + _mm256_storeu_si256((__m256i *)temp1, res_diff); + _mm256_storeu_si256((__m256i *)temp1 + 1, res_sqcoeff); + + error += temp1[0] + temp1[1] + temp1[2] + temp1[3]; + sqcoeff += temp1[4] + temp1[5] + temp1[6] + temp1[7]; + } + assert(error >= 0 && sqcoeff >= 0); + error = (error + rounding) >> shift; + sqcoeff = (sqcoeff + rounding) >> shift; + + *ssz = sqcoeff; + return error; +} diff --git a/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c new file mode 100644 index 0000000..24c513f --- /dev/null +++ b/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c @@ -0,0 +1,3170 @@ +/* + * Copyright (c) 2018, Alliance for Open 
Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <assert.h> +#include <immintrin.h> /*AVX2*/ + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" +#include "av1/common/av1_txfm.h" +#include "av1/encoder/av1_fwd_txfm1d_cfg.h" +#include "aom_dsp/txfm_common.h" +#include "aom_ports/mem.h" +#include "aom_dsp/x86/txfm_common_sse2.h" +#include "aom_dsp/x86/txfm_common_avx2.h" + +static INLINE void av1_load_buffer_8x8_avx2(const int16_t *input, __m256i *out, + int stride, int flipud, int fliplr, + int shift) { + __m128i out1[8]; + if (!flipud) { + out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); + out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + + } else { + out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride)); + out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride)); + out1[1] = _mm_load_si128((const 
__m128i *)(input + 6 * stride)); + out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride)); + } + if (!fliplr) { + out[0] = _mm256_cvtepi16_epi32(out1[0]); + out[1] = _mm256_cvtepi16_epi32(out1[1]); + out[2] = _mm256_cvtepi16_epi32(out1[2]); + out[3] = _mm256_cvtepi16_epi32(out1[3]); + out[4] = _mm256_cvtepi16_epi32(out1[4]); + out[5] = _mm256_cvtepi16_epi32(out1[5]); + out[6] = _mm256_cvtepi16_epi32(out1[6]); + out[7] = _mm256_cvtepi16_epi32(out1[7]); + + } else { + out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0])); + out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1])); + out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2])); + out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3])); + out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4])); + out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5])); + out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6])); + out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7])); + } + out[0] = _mm256_slli_epi32(out[0], shift); + out[1] = _mm256_slli_epi32(out[1], shift); + out[2] = _mm256_slli_epi32(out[2], shift); + out[3] = _mm256_slli_epi32(out[3], shift); + out[4] = _mm256_slli_epi32(out[4], shift); + out[5] = _mm256_slli_epi32(out[5], shift); + out[6] = _mm256_slli_epi32(out[6], shift); + out[7] = _mm256_slli_epi32(out[7], shift); +} +static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) { + const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1)); + + in[0] = _mm256_add_epi32(in[0], rounding); + in[1] = _mm256_add_epi32(in[1], rounding); + in[2] = _mm256_add_epi32(in[2], rounding); + in[3] = _mm256_add_epi32(in[3], rounding); + in[4] = _mm256_add_epi32(in[4], rounding); + in[5] = _mm256_add_epi32(in[5], rounding); + in[6] = _mm256_add_epi32(in[6], rounding); + in[7] = _mm256_add_epi32(in[7], rounding); + + in[0] = _mm256_srai_epi32(in[0], shift); + in[1] = _mm256_srai_epi32(in[1], shift); + in[2] = _mm256_srai_epi32(in[2], shift); + in[3] = _mm256_srai_epi32(in[3], 
shift); + in[4] = _mm256_srai_epi32(in[4], shift); + in[5] = _mm256_srai_epi32(in[5], shift); + in[6] = _mm256_srai_epi32(in[6], shift); + in[7] = _mm256_srai_epi32(in[7], shift); +} +static INLINE void av1_load_buffer_8x16_avx2(const int16_t *input, __m256i *out, + int stride, int flipud, int fliplr, + int shift) { + const int16_t *topL = input; + const int16_t *botL = input + 8 * stride; + + const int16_t *tmp; + + if (flipud) { + tmp = topL; + topL = botL; + botL = tmp; + } + av1_load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift); + av1_load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift); +} +static INLINE void av1_load_buffer_16xn_avx2(const int16_t *input, __m256i *out, + int stride, int height, + int outstride, int flipud, + int fliplr) { + __m256i out1[64]; + if (!flipud) { + for (int i = 0; i < height; i++) { + out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride)); + } + } else { + for (int i = 0; i < height; i++) { + out1[(height - 1) - i] = + _mm256_loadu_si256((const __m256i *)(input + i * stride)); + } + } + if (!fliplr) { + for (int i = 0; i < height; i++) { + out[i * outstride] = + _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i])); + out[i * outstride + 1] = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1)); + } + } else { + for (int i = 0; i < height; i++) { + out[i * outstride + 1] = _mm256_cvtepi16_epi32( + mm_reverse_epi16(_mm256_castsi256_si128(out1[i]))); + out[i * outstride + 0] = _mm256_cvtepi16_epi32( + mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1))); + } + } +} + +static void av1_fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out, + const int instride, + const int outstride) { + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i x0, x1; + + u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]); + u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]); + + u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]); + u3 = _mm256_unpackhi_epi32(in[2 
* instride], in[3 * instride]); + + u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]); + u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]); + + u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]); + u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]); + + x0 = _mm256_unpacklo_epi64(u0, u2); + x1 = _mm256_unpacklo_epi64(u4, u6); + out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u0, u2); + x1 = _mm256_unpackhi_epi64(u4, u6); + out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpacklo_epi64(u1, u3); + x1 = _mm256_unpacklo_epi64(u5, u7); + out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); + + x0 = _mm256_unpackhi_epi64(u1, u3); + x1 = _mm256_unpackhi_epi64(u5, u7); + out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20); + out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31); +} +static INLINE void av1_round_shift_32_8xn_avx2(__m256i *in, int size, int bit, + int stride) { + if (bit < 0) { + bit = -bit; + __m256i round = _mm256_set1_epi32(1 << (bit - 1)); + for (int i = 0; i < size; ++i) { + in[stride * i] = _mm256_add_epi32(in[stride * i], round); + in[stride * i] = _mm256_srai_epi32(in[stride * i], bit); + } + } else if (bit > 0) { + for (int i = 0; i < size; ++i) { + in[stride * i] = _mm256_slli_epi32(in[stride * i], bit); + } + } +} +static INLINE void av1_store_buffer_avx2(const __m256i *const in, int32_t *out, + const int stride, const int out_size) { + for (int i = 0; i < out_size; ++i) { + _mm256_store_si256((__m256i *)(out), in[i]); + out += stride; + } +} +static INLINE void av1_fwd_txfm_transpose_16x16_avx2(const __m256i *in, + __m256i *out) { + av1_fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2); + 
av1_fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2); + av1_fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2); + av1_fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2); +} + +static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0, + const __m256i *w1, const __m256i *n1, + const __m256i *rounding, int bit) { + __m256i x, y; + + x = _mm256_mullo_epi32(*w0, *n0); + y = _mm256_mullo_epi32(*w1, *n1); + x = _mm256_add_epi32(x, y); + x = _mm256_add_epi32(x, *rounding); + x = _mm256_srai_epi32(x, bit); + return x; +} +#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \ + do { \ + const __m256i ww0 = _mm256_set1_epi32(w0); \ + const __m256i ww1 = _mm256_set1_epi32(w1); \ + const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ + const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ + out0 = _mm256_add_epi32(in0_w0, in1_w1); \ + av1_round_shift_32_8xn_avx2(&out0, 1, -bit, 1); \ + const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ + const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ + out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ + av1_round_shift_32_8xn_avx2(&out1, 1, -bit, 1); \ + } while (0) + +#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \ + do { \ + const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \ + const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \ + out0 = _mm256_add_epi32(in0_w0, in1_w1); \ + out0 = _mm256_add_epi32(out0, r); \ + out0 = _mm256_srai_epi32(out0, bit); \ + const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \ + const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \ + out1 = _mm256_sub_epi32(in0_w1, in1_w0); \ + out1 = _mm256_add_epi32(out1, r); \ + out1 = _mm256_srai_epi32(out1, bit); \ + } while (0) + +typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, + const int8_t cos_bit, int instride, + int outstride); +static void av1_fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstride) { + const int32_t *cospi = cospi_arr(bit); 
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + __m256i u[8], v[8]; + for (int col = 0; col < col_num; ++col) { + u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]); + v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]); + u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]); + u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]); + u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]); + u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]); + u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]); + v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]); + v[0] = _mm256_add_epi32(u[0], u[3]); + v[3] = _mm256_sub_epi32(u[0], u[3]); + v[1] = _mm256_add_epi32(u[1], u[2]); + v[2] = _mm256_sub_epi32(u[1], u[2]); + + v[5] = _mm256_mullo_epi32(u[5], cospim32); + v[6] = _mm256_mullo_epi32(u[6], cospi32); + v[5] = _mm256_add_epi32(v[5], v[6]); + v[5] = _mm256_add_epi32(v[5], rnding); + v[5] = _mm256_srai_epi32(v[5], bit); + + u[0] = _mm256_mullo_epi32(u[5], cospi32); + v[6] = _mm256_mullo_epi32(u[6], cospim32); + v[6] = _mm256_sub_epi32(u[0], v[6]); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + // stage 3 + // type 0 + v[0] = _mm256_mullo_epi32(v[0], cospi32); + v[1] = _mm256_mullo_epi32(v[1], cospi32); + u[0] = _mm256_add_epi32(v[0], v[1]); + u[0] = _mm256_add_epi32(u[0], rnding); + u[0] = _mm256_srai_epi32(u[0], bit); + + u[1] = 
_mm256_sub_epi32(v[0], v[1]); + u[1] = _mm256_add_epi32(u[1], rnding); + u[1] = _mm256_srai_epi32(u[1], bit); + + // type 1 + v[0] = _mm256_mullo_epi32(v[2], cospi48); + v[1] = _mm256_mullo_epi32(v[3], cospi16); + u[2] = _mm256_add_epi32(v[0], v[1]); + u[2] = _mm256_add_epi32(u[2], rnding); + u[2] = _mm256_srai_epi32(u[2], bit); + + v[0] = _mm256_mullo_epi32(v[2], cospi16); + v[1] = _mm256_mullo_epi32(v[3], cospi48); + u[3] = _mm256_sub_epi32(v[1], v[0]); + u[3] = _mm256_add_epi32(u[3], rnding); + u[3] = _mm256_srai_epi32(u[3], bit); + + u[4] = _mm256_add_epi32(v[4], v[5]); + u[5] = _mm256_sub_epi32(v[4], v[5]); + u[6] = _mm256_sub_epi32(v[7], v[6]); + u[7] = _mm256_add_epi32(v[7], v[6]); + + // stage 4 + // stage 5 + v[0] = _mm256_mullo_epi32(u[4], cospi56); + v[1] = _mm256_mullo_epi32(u[7], cospi8); + v[0] = _mm256_add_epi32(v[0], v[1]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[4] + + v[0] = _mm256_mullo_epi32(u[4], cospi8); + v[1] = _mm256_mullo_epi32(u[7], cospi56); + v[0] = _mm256_sub_epi32(v[1], v[0]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[7] + + v[0] = _mm256_mullo_epi32(u[5], cospi24); + v[1] = _mm256_mullo_epi32(u[6], cospi40); + v[0] = _mm256_add_epi32(v[0], v[1]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[5] + + v[0] = _mm256_mullo_epi32(u[5], cospi40); + v[1] = _mm256_mullo_epi32(u[6], cospi24); + v[0] = _mm256_sub_epi32(v[1], v[0]); + v[0] = _mm256_add_epi32(v[0], rnding); + out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[6] + + out[0 * outstride + col] = u[0]; // buf0[0] + out[4 * outstride + col] = u[1]; // buf0[1] + out[2 * outstride + col] = u[2]; // buf0[2] + out[6 * outstride + col] = u[3]; // buf0[3] + } +} +static void av1_fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstirde) { + 
(void)col_num; + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + __m256i u0, u1, u2, u3, u4, u5, u6, u7; + __m256i v0, v1, v2, v3, v4, v5, v6, v7; + __m256i x, y; + for (int col = 0; col < col_num; ++col) { + u0 = in[0 * col_num + col]; + u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]); + u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]); + u3 = in[4 * col_num + col]; + u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]); + u5 = in[6 * col_num + col]; + u6 = in[2 * col_num + col]; + u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]); + + // stage 2 + v0 = u0; + v1 = u1; + + x = _mm256_mullo_epi32(u2, cospi32); + y = _mm256_mullo_epi32(u3, cospi32); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + v3 = _mm256_sub_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + v4 = u4; + v5 = u5; + + x = _mm256_mullo_epi32(u6, cospi32); + y = 
_mm256_mullo_epi32(u7, cospi32); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + v7 = _mm256_sub_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 3 + u0 = _mm256_add_epi32(v0, v2); + u1 = _mm256_add_epi32(v1, v3); + u2 = _mm256_sub_epi32(v0, v2); + u3 = _mm256_sub_epi32(v1, v3); + u4 = _mm256_add_epi32(v4, v6); + u5 = _mm256_add_epi32(v5, v7); + u6 = _mm256_sub_epi32(v4, v6); + u7 = _mm256_sub_epi32(v5, v7); + + // stage 4 + v0 = u0; + v1 = u1; + v2 = u2; + v3 = u3; + + x = _mm256_mullo_epi32(u4, cospi16); + y = _mm256_mullo_epi32(u5, cospi48); + v4 = _mm256_add_epi32(x, y); + v4 = _mm256_add_epi32(v4, rnding); + v4 = _mm256_srai_epi32(v4, bit); + + x = _mm256_mullo_epi32(u4, cospi48); + y = _mm256_mullo_epi32(u5, cospim16); + v5 = _mm256_add_epi32(x, y); + v5 = _mm256_add_epi32(v5, rnding); + v5 = _mm256_srai_epi32(v5, bit); + + x = _mm256_mullo_epi32(u6, cospim48); + y = _mm256_mullo_epi32(u7, cospi16); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + x = _mm256_mullo_epi32(u6, cospi16); + y = _mm256_mullo_epi32(u7, cospi48); + v7 = _mm256_add_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 5 + u0 = _mm256_add_epi32(v0, v4); + u1 = _mm256_add_epi32(v1, v5); + u2 = _mm256_add_epi32(v2, v6); + u3 = _mm256_add_epi32(v3, v7); + u4 = _mm256_sub_epi32(v0, v4); + u5 = _mm256_sub_epi32(v1, v5); + u6 = _mm256_sub_epi32(v2, v6); + u7 = _mm256_sub_epi32(v3, v7); + + // stage 6 + x = _mm256_mullo_epi32(u0, cospi4); + y = _mm256_mullo_epi32(u1, cospi60); + v0 = _mm256_add_epi32(x, y); + v0 = _mm256_add_epi32(v0, rnding); + v0 = _mm256_srai_epi32(v0, bit); + + x = _mm256_mullo_epi32(u0, cospi60); + y = _mm256_mullo_epi32(u1, cospim4); + v1 = _mm256_add_epi32(x, y); + v1 = _mm256_add_epi32(v1, rnding); + v1 = _mm256_srai_epi32(v1, bit); + + x = 
_mm256_mullo_epi32(u2, cospi20); + y = _mm256_mullo_epi32(u3, cospi44); + v2 = _mm256_add_epi32(x, y); + v2 = _mm256_add_epi32(v2, rnding); + v2 = _mm256_srai_epi32(v2, bit); + + x = _mm256_mullo_epi32(u2, cospi44); + y = _mm256_mullo_epi32(u3, cospim20); + v3 = _mm256_add_epi32(x, y); + v3 = _mm256_add_epi32(v3, rnding); + v3 = _mm256_srai_epi32(v3, bit); + + x = _mm256_mullo_epi32(u4, cospi36); + y = _mm256_mullo_epi32(u5, cospi28); + v4 = _mm256_add_epi32(x, y); + v4 = _mm256_add_epi32(v4, rnding); + v4 = _mm256_srai_epi32(v4, bit); + + x = _mm256_mullo_epi32(u4, cospi28); + y = _mm256_mullo_epi32(u5, cospim36); + v5 = _mm256_add_epi32(x, y); + v5 = _mm256_add_epi32(v5, rnding); + v5 = _mm256_srai_epi32(v5, bit); + + x = _mm256_mullo_epi32(u6, cospi52); + y = _mm256_mullo_epi32(u7, cospi12); + v6 = _mm256_add_epi32(x, y); + v6 = _mm256_add_epi32(v6, rnding); + v6 = _mm256_srai_epi32(v6, bit); + + x = _mm256_mullo_epi32(u6, cospi12); + y = _mm256_mullo_epi32(u7, cospim52); + v7 = _mm256_add_epi32(x, y); + v7 = _mm256_add_epi32(v7, rnding); + v7 = _mm256_srai_epi32(v7, bit); + + // stage 7 + out[0 * outstirde + col] = v1; + out[1 * outstirde + col] = v6; + out[2 * outstirde + col] = v3; + out[3 * outstirde + col] = v4; + out[4 * outstirde + col] = v5; + out[5 * outstirde + col] = v2; + out[6 * outstirde + col] = v7; + out[7 * outstirde + col] = v0; + } +} +static void av1_idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit, + int col_num, int outstride) { + (void)bit; + (void)outstride; + int num_iters = 8 * col_num; + for (int i = 0; i < num_iters; i += 8) { + out[i] = _mm256_add_epi32(in[i], in[i]); + out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]); + out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]); + out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]); + out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]); + out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]); + out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]); + out[i + 7] = _mm256_add_epi32(in[i 
+ 7], in[i + 7]); + } +} +void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[8], out[8]; + const TX_SIZE tx_size = TX_8X8; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int width = tx_size_wide[tx_size]; + const int width_div8 = (width >> 3); + + switch (tx_type) { + case DCT_DCT: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case ADST_DCT: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case DCT_ADST: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case ADST_ADST: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + av1_fadst8_avx2(in, out, 
fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case FLIPADST_DCT: + av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case DCT_FLIPADST: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case FLIPADST_FLIPADST: + av1_load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]); + av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case ADST_FLIPADST: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 
width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case FLIPADST_ADST: + av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case IDTX: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case V_DCT: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case H_DCT: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + 
break; + case V_ADST: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case H_ADST: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]); + av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case V_FLIPADST: + av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]); + av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + case H_FLIPADST: + av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]); + av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + col_txfm_8x8_rounding(out, -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8); + av1_store_buffer_avx2(in, coeff, 8, 8); + break; + default: assert(0); + } + (void)bd; +} + +static void av1_fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit, + const int col_num, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]); + const 
__m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi60 = _mm256_set1_epi32(cospi[60]); + const __m256i cospi4 = _mm256_set1_epi32(cospi[4]); + const __m256i cospi28 = _mm256_set1_epi32(cospi[28]); + const __m256i cospi36 = _mm256_set1_epi32(cospi[36]); + const __m256i cospi44 = _mm256_set1_epi32(cospi[44]); + const __m256i cospi20 = _mm256_set1_epi32(cospi[20]); + const __m256i cospi12 = _mm256_set1_epi32(cospi[12]); + const __m256i cospi52 = _mm256_set1_epi32(cospi[52]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + __m256i u[16], v[16], x; + int col; + + // Calculate the column 0, 1, 2, 3 + for (col = 0; col < col_num; ++col) { + // stage 0 + // stage 1 + u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]); + u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]); + u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]); + u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]); + u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]); + u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]); + u[6] = 
_mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]); + u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]); + u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]); + + // stage 2 + v[0] = _mm256_add_epi32(u[0], u[7]); + v[7] = _mm256_sub_epi32(u[0], u[7]); + v[1] = _mm256_add_epi32(u[1], u[6]); + v[6] = _mm256_sub_epi32(u[1], u[6]); + v[2] = _mm256_add_epi32(u[2], u[5]); + v[5] = _mm256_sub_epi32(u[2], u[5]); + v[3] = _mm256_add_epi32(u[3], u[4]); + v[4] = _mm256_sub_epi32(u[3], u[4]); + v[8] = u[8]; + v[9] = u[9]; + + v[10] = _mm256_mullo_epi32(u[10], cospim32); + x = _mm256_mullo_epi32(u[13], cospi32); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospi32); + x = _mm256_mullo_epi32(u[13], cospim32); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = _mm256_mullo_epi32(u[11], cospim32); + x = _mm256_mullo_epi32(u[12], cospi32); + v[11] = _mm256_add_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[11], cospi32); + x = _mm256_mullo_epi32(u[12], cospim32); + v[12] = _mm256_sub_epi32(v[12], x); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + v[14] = u[14]; + v[15] = u[15]; + + // stage 3 + u[0] = _mm256_add_epi32(v[0], v[3]); + u[3] = _mm256_sub_epi32(v[0], v[3]); + u[1] = _mm256_add_epi32(v[1], v[2]); + u[2] = _mm256_sub_epi32(v[1], v[2]); + u[4] = v[4]; + + u[5] = _mm256_mullo_epi32(v[5], cospim32); + x = _mm256_mullo_epi32(v[6], cospi32); + u[5] = _mm256_add_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[5], cospi32); + x = _mm256_mullo_epi32(v[6], 
cospim32); + u[6] = _mm256_sub_epi32(u[6], x); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[7] = v[7]; + u[8] = _mm256_add_epi32(v[8], v[11]); + u[11] = _mm256_sub_epi32(v[8], v[11]); + u[9] = _mm256_add_epi32(v[9], v[10]); + u[10] = _mm256_sub_epi32(v[9], v[10]); + u[12] = _mm256_sub_epi32(v[15], v[12]); + u[15] = _mm256_add_epi32(v[15], v[12]); + u[13] = _mm256_sub_epi32(v[14], v[13]); + u[14] = _mm256_add_epi32(v[14], v[13]); + + // stage 4 + u[0] = _mm256_mullo_epi32(u[0], cospi32); + u[1] = _mm256_mullo_epi32(u[1], cospi32); + v[0] = _mm256_add_epi32(u[0], u[1]); + v[0] = _mm256_add_epi32(v[0], rnding); + v[0] = _mm256_srai_epi32(v[0], bit); + + v[1] = _mm256_sub_epi32(u[0], u[1]); + v[1] = _mm256_add_epi32(v[1], rnding); + v[1] = _mm256_srai_epi32(v[1], bit); + + v[2] = _mm256_mullo_epi32(u[2], cospi48); + x = _mm256_mullo_epi32(u[3], cospi16); + v[2] = _mm256_add_epi32(v[2], x); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_mullo_epi32(u[2], cospi16); + x = _mm256_mullo_epi32(u[3], cospi48); + v[3] = _mm256_sub_epi32(x, v[3]); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = _mm256_add_epi32(u[4], u[5]); + v[5] = _mm256_sub_epi32(u[4], u[5]); + v[6] = _mm256_sub_epi32(u[7], u[6]); + v[7] = _mm256_add_epi32(u[7], u[6]); + v[8] = u[8]; + + v[9] = _mm256_mullo_epi32(u[9], cospim16); + x = _mm256_mullo_epi32(u[14], cospi48); + v[9] = _mm256_add_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[14] = _mm256_mullo_epi32(u[9], cospi48); + x = _mm256_mullo_epi32(u[14], cospim16); + v[14] = _mm256_sub_epi32(v[14], x); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospim48); + x = _mm256_mullo_epi32(u[13], cospim16); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = 
_mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospim16); + x = _mm256_mullo_epi32(u[13], cospim48); + v[13] = _mm256_sub_epi32(v[13], x); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = u[11]; + v[12] = u[12]; + v[15] = u[15]; + + // stage 5 + u[0] = v[0]; + u[1] = v[1]; + u[2] = v[2]; + u[3] = v[3]; + + u[4] = _mm256_mullo_epi32(v[4], cospi56); + x = _mm256_mullo_epi32(v[7], cospi8); + u[4] = _mm256_add_epi32(u[4], x); + u[4] = _mm256_add_epi32(u[4], rnding); + u[4] = _mm256_srai_epi32(u[4], bit); + + u[7] = _mm256_mullo_epi32(v[4], cospi8); + x = _mm256_mullo_epi32(v[7], cospi56); + u[7] = _mm256_sub_epi32(x, u[7]); + u[7] = _mm256_add_epi32(u[7], rnding); + u[7] = _mm256_srai_epi32(u[7], bit); + + u[5] = _mm256_mullo_epi32(v[5], cospi24); + x = _mm256_mullo_epi32(v[6], cospi40); + u[5] = _mm256_add_epi32(u[5], x); + u[5] = _mm256_add_epi32(u[5], rnding); + u[5] = _mm256_srai_epi32(u[5], bit); + + u[6] = _mm256_mullo_epi32(v[5], cospi40); + x = _mm256_mullo_epi32(v[6], cospi24); + u[6] = _mm256_sub_epi32(x, u[6]); + u[6] = _mm256_add_epi32(u[6], rnding); + u[6] = _mm256_srai_epi32(u[6], bit); + + u[8] = _mm256_add_epi32(v[8], v[9]); + u[9] = _mm256_sub_epi32(v[8], v[9]); + u[10] = _mm256_sub_epi32(v[11], v[10]); + u[11] = _mm256_add_epi32(v[11], v[10]); + u[12] = _mm256_add_epi32(v[12], v[13]); + u[13] = _mm256_sub_epi32(v[12], v[13]); + u[14] = _mm256_sub_epi32(v[15], v[14]); + u[15] = _mm256_add_epi32(v[15], v[14]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + + v[8] = _mm256_mullo_epi32(u[8], cospi60); + x = _mm256_mullo_epi32(u[15], cospi4); + v[8] = _mm256_add_epi32(v[8], x); + v[8] = _mm256_add_epi32(v[8], rnding); + v[8] = _mm256_srai_epi32(v[8], bit); + + v[15] = _mm256_mullo_epi32(u[8], cospi4); + x = _mm256_mullo_epi32(u[15], cospi60); + v[15] = _mm256_sub_epi32(x, v[15]); + v[15] = 
_mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + v[9] = _mm256_mullo_epi32(u[9], cospi28); + x = _mm256_mullo_epi32(u[14], cospi36); + v[9] = _mm256_add_epi32(v[9], x); + v[9] = _mm256_add_epi32(v[9], rnding); + v[9] = _mm256_srai_epi32(v[9], bit); + + v[14] = _mm256_mullo_epi32(u[9], cospi36); + x = _mm256_mullo_epi32(u[14], cospi28); + v[14] = _mm256_sub_epi32(x, v[14]); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[10] = _mm256_mullo_epi32(u[10], cospi44); + x = _mm256_mullo_epi32(u[13], cospi20); + v[10] = _mm256_add_epi32(v[10], x); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[13] = _mm256_mullo_epi32(u[10], cospi20); + x = _mm256_mullo_epi32(u[13], cospi44); + v[13] = _mm256_sub_epi32(x, v[13]); + v[13] = _mm256_add_epi32(v[13], rnding); + v[13] = _mm256_srai_epi32(v[13], bit); + + v[11] = _mm256_mullo_epi32(u[11], cospi12); + x = _mm256_mullo_epi32(u[12], cospi52); + v[11] = _mm256_add_epi32(v[11], x); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = _mm256_mullo_epi32(u[11], cospi52); + x = _mm256_mullo_epi32(u[12], cospi12); + v[12] = _mm256_sub_epi32(x, v[12]); + v[12] = _mm256_add_epi32(v[12], rnding); + v[12] = _mm256_srai_epi32(v[12], bit); + + out[0 * outstride + col] = v[0]; + out[1 * outstride + col] = v[8]; + out[2 * outstride + col] = v[4]; + out[3 * outstride + col] = v[12]; + out[4 * outstride + col] = v[2]; + out[5 * outstride + col] = v[10]; + out[6 * outstride + col] = v[6]; + out[7 * outstride + col] = v[14]; + out[8 * outstride + col] = v[1]; + out[9 * outstride + col] = v[9]; + out[10 * outstride + col] = v[5]; + out[11 * outstride + col] = v[13]; + out[12 * outstride + col] = v[3]; + out[13 * outstride + col] = v[11]; + out[14 * outstride + col] = v[7]; + out[15 * outstride + col] = v[15]; + } +} +static void av1_fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit, + 
const int num_cols, const int outstride) { + const int32_t *cospi = cospi_arr(bit); + const __m256i cospi32 = _mm256_set1_epi32(cospi[32]); + const __m256i cospi48 = _mm256_set1_epi32(cospi[48]); + const __m256i cospi16 = _mm256_set1_epi32(cospi[16]); + const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]); + const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]); + const __m256i cospi8 = _mm256_set1_epi32(cospi[8]); + const __m256i cospi56 = _mm256_set1_epi32(cospi[56]); + const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]); + const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]); + const __m256i cospi24 = _mm256_set1_epi32(cospi[24]); + const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]); + const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]); + const __m256i cospi40 = _mm256_set1_epi32(cospi[40]); + const __m256i cospi2 = _mm256_set1_epi32(cospi[2]); + const __m256i cospi62 = _mm256_set1_epi32(cospi[62]); + const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]); + const __m256i cospi10 = _mm256_set1_epi32(cospi[10]); + const __m256i cospi54 = _mm256_set1_epi32(cospi[54]); + const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]); + const __m256i cospi18 = _mm256_set1_epi32(cospi[18]); + const __m256i cospi46 = _mm256_set1_epi32(cospi[46]); + const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]); + const __m256i cospi26 = _mm256_set1_epi32(cospi[26]); + const __m256i cospi38 = _mm256_set1_epi32(cospi[38]); + const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]); + const __m256i cospi34 = _mm256_set1_epi32(cospi[34]); + const __m256i cospi30 = _mm256_set1_epi32(cospi[30]); + const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]); + const __m256i cospi42 = _mm256_set1_epi32(cospi[42]); + const __m256i cospi22 = _mm256_set1_epi32(cospi[22]); + const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]); + const __m256i cospi50 = _mm256_set1_epi32(cospi[50]); + const __m256i cospi14 = _mm256_set1_epi32(cospi[14]); + const __m256i cospim50 = 
_mm256_set1_epi32(-cospi[50]); + const __m256i cospi58 = _mm256_set1_epi32(cospi[58]); + const __m256i cospi6 = _mm256_set1_epi32(cospi[6]); + const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]); + const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1)); + const __m256i zero = _mm256_setzero_si256(); + + __m256i u[16], v[16], x, y; + int col; + + for (col = 0; col < num_cols; ++col) { + // stage 0 + // stage 1 + u[0] = in[0 * num_cols + col]; + u[1] = _mm256_sub_epi32(zero, in[15 * num_cols + col]); + u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]); + u[3] = in[8 * num_cols + col]; + u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]); + u[5] = in[12 * num_cols + col]; + u[6] = in[4 * num_cols + col]; + u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]); + u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]); + u[9] = in[14 * num_cols + col]; + u[10] = in[6 * num_cols + col]; + u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]); + u[12] = in[2 * num_cols + col]; + u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]); + u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]); + u[15] = in[10 * num_cols + col]; + + // stage 2 + v[0] = u[0]; + v[1] = u[1]; + + x = _mm256_mullo_epi32(u[2], cospi32); + y = _mm256_mullo_epi32(u[3], cospi32); + v[2] = _mm256_add_epi32(x, y); + v[2] = _mm256_add_epi32(v[2], rnding); + v[2] = _mm256_srai_epi32(v[2], bit); + + v[3] = _mm256_sub_epi32(x, y); + v[3] = _mm256_add_epi32(v[3], rnding); + v[3] = _mm256_srai_epi32(v[3], bit); + + v[4] = u[4]; + v[5] = u[5]; + + x = _mm256_mullo_epi32(u[6], cospi32); + y = _mm256_mullo_epi32(u[7], cospi32); + v[6] = _mm256_add_epi32(x, y); + v[6] = _mm256_add_epi32(v[6], rnding); + v[6] = _mm256_srai_epi32(v[6], bit); + + v[7] = _mm256_sub_epi32(x, y); + v[7] = _mm256_add_epi32(v[7], rnding); + v[7] = _mm256_srai_epi32(v[7], bit); + + v[8] = u[8]; + v[9] = u[9]; + + x = _mm256_mullo_epi32(u[10], cospi32); + y = _mm256_mullo_epi32(u[11], cospi32); + v[10] = _mm256_add_epi32(x, 
y); + v[10] = _mm256_add_epi32(v[10], rnding); + v[10] = _mm256_srai_epi32(v[10], bit); + + v[11] = _mm256_sub_epi32(x, y); + v[11] = _mm256_add_epi32(v[11], rnding); + v[11] = _mm256_srai_epi32(v[11], bit); + + v[12] = u[12]; + v[13] = u[13]; + + x = _mm256_mullo_epi32(u[14], cospi32); + y = _mm256_mullo_epi32(u[15], cospi32); + v[14] = _mm256_add_epi32(x, y); + v[14] = _mm256_add_epi32(v[14], rnding); + v[14] = _mm256_srai_epi32(v[14], bit); + + v[15] = _mm256_sub_epi32(x, y); + v[15] = _mm256_add_epi32(v[15], rnding); + v[15] = _mm256_srai_epi32(v[15], bit); + + // stage 3 + u[0] = _mm256_add_epi32(v[0], v[2]); + u[1] = _mm256_add_epi32(v[1], v[3]); + u[2] = _mm256_sub_epi32(v[0], v[2]); + u[3] = _mm256_sub_epi32(v[1], v[3]); + u[4] = _mm256_add_epi32(v[4], v[6]); + u[5] = _mm256_add_epi32(v[5], v[7]); + u[6] = _mm256_sub_epi32(v[4], v[6]); + u[7] = _mm256_sub_epi32(v[5], v[7]); + u[8] = _mm256_add_epi32(v[8], v[10]); + u[9] = _mm256_add_epi32(v[9], v[11]); + u[10] = _mm256_sub_epi32(v[8], v[10]); + u[11] = _mm256_sub_epi32(v[9], v[11]); + u[12] = _mm256_add_epi32(v[12], v[14]); + u[13] = _mm256_add_epi32(v[13], v[15]); + u[14] = _mm256_sub_epi32(v[12], v[14]); + u[15] = _mm256_sub_epi32(v[13], v[15]); + + // stage 4 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit); + v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit); + v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit); + v[7] = av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit); + v[8] = u[8]; + v[9] = u[9]; + v[10] = u[10]; + v[11] = u[11]; + v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit); + v[13] = + av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit); + v[14] = + av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit); + + // 
stage 5 + u[0] = _mm256_add_epi32(v[0], v[4]); + u[1] = _mm256_add_epi32(v[1], v[5]); + u[2] = _mm256_add_epi32(v[2], v[6]); + u[3] = _mm256_add_epi32(v[3], v[7]); + u[4] = _mm256_sub_epi32(v[0], v[4]); + u[5] = _mm256_sub_epi32(v[1], v[5]); + u[6] = _mm256_sub_epi32(v[2], v[6]); + u[7] = _mm256_sub_epi32(v[3], v[7]); + u[8] = _mm256_add_epi32(v[8], v[12]); + u[9] = _mm256_add_epi32(v[9], v[13]); + u[10] = _mm256_add_epi32(v[10], v[14]); + u[11] = _mm256_add_epi32(v[11], v[15]); + u[12] = _mm256_sub_epi32(v[8], v[12]); + u[13] = _mm256_sub_epi32(v[9], v[13]); + u[14] = _mm256_sub_epi32(v[10], v[14]); + u[15] = _mm256_sub_epi32(v[11], v[15]); + + // stage 6 + v[0] = u[0]; + v[1] = u[1]; + v[2] = u[2]; + v[3] = u[3]; + v[4] = u[4]; + v[5] = u[5]; + v[6] = u[6]; + v[7] = u[7]; + v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit); + v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit); + v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit); + v[11] = + av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit); + v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit); + v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit); + v[14] = + av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit); + + // stage 7 + u[0] = _mm256_add_epi32(v[0], v[8]); + u[1] = _mm256_add_epi32(v[1], v[9]); + u[2] = _mm256_add_epi32(v[2], v[10]); + u[3] = _mm256_add_epi32(v[3], v[11]); + u[4] = _mm256_add_epi32(v[4], v[12]); + u[5] = _mm256_add_epi32(v[5], v[13]); + u[6] = _mm256_add_epi32(v[6], v[14]); + u[7] = _mm256_add_epi32(v[7], v[15]); + u[8] = _mm256_sub_epi32(v[0], v[8]); + u[9] = _mm256_sub_epi32(v[1], v[9]); + u[10] = _mm256_sub_epi32(v[2], v[10]); + u[11] = _mm256_sub_epi32(v[3], v[11]); + u[12] = _mm256_sub_epi32(v[4], v[12]); + u[13] = _mm256_sub_epi32(v[5], v[13]); + u[14] = 
_mm256_sub_epi32(v[6], v[14]); + u[15] = _mm256_sub_epi32(v[7], v[15]); + + // stage 8 + v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit); + v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit); + v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit); + v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit); + v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit); + v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit); + v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit); + v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit); + v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit); + v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit); + v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit); + v[11] = + av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit); + v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit); + v[13] = + av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit); + v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit); + v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit); + + // stage 9 + out[0 * outstride + col] = v[1]; + out[1 * outstride + col] = v[14]; + out[2 * outstride + col] = v[3]; + out[3 * outstride + col] = v[12]; + out[4 * outstride + col] = v[5]; + out[5 * outstride + col] = v[10]; + out[6 * outstride + col] = v[7]; + out[7 * outstride + col] = v[8]; + out[8 * outstride + col] = v[9]; + out[9 * outstride + col] = v[6]; + out[10 * outstride + col] = v[11]; + out[11 * outstride + col] = v[4]; + out[12 * outstride + col] = v[13]; + out[13 * outstride + col] = v[2]; + out[14 * outstride + col] = v[15]; + out[15 * outstride + col] = v[0]; + } +} +static void av1_idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit, + 
int col_num, const int outstride) { + (void)bit; + (void)outstride; + __m256i fact = _mm256_set1_epi32(2 * NewSqrt2); + __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1)); + __m256i a_low; + + int num_iters = 16 * col_num; + for (int i = 0; i < num_iters; i++) { + a_low = _mm256_mullo_epi32(in[i], fact); + a_low = _mm256_add_epi32(a_low, offset); + out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits); + } +} +static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = { + av1_fdct16_avx2, // DCT_DCT + av1_fadst16_avx2, // ADST_DCT + av1_fdct16_avx2, // DCT_ADST + av1_fadst16_avx2, // ADST_ADST + av1_fadst16_avx2, // FLIPADST_DCT + av1_fdct16_avx2, // DCT_FLIPADST + av1_fadst16_avx2, // FLIPADST_FLIPADST + av1_fadst16_avx2, // ADST_FLIPADST + av1_fadst16_avx2, // FLIPADST_ADST + av1_idtx16_avx2, // IDTX + av1_fdct16_avx2, // V_DCT + av1_idtx16_avx2, // H_DCT + av1_fadst16_avx2, // V_ADST + av1_idtx16_avx2, // H_ADST + av1_fadst16_avx2, // V_FLIPADST + av1_idtx16_avx2 // H_FLIPADST +}; +static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = { + av1_fdct8_avx2, // DCT_DCT + av1_fdct8_avx2, // ADST_DCT + av1_fadst8_avx2, // DCT_ADST + av1_fadst8_avx2, // ADST_ADST + av1_fdct8_avx2, // FLIPADST_DCT + av1_fadst8_avx2, // DCT_FLIPADST + av1_fadst8_avx2, // FLIPADST_FLIPADST + av1_fadst8_avx2, // ADST_FLIPADST + av1_fadst8_avx2, // FLIPADST_ADST + av1_idtx8_avx2, // IDTX + av1_idtx8_avx2, // V_DCT + av1_fdct8_avx2, // H_DCT + av1_idtx8_avx2, // V_ADST + av1_fadst8_avx2, // H_ADST + av1_idtx8_avx2, // V_FLIPADST + av1_fadst8_avx2 // H_FLIPADST +}; +void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[16], out[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_8X16]; + const int txw_idx = get_txw_idx(TX_8X16); + const int txh_idx = get_txh_idx(TX_8X16); + const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type]; + const transform_1d_avx2 row_txfm = 
row_highbd_txfm8x8_arr[tx_type]; + const int8_t bit = fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + av1_load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]); + col_txfm(in, out, bit, 1, 1); + col_txfm_8x8_rounding(out, -shift[1]); + col_txfm_8x8_rounding(&out[8], -shift[1]); + av1_fwd_txfm_transpose_8x8_avx2(out, in, 1, 2); + av1_fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2); + row_txfm(in, out, bit, 2, 2); + av1_fwd_txfm_transpose_8x8_avx2(out, in, 2, 1); + av1_fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1); + av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2); + av1_store_buffer_avx2(in, coeff, 8, 16); + (void)bd; +} +static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = { + av1_fdct8_avx2, // DCT_DCT + av1_fadst8_avx2, // ADST_DCT + av1_fdct8_avx2, // DCT_ADST + av1_fadst8_avx2, // ADST_ADST + av1_fadst8_avx2, // FLIPADST_DCT + av1_fdct8_avx2, // DCT_FLIPADST + av1_fadst8_avx2, // FLIPADST_FLIPADST + av1_fadst8_avx2, // ADST_FLIPADST + av1_fadst8_avx2, // FLIPADST_ADST + av1_idtx8_avx2, // IDTX + av1_fdct8_avx2, // V_DCT + av1_idtx8_avx2, // H_DCT + av1_fadst8_avx2, // V_ADST + av1_idtx8_avx2, // H_ADST + av1_fadst8_avx2, // V_FLIPADST + av1_idtx8_avx2 // H_FLIPADST +}; +static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = { + av1_fdct16_avx2, // DCT_DCT + av1_fdct16_avx2, // ADST_DCT + av1_fadst16_avx2, // DCT_ADST + av1_fadst16_avx2, // ADST_ADST + av1_fdct16_avx2, // FLIPADST_DCT + av1_fadst16_avx2, // DCT_FLIPADST + av1_fadst16_avx2, // FLIPADST_FLIPADST + av1_fadst16_avx2, // ADST_FLIPADST + av1_fadst16_avx2, // FLIPADST_ADST + av1_idtx16_avx2, // IDTX + av1_idtx16_avx2, // V_DCT + av1_fdct16_avx2, // H_DCT + av1_idtx16_avx2, // V_ADST + av1_fadst16_avx2, // H_ADST + av1_idtx16_avx2, // V_FLIPADST + av1_fadst16_avx2 // H_FLIPADST +}; +void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE 
tx_type, int bd) { + __m256i in[16], out[16]; + const int8_t *shift = fwd_txfm_shift_ls[TX_16X8]; + const int txw_idx = get_txw_idx(TX_16X8); + const int txh_idx = get_txh_idx(TX_16X8); + const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type]; + const int8_t bit = fwd_cos_bit_col[txw_idx][txh_idx]; + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + + av1_load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip); + av1_round_shift_32_8xn_avx2(in, 16, shift[0], 1); + col_txfm(in, out, bit, 2, 2); + av1_round_shift_32_8xn_avx2(out, 16, shift[1], 1); + av1_fwd_txfm_transpose_8x8_avx2(out, in, 2, 1); + av1_fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1); + row_txfm(in, out, bit, 1, 1); + av1_fwd_txfm_transpose_8x8_avx2(out, in, 1, 2); + av1_fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2); + av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2); + av1_store_buffer_avx2(in, coeff, 8, 16); + (void)bd; +} +void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride, + TX_TYPE tx_type, int bd) { + __m256i in[32], out[32]; + const TX_SIZE tx_size = TX_16X16; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const int width_div8 = (width >> 3); + const int width_div16 = (width >> 4); + const int size = (height << 1); + switch (tx_type) { + case DCT_DCT: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 
width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case ADST_DCT: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case DCT_ADST: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case ADST_ADST: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case FLIPADST_DCT: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + 
width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case DCT_FLIPADST: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case FLIPADST_FLIPADST: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case ADST_FLIPADST: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case FLIPADST_ADST: + 
av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case IDTX: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case V_DCT: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case H_DCT: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case V_ADST: + 
av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case H_ADST: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case V_FLIPADST: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + case H_FLIPADST: + av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1); + av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16); + av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8, + width_div8); + av1_fwd_txfm_transpose_16x16_avx2(out, in); + av1_store_buffer_avx2(in, coeff, 8, 32); + break; + default: assert(0); + } + 
(void)bd; +} +static INLINE void av1_fdct32_avx2(__m256i *input, __m256i *output, + const int8_t cos_bit, const int instride, + const int outstride) { + __m256i buf0[32]; + __m256i buf1[32]; + const int32_t *cospi; + int startidx = 0 * instride; + int endidx = 31 * instride; + // stage 0 + // stage 1 + buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx 
-= instride; + buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]); + buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + cospi = cospi_arr(cos_bit); + buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]); + buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]); + buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]); + buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]); + buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]); + buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]); + buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]); + buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]); + buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]); + buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]); + buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]); + buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]); + buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]); + buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]); + buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]); + buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + buf0[18] = buf1[18]; + buf0[19] = buf1[19]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + 
buf0[27], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], cos_bit); + buf0[28] = buf1[28]; + buf0[29] = buf1[29]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 3 + cospi = cospi_arr(cos_bit); + buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]); + buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]); + buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]); + buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]); + buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]); + buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]); + buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]); + buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]); + buf1[8] = buf0[8]; + buf1[9] = buf0[9]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], cos_bit); + buf1[14] = buf0[14]; + buf1[15] = buf0[15]; + buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]); + buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]); + buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]); + buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]); + buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]); + buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]); + buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]); + buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]); + buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]); + buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]); + buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]); + buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]); + buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]); + buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]); + buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]); + + // stage 4 + cospi = cospi_arr(cos_bit); + buf0[0] = 
_mm256_add_epi32(buf1[0], buf1[3]); + buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]); + buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]); + buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]); + buf0[4] = buf1[4]; + btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], + cos_bit); + buf0[7] = buf1[7]; + buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]); + buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]); + buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]); + buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]); + buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]); + buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]); + buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]); + buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]); + buf0[16] = buf1[16]; + buf0[17] = buf1[17]; + btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + buf0[22] = buf1[22]; + buf0[23] = buf1[23]; + buf0[24] = buf1[24]; + buf0[25] = buf1[25]; + buf0[30] = buf1[30]; + buf0[31] = buf1[31]; + + // stage 5 + cospi = cospi_arr(cos_bit); + btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], + cos_bit); + btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3], + cos_bit); + buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]); + buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]); + buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]); + buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]); + buf1[8] = buf0[8]; + btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14], + cos_bit); + btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], cos_bit); + buf1[11] = buf0[11]; + buf1[12] = buf0[12]; + buf1[15] = buf0[15]; + buf1[16] = 
_mm256_add_epi32(buf0[16], buf0[19]); + buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]); + buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]); + buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]); + buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]); + buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]); + buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]); + buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]); + buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]); + buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]); + buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]); + buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]); + buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]); + buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]); + buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]); + + // stage 6 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7], + cos_bit); + btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6], + cos_bit); + buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]); + buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]); + buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]); + buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]); + buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]); + buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]); + buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]); + buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]); + buf0[16] = buf1[16]; + btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], cos_bit); + btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], cos_bit); + buf0[19] = buf1[19]; + buf0[20] = buf1[20]; + btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], cos_bit); + buf0[23] = buf1[23]; + buf0[24] = 
buf1[24]; + buf0[27] = buf1[27]; + buf0[28] = buf1[28]; + buf0[31] = buf1[31]; + + // stage 7 + cospi = cospi_arr(cos_bit); + buf1[0] = buf0[0]; + buf1[1] = buf0[1]; + buf1[2] = buf0[2]; + buf1[3] = buf0[3]; + buf1[4] = buf0[4]; + buf1[5] = buf0[5]; + buf1[6] = buf0[6]; + buf1[7] = buf0[7]; + btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15], + cos_bit); + btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14], + cos_bit); + btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10], + buf1[13], cos_bit); + btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11], + buf1[12], cos_bit); + buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]); + buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]); + buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]); + buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]); + buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]); + buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]); + buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]); + buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]); + buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]); + buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]); + buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]); + buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]); + buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]); + buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]); + buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]); + buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]); + + // stage 8 + cospi = cospi_arr(cos_bit); + buf0[0] = buf1[0]; + buf0[1] = buf1[1]; + buf0[2] = buf1[2]; + buf0[3] = buf1[3]; + buf0[4] = buf1[4]; + buf0[5] = buf1[5]; + buf0[6] = buf1[6]; + buf0[7] = buf1[7]; + buf0[8] = buf1[8]; + buf0[9] = buf1[9]; + buf0[10] = buf1[10]; + buf0[11] = buf1[11]; + buf0[12] = buf1[12]; + buf0[13] = buf1[13]; + buf0[14] = buf1[14]; + buf0[15] = buf1[15]; + btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31], + cos_bit); + btf_32_avx2_type0(cospi[34], 
cospi[30], buf1[30], buf1[17], buf0[17], + buf0[30], cos_bit); + btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18], + buf0[29], cos_bit); + btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19], + buf0[28], cos_bit); + btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20], + buf0[27], cos_bit); + btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21], + buf0[26], cos_bit); + btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22], + buf0[25], cos_bit); + btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24], + cos_bit); + + startidx = 0 * outstride; + endidx = 31 * outstride; + // stage 9 + output[startidx] = buf0[0]; + output[endidx] = buf0[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[16]; + output[endidx] = buf0[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[8]; + output[endidx] = buf0[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[24]; + output[endidx] = buf0[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[4]; + output[endidx] = buf0[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[20]; + output[endidx] = buf0[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[12]; + output[endidx] = buf0[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[28]; + output[endidx] = buf0[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[2]; + output[endidx] = buf0[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[18]; + output[endidx] = buf0[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[10]; + output[endidx] = buf0[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[26]; + output[endidx] = buf0[5]; + startidx += outstride; + endidx -= outstride; + 
output[startidx] = buf0[6]; + output[endidx] = buf0[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[22]; + output[endidx] = buf0[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[14]; + output[endidx] = buf0[17]; + startidx += outstride; + endidx -= outstride; + output[startidx] = buf0[30]; + output[endidx] = buf0[1]; +} +static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output, + const int8_t cos_bit, int instride, + int outstride) { + (void)cos_bit; + for (int i = 0; i < 32; i += 8) { + output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2); + output[(i + 1) * outstride] = + _mm256_slli_epi32(input[(i + 1) * instride], 2); + output[(i + 2) * outstride] = + _mm256_slli_epi32(input[(i + 2) * instride], 2); + output[(i + 3) * outstride] = + _mm256_slli_epi32(input[(i + 3) * instride], 2); + output[(i + 4) * outstride] = + _mm256_slli_epi32(input[(i + 4) * instride], 2); + output[(i + 5) * outstride] = + _mm256_slli_epi32(input[(i + 5) * instride], 2); + output[(i + 6) * outstride] = + _mm256_slli_epi32(input[(i + 6) * instride], 2); + output[(i + 7) * outstride] = + _mm256_slli_epi32(input[(i + 7) * instride], 2); + } +} +static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = { + av1_fdct32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x32_avx2, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST + NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = { + av1_fdct32_avx2, // DCT_DCT + NULL, // ADST_DCT + NULL, // DCT_ADST + NULL, // ADST_ADST + NULL, // FLIPADST_DCT + NULL, // DCT_FLIPADST + NULL, // FLIPADST_FLIPADST + NULL, // ADST_FLIPADST + NULL, // FLIPADST_ADST + idtx32x32_avx2, // IDTX + NULL, // V_DCT + NULL, // H_DCT + NULL, // V_ADST 
+ NULL, // H_ADST + NULL, // V_FLIPADST + NULL // H_FLIPADST +}; +void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + __m256i buf0[128], buf1[128]; + const int tx_size = TX_32X32; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type]; + const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type]; + int r, c; + const int width_div16 = (width >> 4); + const int width_div8 = (width >> 3); + + for (int i = 0; i < width_div16; i++) { + av1_load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height, + width_div8, 0, 0); + av1_round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8); + av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], + width_div8); + col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8, + width_div8); + col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8); + av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], + width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], + &buf1[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + for (int i = 0; i < width_div16; i++) { + row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8, + width_div8); + row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row, width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8); + 
av1_round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2], + width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + av1_fwd_txfm_transpose_8x8_avx2(&buf1[r * width_div8 + c], + &buf0[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + av1_store_buffer_avx2(buf0, output, 8, 128); +} +static INLINE void av1_fdct64_stage2_avx2(__m256i *x1, __m256i *x2, + __m256i *cospi_m32, + __m256i *cospi_p32, + const __m256i *__rounding, + int8_t cos_bit) { + x2[0] = _mm256_add_epi32(x1[0], x1[31]); + x2[31] = _mm256_sub_epi32(x1[0], x1[31]); + x2[1] = _mm256_add_epi32(x1[1], x1[30]); + x2[30] = _mm256_sub_epi32(x1[1], x1[30]); + x2[2] = _mm256_add_epi32(x1[2], x1[29]); + x2[29] = _mm256_sub_epi32(x1[2], x1[29]); + x2[3] = _mm256_add_epi32(x1[3], x1[28]); + x2[28] = _mm256_sub_epi32(x1[3], x1[28]); + x2[4] = _mm256_add_epi32(x1[4], x1[27]); + x2[27] = _mm256_sub_epi32(x1[4], x1[27]); + x2[5] = _mm256_add_epi32(x1[5], x1[26]); + x2[26] = _mm256_sub_epi32(x1[5], x1[26]); + x2[6] = _mm256_add_epi32(x1[6], x1[25]); + x2[25] = _mm256_sub_epi32(x1[6], x1[25]); + x2[7] = _mm256_add_epi32(x1[7], x1[24]); + x2[24] = _mm256_sub_epi32(x1[7], x1[24]); + x2[8] = _mm256_add_epi32(x1[8], x1[23]); + x2[23] = _mm256_sub_epi32(x1[8], x1[23]); + x2[9] = _mm256_add_epi32(x1[9], x1[22]); + x2[22] = _mm256_sub_epi32(x1[9], x1[22]); + x2[10] = _mm256_add_epi32(x1[10], x1[21]); + x2[21] = _mm256_sub_epi32(x1[10], x1[21]); + x2[11] = _mm256_add_epi32(x1[11], x1[20]); + x2[20] = _mm256_sub_epi32(x1[11], x1[20]); + x2[12] = _mm256_add_epi32(x1[12], x1[19]); + x2[19] = _mm256_sub_epi32(x1[12], x1[19]); + x2[13] = _mm256_add_epi32(x1[13], x1[18]); + x2[18] = _mm256_sub_epi32(x1[13], x1[18]); + x2[14] = _mm256_add_epi32(x1[14], x1[17]); + x2[17] = _mm256_sub_epi32(x1[14], x1[17]); + x2[15] = _mm256_add_epi32(x1[15], x1[16]); + x2[16] = _mm256_sub_epi32(x1[15], x1[16]); + x2[32] = x1[32]; + x2[33] = x1[33]; + x2[34] = x1[34]; + x2[35] = x1[35]; + 
x2[36] = x1[36]; + x2[37] = x1[37]; + x2[38] = x1[38]; + x2[39] = x1[39]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48], + *__rounding, cos_bit); + x2[56] = x1[56]; + x2[57] = x1[57]; + x2[58] = x1[58]; + x2[59] = x1[59]; + x2[60] = x1[60]; + x2[61] = x1[61]; + x2[62] = x1[62]; + x2[63] = x1[63]; +} +static INLINE void av1_fdct64_stage3_avx2(__m256i *x2, __m256i *x3, + __m256i *cospi_m32, + __m256i *cospi_p32, + const __m256i *__rounding, + int8_t cos_bit) { + x3[0] = _mm256_add_epi32(x2[0], x2[15]); + x3[15] = _mm256_sub_epi32(x2[0], x2[15]); + x3[1] = _mm256_add_epi32(x2[1], x2[14]); + x3[14] = _mm256_sub_epi32(x2[1], x2[14]); + x3[2] = _mm256_add_epi32(x2[2], x2[13]); + x3[13] = _mm256_sub_epi32(x2[2], x2[13]); + x3[3] = _mm256_add_epi32(x2[3], x2[12]); + x3[12] = _mm256_sub_epi32(x2[3], x2[12]); + x3[4] = _mm256_add_epi32(x2[4], x2[11]); + x3[11] = _mm256_sub_epi32(x2[4], x2[11]); + x3[5] = _mm256_add_epi32(x2[5], x2[10]); + x3[10] = _mm256_sub_epi32(x2[5], x2[10]); + x3[6] = _mm256_add_epi32(x2[6], x2[9]); + x3[9] = _mm256_sub_epi32(x2[6], x2[9]); + x3[7] = _mm256_add_epi32(x2[7], x2[8]); + x3[8] = _mm256_sub_epi32(x2[7], x2[8]); + x3[16] = x2[16]; + x3[17] = x2[17]; + x3[18] = x2[18]; + x3[19] = x2[19]; + 
btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24], + *__rounding, cos_bit); + x3[28] = x2[28]; + x3[29] = x2[29]; + x3[30] = x2[30]; + x3[31] = x2[31]; + x3[32] = _mm256_add_epi32(x2[32], x2[47]); + x3[47] = _mm256_sub_epi32(x2[32], x2[47]); + x3[33] = _mm256_add_epi32(x2[33], x2[46]); + x3[46] = _mm256_sub_epi32(x2[33], x2[46]); + x3[34] = _mm256_add_epi32(x2[34], x2[45]); + x3[45] = _mm256_sub_epi32(x2[34], x2[45]); + x3[35] = _mm256_add_epi32(x2[35], x2[44]); + x3[44] = _mm256_sub_epi32(x2[35], x2[44]); + x3[36] = _mm256_add_epi32(x2[36], x2[43]); + x3[43] = _mm256_sub_epi32(x2[36], x2[43]); + x3[37] = _mm256_add_epi32(x2[37], x2[42]); + x3[42] = _mm256_sub_epi32(x2[37], x2[42]); + x3[38] = _mm256_add_epi32(x2[38], x2[41]); + x3[41] = _mm256_sub_epi32(x2[38], x2[41]); + x3[39] = _mm256_add_epi32(x2[39], x2[40]); + x3[40] = _mm256_sub_epi32(x2[39], x2[40]); + x3[48] = _mm256_sub_epi32(x2[63], x2[48]); + x3[63] = _mm256_add_epi32(x2[63], x2[48]); + x3[49] = _mm256_sub_epi32(x2[62], x2[49]); + x3[62] = _mm256_add_epi32(x2[62], x2[49]); + x3[50] = _mm256_sub_epi32(x2[61], x2[50]); + x3[61] = _mm256_add_epi32(x2[61], x2[50]); + x3[51] = _mm256_sub_epi32(x2[60], x2[51]); + x3[60] = _mm256_add_epi32(x2[60], x2[51]); + x3[52] = _mm256_sub_epi32(x2[59], x2[52]); + x3[59] = _mm256_add_epi32(x2[59], x2[52]); + x3[53] = _mm256_sub_epi32(x2[58], x2[53]); + x3[58] = _mm256_add_epi32(x2[58], x2[53]); + x3[54] = _mm256_sub_epi32(x2[57], x2[54]); + x3[57] = _mm256_add_epi32(x2[57], x2[54]); + x3[55] = _mm256_sub_epi32(x2[56], x2[55]); + x3[56] = _mm256_add_epi32(x2[56], x2[55]); +} +static INLINE void av1_fdct64_stage4_avx2( + __m256i *x3, 
__m256i *x4, __m256i *cospi_m32, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, + const __m256i *__rounding, int8_t cos_bit) { + x4[0] = _mm256_add_epi32(x3[0], x3[7]); + x4[7] = _mm256_sub_epi32(x3[0], x3[7]); + x4[1] = _mm256_add_epi32(x3[1], x3[6]); + x4[6] = _mm256_sub_epi32(x3[1], x3[6]); + x4[2] = _mm256_add_epi32(x3[2], x3[5]); + x4[5] = _mm256_sub_epi32(x3[2], x3[5]); + x4[3] = _mm256_add_epi32(x3[3], x3[4]); + x4[4] = _mm256_sub_epi32(x3[3], x3[4]); + x4[8] = x3[8]; + x4[9] = x3[9]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12], + *__rounding, cos_bit); + x4[14] = x3[14]; + x4[15] = x3[15]; + x4[16] = _mm256_add_epi32(x3[16], x3[23]); + x4[23] = _mm256_sub_epi32(x3[16], x3[23]); + x4[17] = _mm256_add_epi32(x3[17], x3[22]); + x4[22] = _mm256_sub_epi32(x3[17], x3[22]); + x4[18] = _mm256_add_epi32(x3[18], x3[21]); + x4[21] = _mm256_sub_epi32(x3[18], x3[21]); + x4[19] = _mm256_add_epi32(x3[19], x3[20]); + x4[20] = _mm256_sub_epi32(x3[19], x3[20]); + x4[24] = _mm256_sub_epi32(x3[31], x3[24]); + x4[31] = _mm256_add_epi32(x3[31], x3[24]); + x4[25] = _mm256_sub_epi32(x3[30], x3[25]); + x4[30] = _mm256_add_epi32(x3[30], x3[25]); + x4[26] = _mm256_sub_epi32(x3[29], x3[26]); + x4[29] = _mm256_add_epi32(x3[29], x3[26]); + x4[27] = _mm256_sub_epi32(x3[28], x3[27]); + x4[28] = _mm256_add_epi32(x3[28], x3[27]); + x4[32] = x3[32]; + x4[33] = x3[33]; + x4[34] = x3[34]; + x4[35] = x3[35]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56], + *__rounding, cos_bit); + 
btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52], + *__rounding, cos_bit); + x4[44] = x3[44]; + x4[45] = x3[45]; + x4[46] = x3[46]; + x4[47] = x3[47]; + x4[48] = x3[48]; + x4[49] = x3[49]; + x4[50] = x3[50]; + x4[51] = x3[51]; + x4[60] = x3[60]; + x4[61] = x3[61]; + x4[62] = x3[62]; + x4[63] = x3[63]; +} +static INLINE void av1_fdct64_stage5_avx2( + __m256i *x4, __m256i *x5, __m256i *cospi_m32, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, + const __m256i *__rounding, int8_t cos_bit) { + x5[0] = _mm256_add_epi32(x4[0], x4[3]); + x5[3] = _mm256_sub_epi32(x4[0], x4[3]); + x5[1] = _mm256_add_epi32(x4[1], x4[2]); + x5[2] = _mm256_sub_epi32(x4[1], x4[2]); + x5[4] = x4[4]; + btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6], + *__rounding, cos_bit); + x5[7] = x4[7]; + x5[8] = _mm256_add_epi32(x4[8], x4[11]); + x5[11] = _mm256_sub_epi32(x4[8], x4[11]); + x5[9] = _mm256_add_epi32(x4[9], x4[10]); + x5[10] = _mm256_sub_epi32(x4[9], x4[10]); + x5[12] = _mm256_sub_epi32(x4[15], x4[12]); + x5[15] = _mm256_add_epi32(x4[15], x4[12]); + x5[13] = _mm256_sub_epi32(x4[14], x4[13]); + x5[14] = _mm256_add_epi32(x4[14], x4[13]); + x5[16] = x4[16]; + x5[17] = x4[17]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26], + *__rounding, cos_bit); + x5[22] = 
x4[22]; + x5[23] = x4[23]; + x5[24] = x4[24]; + x5[25] = x4[25]; + x5[30] = x4[30]; + x5[31] = x4[31]; + x5[32] = _mm256_add_epi32(x4[32], x4[39]); + x5[39] = _mm256_sub_epi32(x4[32], x4[39]); + x5[33] = _mm256_add_epi32(x4[33], x4[38]); + x5[38] = _mm256_sub_epi32(x4[33], x4[38]); + x5[34] = _mm256_add_epi32(x4[34], x4[37]); + x5[37] = _mm256_sub_epi32(x4[34], x4[37]); + x5[35] = _mm256_add_epi32(x4[35], x4[36]); + x5[36] = _mm256_sub_epi32(x4[35], x4[36]); + x5[40] = _mm256_sub_epi32(x4[47], x4[40]); + x5[47] = _mm256_add_epi32(x4[47], x4[40]); + x5[41] = _mm256_sub_epi32(x4[46], x4[41]); + x5[46] = _mm256_add_epi32(x4[46], x4[41]); + x5[42] = _mm256_sub_epi32(x4[45], x4[42]); + x5[45] = _mm256_add_epi32(x4[45], x4[42]); + x5[43] = _mm256_sub_epi32(x4[44], x4[43]); + x5[44] = _mm256_add_epi32(x4[44], x4[43]); + x5[48] = _mm256_add_epi32(x4[48], x4[55]); + x5[55] = _mm256_sub_epi32(x4[48], x4[55]); + x5[49] = _mm256_add_epi32(x4[49], x4[54]); + x5[54] = _mm256_sub_epi32(x4[49], x4[54]); + x5[50] = _mm256_add_epi32(x4[50], x4[53]); + x5[53] = _mm256_sub_epi32(x4[50], x4[53]); + x5[51] = _mm256_add_epi32(x4[51], x4[52]); + x5[52] = _mm256_sub_epi32(x4[51], x4[52]); + x5[56] = _mm256_sub_epi32(x4[63], x4[56]); + x5[63] = _mm256_add_epi32(x4[63], x4[56]); + x5[57] = _mm256_sub_epi32(x4[62], x4[57]); + x5[62] = _mm256_add_epi32(x4[62], x4[57]); + x5[58] = _mm256_sub_epi32(x4[61], x4[58]); + x5[61] = _mm256_add_epi32(x4[61], x4[58]); + x5[59] = _mm256_sub_epi32(x4[60], x4[59]); + x5[60] = _mm256_add_epi32(x4[60], x4[59]); +} +static INLINE void av1_fdct64_stage6_avx2( + __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32, + __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48, + __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56, + __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24, + const __m256i *__rounding, int8_t cos_bit) { + btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1], + *__rounding, cos_bit); + 
btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3], + *__rounding, cos_bit); + x6[4] = _mm256_add_epi32(x5[4], x5[5]); + x6[5] = _mm256_sub_epi32(x5[4], x5[5]); + x6[6] = _mm256_sub_epi32(x5[7], x5[6]); + x6[7] = _mm256_add_epi32(x5[7], x5[6]); + x6[8] = x5[8]; + btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13], + *__rounding, cos_bit); + x6[11] = x5[11]; + x6[12] = x5[12]; + x6[15] = x5[15]; + x6[16] = _mm256_add_epi32(x5[16], x5[19]); + x6[19] = _mm256_sub_epi32(x5[16], x5[19]); + x6[17] = _mm256_add_epi32(x5[17], x5[18]); + x6[18] = _mm256_sub_epi32(x5[17], x5[18]); + x6[20] = _mm256_sub_epi32(x5[23], x5[20]); + x6[23] = _mm256_add_epi32(x5[23], x5[20]); + x6[21] = _mm256_sub_epi32(x5[22], x5[21]); + x6[22] = _mm256_add_epi32(x5[22], x5[21]); + x6[24] = _mm256_add_epi32(x5[24], x5[27]); + x6[27] = _mm256_sub_epi32(x5[24], x5[27]); + x6[25] = _mm256_add_epi32(x5[25], x5[26]); + x6[26] = _mm256_sub_epi32(x5[25], x5[26]); + x6[28] = _mm256_sub_epi32(x5[31], x5[28]); + x6[31] = _mm256_add_epi32(x5[31], x5[28]); + x6[29] = _mm256_sub_epi32(x5[30], x5[29]); + x6[30] = _mm256_add_epi32(x5[30], x5[29]); + x6[32] = x5[32]; + x6[33] = x5[33]; + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58], + *__rounding, cos_bit); + x6[38] = x5[38]; + x6[39] = x5[39]; + x6[40] = x5[40]; + x6[41] = x5[41]; + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52], + 
*__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50], + *__rounding, cos_bit); + x6[46] = x5[46]; + x6[47] = x5[47]; + x6[48] = x5[48]; + x6[49] = x5[49]; + x6[54] = x5[54]; + x6[55] = x5[55]; + x6[56] = x5[56]; + x6[57] = x5[57]; + x6[62] = x5[62]; + x6[63] = x5[63]; +} +static INLINE void av1_fdct64_stage7_avx2( + __m256i *x6, __m256i *x7, __m256i *cospi_p08, __m256i *cospi_p56, + __m256i *cospi_p40, __m256i *cospi_p24, __m256i *cospi_m08, + __m256i *cospi_m56, __m256i *cospi_m40, __m256i *cospi_m24, + const __m256i *__rounding, int8_t cos_bit) { + x7[0] = x6[0]; + x7[1] = x6[1]; + x7[2] = x6[2]; + x7[3] = x6[3]; + btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6], + *__rounding, cos_bit); + x7[8] = _mm256_add_epi32(x6[8], x6[9]); + x7[9] = _mm256_sub_epi32(x6[8], x6[9]); + x7[10] = _mm256_sub_epi32(x6[11], x6[10]); + x7[11] = _mm256_add_epi32(x6[11], x6[10]); + x7[12] = _mm256_add_epi32(x6[12], x6[13]); + x7[13] = _mm256_sub_epi32(x6[12], x6[13]); + x7[14] = _mm256_sub_epi32(x6[15], x6[14]); + x7[15] = _mm256_add_epi32(x6[15], x6[14]); + x7[16] = x6[16]; + btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29], + *__rounding, cos_bit); + x7[19] = x6[19]; + x7[20] = x6[20]; + btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25], + *__rounding, cos_bit); + x7[23] = x6[23]; + x7[24] = x6[24]; + x7[27] = x6[27]; + x7[28] = x6[28]; + x7[31] = x6[31]; + x7[32] = _mm256_add_epi32(x6[32], x6[35]); + x7[35] = _mm256_sub_epi32(x6[32], x6[35]); + 
x7[33] = _mm256_add_epi32(x6[33], x6[34]); + x7[34] = _mm256_sub_epi32(x6[33], x6[34]); + x7[36] = _mm256_sub_epi32(x6[39], x6[36]); + x7[39] = _mm256_add_epi32(x6[39], x6[36]); + x7[37] = _mm256_sub_epi32(x6[38], x6[37]); + x7[38] = _mm256_add_epi32(x6[38], x6[37]); + x7[40] = _mm256_add_epi32(x6[40], x6[43]); + x7[43] = _mm256_sub_epi32(x6[40], x6[43]); + x7[41] = _mm256_add_epi32(x6[41], x6[42]); + x7[42] = _mm256_sub_epi32(x6[41], x6[42]); + x7[44] = _mm256_sub_epi32(x6[47], x6[44]); + x7[47] = _mm256_add_epi32(x6[47], x6[44]); + x7[45] = _mm256_sub_epi32(x6[46], x6[45]); + x7[46] = _mm256_add_epi32(x6[46], x6[45]); + x7[48] = _mm256_add_epi32(x6[48], x6[51]); + x7[51] = _mm256_sub_epi32(x6[48], x6[51]); + x7[49] = _mm256_add_epi32(x6[49], x6[50]); + x7[50] = _mm256_sub_epi32(x6[49], x6[50]); + x7[52] = _mm256_sub_epi32(x6[55], x6[52]); + x7[55] = _mm256_add_epi32(x6[55], x6[52]); + x7[53] = _mm256_sub_epi32(x6[54], x6[53]); + x7[54] = _mm256_add_epi32(x6[54], x6[53]); + x7[56] = _mm256_add_epi32(x6[56], x6[59]); + x7[59] = _mm256_sub_epi32(x6[56], x6[59]); + x7[57] = _mm256_add_epi32(x6[57], x6[58]); + x7[58] = _mm256_sub_epi32(x6[57], x6[58]); + x7[60] = _mm256_sub_epi32(x6[63], x6[60]); + x7[63] = _mm256_add_epi32(x6[63], x6[60]); + x7[61] = _mm256_sub_epi32(x6[62], x6[61]); + x7[62] = _mm256_add_epi32(x6[62], x6[61]); +} +static INLINE void av1_fdct64_stage8_avx2(__m256i *x7, __m256i *x8, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]); + __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]); + __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]); + __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]); + __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]); + __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]); + __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]); + __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]); + __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]); + __m256i cospi_m60 = 
_mm256_set1_epi32(-cospi[60]); + __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]); + __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]); + __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]); + __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]); + __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]); + __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]); + + x8[0] = x7[0]; + x8[1] = x7[1]; + x8[2] = x7[2]; + x8[3] = x7[3]; + x8[4] = x7[4]; + x8[5] = x7[5]; + x8[6] = x7[6]; + x8[7] = x7[7]; + + btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12], + *__rounding, cos_bit); + x8[16] = _mm256_add_epi32(x7[16], x7[17]); + x8[17] = _mm256_sub_epi32(x7[16], x7[17]); + x8[18] = _mm256_sub_epi32(x7[19], x7[18]); + x8[19] = _mm256_add_epi32(x7[19], x7[18]); + x8[20] = _mm256_add_epi32(x7[20], x7[21]); + x8[21] = _mm256_sub_epi32(x7[20], x7[21]); + x8[22] = _mm256_sub_epi32(x7[23], x7[22]); + x8[23] = _mm256_add_epi32(x7[23], x7[22]); + x8[24] = _mm256_add_epi32(x7[24], x7[25]); + x8[25] = _mm256_sub_epi32(x7[24], x7[25]); + x8[26] = _mm256_sub_epi32(x7[27], x7[26]); + x8[27] = _mm256_add_epi32(x7[27], x7[26]); + x8[28] = _mm256_add_epi32(x7[28], x7[29]); + x8[29] = _mm256_sub_epi32(x7[28], x7[29]); + x8[30] = _mm256_sub_epi32(x7[31], x7[30]); + x8[31] = _mm256_add_epi32(x7[31], x7[30]); + x8[32] = x7[32]; + btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61], + *__rounding, cos_bit); + x8[35] = x7[35]; + x8[36] = x7[36]; + btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58], + *__rounding, cos_bit); + 
btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57], + *__rounding, cos_bit); + x8[39] = x7[39]; + x8[40] = x7[40]; + btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53], + *__rounding, cos_bit); + x8[43] = x7[43]; + x8[44] = x7[44]; + btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49], + *__rounding, cos_bit); + x8[47] = x7[47]; + x8[48] = x7[48]; + x8[51] = x7[51]; + x8[52] = x7[52]; + x8[55] = x7[55]; + x8[56] = x7[56]; + x8[59] = x7[59]; + x8[60] = x7[60]; + x8[63] = x7[63]; +} +static INLINE void av1_fdct64_stage9_avx2(__m256i *x8, __m256i *x9, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]); + __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]); + __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]); + __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]); + __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]); + __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]); + __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]); + __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]); + __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]); + __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]); + __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]); + __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]); + __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]); + __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]); + __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]); + __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]); + + x9[0] = x8[0]; + x9[1] = x8[1]; + x9[2] = x8[2]; + x9[3] = x8[3]; + x9[4] = x8[4]; + x9[5] = x8[5]; + x9[6] = x8[6]; + x9[7] = x8[7]; + x9[8] = x8[8]; + x9[9] = x8[9]; + x9[10] = x8[10]; + x9[11] = x8[11]; + x9[12] = x8[12]; + x9[13] = x8[13]; + x9[14] = 
x8[14]; + x9[15] = x8[15]; + btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24], + *__rounding, cos_bit); + x9[32] = _mm256_add_epi32(x8[32], x8[33]); + x9[33] = _mm256_sub_epi32(x8[32], x8[33]); + x9[34] = _mm256_sub_epi32(x8[35], x8[34]); + x9[35] = _mm256_add_epi32(x8[35], x8[34]); + x9[36] = _mm256_add_epi32(x8[36], x8[37]); + x9[37] = _mm256_sub_epi32(x8[36], x8[37]); + x9[38] = _mm256_sub_epi32(x8[39], x8[38]); + x9[39] = _mm256_add_epi32(x8[39], x8[38]); + x9[40] = _mm256_add_epi32(x8[40], x8[41]); + x9[41] = _mm256_sub_epi32(x8[40], x8[41]); + x9[42] = _mm256_sub_epi32(x8[43], x8[42]); + x9[43] = _mm256_add_epi32(x8[43], x8[42]); + x9[44] = _mm256_add_epi32(x8[44], x8[45]); + x9[45] = _mm256_sub_epi32(x8[44], x8[45]); + x9[46] = _mm256_sub_epi32(x8[47], x8[46]); + x9[47] = _mm256_add_epi32(x8[47], x8[46]); + x9[48] = _mm256_add_epi32(x8[48], x8[49]); + x9[49] = _mm256_sub_epi32(x8[48], x8[49]); + x9[50] = _mm256_sub_epi32(x8[51], x8[50]); + x9[51] = _mm256_add_epi32(x8[51], x8[50]); + x9[52] = _mm256_add_epi32(x8[52], x8[53]); + x9[53] = _mm256_sub_epi32(x8[52], x8[53]); + x9[54] = _mm256_sub_epi32(x8[55], x8[54]); + x9[55] = _mm256_add_epi32(x8[55], x8[54]); + x9[56] = _mm256_add_epi32(x8[56], x8[57]); + x9[57] = 
_mm256_sub_epi32(x8[56], x8[57]); + x9[58] = _mm256_sub_epi32(x8[59], x8[58]); + x9[59] = _mm256_add_epi32(x8[59], x8[58]); + x9[60] = _mm256_add_epi32(x8[60], x8[61]); + x9[61] = _mm256_sub_epi32(x8[60], x8[61]); + x9[62] = _mm256_sub_epi32(x8[63], x8[62]); + x9[63] = _mm256_add_epi32(x8[63], x8[62]); +} +static INLINE void av1_fdct64_stage10_avx2(__m256i *x9, __m256i *x10, + const int32_t *cospi, + const __m256i *__rounding, + int8_t cos_bit) { + __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]); + __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]); + __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]); + __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]); + __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]); + __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]); + __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]); + __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]); + __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]); + __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]); + __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]); + __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]); + __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]); + __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]); + __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]); + __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]); + __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]); + __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]); + __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]); + __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]); + __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]); + __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]); + __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]); + __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]); + __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]); + __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]); + __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]); + __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]); + __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]); + __m256i cospi_p29 = 
_mm256_set1_epi32(cospi[29]); + __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]); + __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]); + + x10[0] = x9[0]; + x10[1] = x9[1]; + x10[2] = x9[2]; + x10[3] = x9[3]; + x10[4] = x9[4]; + x10[5] = x9[5]; + x10[6] = x9[6]; + x10[7] = x9[7]; + x10[8] = x9[8]; + x10[9] = x9[9]; + x10[10] = x9[10]; + x10[11] = x9[11]; + x10[12] = x9[12]; + x10[13] = x9[13]; + x10[14] = x9[14]; + x10[15] = x9[15]; + x10[16] = x9[16]; + x10[17] = x9[17]; + x10[18] = x9[18]; + x10[19] = x9[19]; + x10[20] = x9[20]; + x10[21] = x9[21]; + x10[22] = x9[22]; + x10[23] = x9[23]; + x10[24] = x9[24]; + x10[25] = x9[25]; + x10[26] = x9[26]; + x10[27] = x9[27]; + x10[28] = x9[28]; + x10[29] = x9[29]; + x10[30] = x9[30]; + x10[31] = x9[31]; + btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52], + 
*__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49], + *__rounding, cos_bit); + btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48], + *__rounding, cos_bit); +} +static void av1_fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit, + const int instride, const int outstride) { + const int32_t *cospi = cospi_arr(cos_bit); + const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1)); + __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]); + __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]); + __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]); + __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]); + __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]); + __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]); + __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]); + __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]); + __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]); + __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]); + __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]); + __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]); + __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]); + __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]); + + int startidx = 0 * instride; + int endidx = 63 * instride; + // stage 1 + __m256i x1[64]; + x1[0] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[63] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[1] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[2] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= 
instride; + x1[3] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[4] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[5] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[6] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[7] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[8] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[9] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[10] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[11] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[12] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[51] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[13] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[14] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[49] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + 
endidx -= instride; + x1[15] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[16] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[17] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[18] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[19] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[20] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[21] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[42] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[22] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[23] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[24] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[25] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[26] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]); + 
startidx += instride; + endidx -= instride; + x1[27] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[28] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[29] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[30] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]); + startidx += instride; + endidx -= instride; + x1[31] = _mm256_add_epi32(input[startidx], input[endidx]); + x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]); + + // stage 2 + __m256i x2[64]; + av1_fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit); + // stage 3 + av1_fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit); + // stage 4 + av1_fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &__rounding, cos_bit); + // stage 5 + av1_fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &__rounding, cos_bit); + // stage 6 + av1_fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48, + &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56, + &cospi_m40, &cospi_p24, &cospi_m24, &__rounding, + cos_bit); + // stage 7 + av1_fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24, + &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24, + &__rounding, cos_bit); + // stage 8 + av1_fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit); + // stage 9 + av1_fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit); + // stage 10 + av1_fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit); + + startidx = 0 * outstride; + endidx = 63 * outstride; + + // stage 11 + 
output[startidx] = x2[0]; + output[endidx] = x2[63]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[32]; + output[endidx] = x2[31]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[16]; + output[endidx] = x2[47]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[48]; + output[endidx] = x2[15]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[8]; + output[endidx] = x2[55]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[40]; + output[endidx] = x2[23]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[24]; + output[endidx] = x2[39]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[56]; + output[endidx] = x2[7]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[4]; + output[endidx] = x2[59]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[36]; + output[endidx] = x2[27]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[20]; + output[endidx] = x2[43]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[52]; + output[endidx] = x2[11]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[12]; + output[endidx] = x2[51]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[44]; + output[endidx] = x2[19]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[28]; + output[endidx] = x2[35]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[60]; + output[endidx] = x2[3]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[2]; + output[endidx] = x2[61]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[34]; + output[endidx] = x2[29]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[18]; + output[endidx] = x2[45]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[50]; + 
output[endidx] = x2[13]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[10]; + output[endidx] = x2[53]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[42]; + output[endidx] = x2[21]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[26]; + output[endidx] = x2[37]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[58]; + output[endidx] = x2[5]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[6]; + output[endidx] = x2[57]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[38]; + output[endidx] = x2[25]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[22]; + output[endidx] = x2[41]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[54]; + output[endidx] = x2[9]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[14]; + output[endidx] = x2[49]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[46]; + output[endidx] = x2[17]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[30]; + output[endidx] = x2[33]; + startidx += outstride; + endidx -= outstride; + output[startidx] = x2[62]; + output[endidx] = x2[1]; +} +void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output, + int stride, TX_TYPE tx_type, int bd) { + (void)bd; + (void)tx_type; + assert(tx_type == DCT_DCT); + const TX_SIZE tx_size = TX_64X64; + __m256i buf0[512], buf1[512]; + const int8_t *shift = fwd_txfm_shift_ls[tx_size]; + const int txw_idx = get_txw_idx(tx_size); + const int txh_idx = get_txh_idx(tx_size); + const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx]; + const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + const transform_1d_avx2 col_txfm = av1_fdct64_avx2; + const transform_1d_avx2 row_txfm = av1_fdct64_avx2; + const int width_div16 = 
(width >> 4); + const int width_div8 = (width >> 3); + int r, c; + for (int i = 0; i < width_div16; i++) { + av1_load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height, + width_div8, 0, 0); + av1_round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8); + av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0], + width_div8); + col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8); + col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8, + width_div8); + av1_round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8); + av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1], + width_div8); + } + + for (r = 0; r < height; r += 8) { + for (c = 0; c < width_div8; c++) { + av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c], + &buf1[c * 8 * width_div8 + (r >> 3)], + width_div8, width_div8); + } + } + + for (int i = 0; i < 2; i++) { + row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8, + width_div16); + row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8, + width_div16); + av1_round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2], + width_div16); + av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2], + width_div16); + } + + for (r = 0; r < (height >> 1); r += 8) { + for (c = 0; c < width_div16; c++) { + av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div16 + c], + &buf1[c * 8 * width_div16 + (r >> 3)], + width_div16, width_div16); + } + } + av1_store_buffer_avx2(buf1, output, 8, 128); +} diff --git a/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c b/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c new file mode 100644 index 0000000..f199b0f --- /dev/null +++ b/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c @@ -0,0 +1,954 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "config/av1_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" +#include "av1/encoder/x86/temporal_filter_constants.h" + +// Compute (a-b)**2 for 8 pixels with size 16-bit +static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, + uint32_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu16_epi32(a_reg); + const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero); + const __m128i b_first = _mm_cvtepu16_epi32(b_reg); + const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero); + + __m128i dist_first, dist_second; + + dist_first = _mm_sub_epi32(a_first, b_first); + dist_second = _mm_sub_epi32(a_second, b_second); + dist_first = _mm_mullo_epi32(dist_first, dist_first); + dist_second = _mm_mullo_epi32(dist_second, dist_second); + + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 4), dist_second); +} + +// Sum up three neighboring distortions for the pixels +static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)dist); + dist_left = _mm_loadu_si128((const __m128i *)(dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(dist + 1)); + + *sum = 
_mm_add_epi32(dist_reg, dist_left); + *sum = _mm_add_epi32(*sum, dist_right); +} + +static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first, + __m128i *sum_second) { + highbd_get_sum_4(dist, sum_first); + highbd_get_sum_4(dist + 4, sum_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values, plus +// however many values from y/uv plane are). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static INLINE void highbd_average_4(__m128i *output, const __m128i *sum, + const __m128i *mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u32 = _mm_set1_epi32(rounding); + const __m128i weight_u32 = _mm_set1_epi32(weight); + const __m128i sixteen = _mm_set1_epi32(16); + const __m128i zero = _mm_setzero_si128(); + + // modifier * 3 / index; + const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero); + const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero); + const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero); + const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero); + + const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo); + const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32); + const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi); + const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32); + + // Now we have + // mul_lo: 00 a1 00 a0 + // mul_hi: 00 a3 00 a2 + // Unpack as 64 bit words to get even and odd elements + // unpack_lo: 00 a2 00 a0 + // unpack_hi: 00 a3 00 a1 + // Then we can shift and OR the results to get everything in 32-bits + const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div); + const __m128i 
mul_odd_shift = _mm_slli_si128(mul_odd, 4); + const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift); + + // Round + *output = _mm_add_epi32(mul, rounding_u32); + *output = _mm_srl_epi32(*output, strength_u128); + + // Multiply with the weight + *output = _mm_min_epu32(*output, sixteen); + *output = _mm_sub_epi32(sixteen, *output); + *output = _mm_mullo_epi32(*output, weight_u32); +} + +static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1, + const __m128i *sum_0_u32, + const __m128i *sum_1_u32, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, + const __m128i sum_second_u32, + const uint16_t *pred, + uint16_t *count, + uint32_t *accumulator) { + // Cast down to 16-bit ints + const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32); + const __m128i zero = _mm_setzero_si128(); + + __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + 
_mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first, + __m128i *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first, + __m128i *u_second, __m128i *v_first, __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. + highbd_read_dist_8(u_dist, u_first, u_second); + highbd_read_dist_8(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + __m128i u_reg, v_reg; + + highbd_read_dist_4(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi32(u_reg, u_reg); + *u_second = _mm_unpackhi_epi32(u_reg, u_reg); + + highbd_read_dist_4(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi32(v_reg, v_reg); + *v_second = _mm_unpackhi_epi32(v_reg, v_reg); + } +} + +static void av1_highbd_apply_temporal_filter_luma_8( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, + uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist, + const uint32_t *v_dist, const uint32_t *const *neighbors_first, + const uint32_t *const *neighbors_second, int top_weight, + int bottom_weight) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, 
sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(block_width == 8); + + (void)block_width; + + // First row + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second); + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + // We don't need to saturate here because the maximum value is UINT12_MAX ** 2 + // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX + sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second); + + // Add chroma values + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = 
_mm_loadu_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + weight = bottom_weight; + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_loadu_si128((const __m128i 
*)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. 
+static void av1_highbd_apply_temporal_filter_luma( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + av1_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + av1_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + 
blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + av1_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + av1_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. 
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst, + __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) { + __m128i y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst); + y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + __m128i y_fst, y_snd; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_fst = _mm_hadd_epi32(y_fst, y_snd); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_snd = _mm_hadd_epi32(y_fst, y_snd); + } + + *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst); + *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd); + *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst); + *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. 
+static void av1_highbd_apply_temporal_filter_chroma_8( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int uv_block_width, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_fst, mul_snd; + + __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + __m128i u_sum_row_fst, v_sum_row_fst; + __m128i u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, 
&v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[1]); + mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = 
v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += 
uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, 
v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. +static void av1_highbd_apply_temporal_filter_chroma( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. 
+ assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + av1_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + av1_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + av1_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, 
uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, + top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + av1_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + av1_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = 
HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + av1_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, + top_weight, bottom_weight, NULL); +} + +void av1_highbd_apply_temporal_filter_sse4_1( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = CONVERT_TO_SHORTPTR(y_src), + *u_src_ptr = CONVERT_TO_SHORTPTR(u_src), + *v_src_ptr = CONVERT_TO_SHORTPTR(v_src); + const uint16_t *y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre), + *u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre), + *v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre); + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width 
too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_src_ptr = CONVERT_TO_SHORTPTR(y_src), + u_src_ptr = CONVERT_TO_SHORTPTR(u_src), + v_src_ptr = CONVERT_TO_SHORTPTR(v_src); + y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre), + u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre), + v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre); + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + av1_highbd_apply_temporal_filter_luma( + y_src_ptr, 
y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr, + uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, y_accum, + y_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); + + av1_highbd_apply_temporal_filter_chroma( + y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr, + uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum, + u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/libaom/av1/encoder/x86/pickrst_avx2.c b/libaom/av1/encoder/x86/pickrst_avx2.c index 7a63c60..d00fca0 100644 --- a/libaom/av1/encoder/x86/pickrst_avx2.c +++ b/libaom/av1/encoder/x86/pickrst_avx2.c @@ -536,7 +536,7 @@ int64_t av1_lowbd_pixel_proj_error_avx2( const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -581,7 +581,7 @@ int64_t av1_lowbd_pixel_proj_error_avx2( const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq_active * (flt[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -605,7 +605,7 @@ int64_t av1_lowbd_pixel_proj_error_avx2( } for (k = j; k < width; ++k) { const int32_t e = (int32_t)(dat[k]) - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -711,7 +711,7 @@ int64_t av1_highbd_pixel_proj_error_avx2( const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -788,7 +788,7 @@ int64_t 
av1_highbd_pixel_proj_error_avx2( const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq_on * (flt[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -828,7 +828,7 @@ int64_t av1_highbd_pixel_proj_error_avx2( // Process remaining pixels (modulu 16) for (k = j; k < width; ++k) { const int32_t e = (int32_t)(dat[k]) - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; diff --git a/libaom/av1/encoder/x86/pickrst_sse4.c b/libaom/av1/encoder/x86/pickrst_sse4.c index 2326736..a94e169 100644 --- a/libaom/av1/encoder/x86/pickrst_sse4.c +++ b/libaom/av1/encoder/x86/pickrst_sse4.c @@ -539,7 +539,7 @@ int64_t av1_lowbd_pixel_proj_error_sse4_1( const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -578,7 +578,7 @@ int64_t av1_lowbd_pixel_proj_error_sse4_1( const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq_active * (flt[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -607,7 +607,7 @@ int64_t av1_lowbd_pixel_proj_error_sse4_1( } for (k = j; k < width; ++k) { const int32_t e = (int32_t)(dat[k]) - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -709,7 +709,7 @@ int64_t av1_highbd_pixel_proj_error_sse4_1( const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -777,7 +777,7 @@ int64_t av1_highbd_pixel_proj_error_sse4_1( 
const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS); int32_t v = xq_on * (flt[k] - u); const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; @@ -814,7 +814,7 @@ int64_t av1_highbd_pixel_proj_error_sse4_1( // Process remaining pixels (modulu 8) for (k = j; k < width; ++k) { const int32_t e = (int32_t)(dat[k]) - src[k]; - err += e * e; + err += ((int64_t)e * e); } dat += dat_stride; src += src_stride; diff --git a/libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm deleted file mode 100644 index 30983d1..0000000 --- a/libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm +++ /dev/null @@ -1,217 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - - -%include "aom_ports/x86_abi_support.asm" - -SECTION .text - -; void av1_temporal_filter_apply_sse2 | arg -; (unsigned char *frame1, | 0 -; unsigned int stride, | 1 -; unsigned char *frame2, | 2 -; unsigned int block_width, | 3 -; unsigned int block_height, | 4 -; int strength, | 5 -; int filter_weight, | 6 -; unsigned int *accumulator, | 7 -; unsigned short *count) | 8 -global sym(av1_temporal_filter_apply_sse2) PRIVATE -sym(av1_temporal_filter_apply_sse2): - - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ALIGN_STACK 16, rax - %define block_width 0 - %define block_height 16 - %define strength 32 - %define filter_weight 48 - %define rounding_bit 64 - %define rbp_backup 80 - %define stack_size 96 - sub rsp, stack_size - mov [rsp + rbp_backup], rbp - ; end prolog - - mov edx, arg(3) - mov [rsp + block_width], rdx - mov edx, arg(4) - mov [rsp + block_height], rdx - movd xmm6, arg(5) - movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read - - ; calculate the rounding bit outside the loop - ; 0x8000 >> (16 - strength) - mov rdx, 16 - sub rdx, arg(5) ; 16 - strength - movq xmm4, rdx ; can't use rdx w/ shift - movdqa xmm5, [GLOBAL(_const_top_bit)] - psrlw xmm5, xmm4 - movdqa [rsp + rounding_bit], xmm5 - - mov rsi, arg(0) ; src/frame1 - mov rdx, arg(2) ; predictor frame - mov rdi, arg(7) ; accumulator - mov rax, arg(8) ; count - - ; dup the filter weight and store for later - movd xmm0, arg(6) ; filter_weight - pshuflw xmm0, xmm0, 0 - punpcklwd xmm0, xmm0 - movdqa [rsp + filter_weight], xmm0 - - mov rbp, arg(1) ; stride - pxor xmm7, xmm7 ; zero for extraction - - mov rcx, [rsp + block_width] - imul rcx, [rsp + block_height] - add rcx, rdx - cmp dword ptr [rsp + block_width], 8 - jne .temporal_filter_apply_load_16 - -.temporal_filter_apply_load_8: - movq xmm0, [rsi] ; first row - lea rsi, [rsi + rbp] ; += stride - punpcklbw xmm0, xmm7 ; src[ 0- 7] - movq xmm1, [rsi] ; second row - lea 
rsi, [rsi + rbp] ; += stride - punpcklbw xmm1, xmm7 ; src[ 8-15] - jmp .temporal_filter_apply_load_finished - -.temporal_filter_apply_load_16: - movdqa xmm0, [rsi] ; src (frame1) - lea rsi, [rsi + rbp] ; += stride - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; src[ 0- 7] - punpckhbw xmm1, xmm7 ; src[ 8-15] - -.temporal_filter_apply_load_finished: - movdqa xmm2, [rdx] ; predictor (frame2) - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm7 ; pred[ 0- 7] - punpckhbw xmm3, xmm7 ; pred[ 8-15] - - ; modifier = src_byte - pixel_value - psubw xmm0, xmm2 ; src - pred[ 0- 7] - psubw xmm1, xmm3 ; src - pred[ 8-15] - - ; modifier *= modifier - pmullw xmm0, xmm0 ; modifer[ 0- 7]^2 - pmullw xmm1, xmm1 ; modifer[ 8-15]^2 - - ; modifier *= 3 - pmullw xmm0, [GLOBAL(_const_3w)] - pmullw xmm1, [GLOBAL(_const_3w)] - - ; modifer += 0x8000 >> (16 - strength) - paddw xmm0, [rsp + rounding_bit] - paddw xmm1, [rsp + rounding_bit] - - ; modifier >>= strength - psrlw xmm0, [rsp + strength] - psrlw xmm1, [rsp + strength] - - ; modifier = 16 - modifier - ; saturation takes care of modifier > 16 - movdqa xmm3, [GLOBAL(_const_16w)] - movdqa xmm2, [GLOBAL(_const_16w)] - psubusw xmm3, xmm1 - psubusw xmm2, xmm0 - - ; modifier *= filter_weight - pmullw xmm2, [rsp + filter_weight] - pmullw xmm3, [rsp + filter_weight] - - ; count - movdqa xmm4, [rax] - movdqa xmm5, [rax+16] - ; += modifier - paddw xmm4, xmm2 - paddw xmm5, xmm3 - ; write back - movdqa [rax], xmm4 - movdqa [rax+16], xmm5 - lea rax, [rax + 16*2] ; count += 16*(sizeof(short)) - - ; load and extract the predictor up to shorts - pxor xmm7, xmm7 - movdqa xmm0, [rdx] - lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char)) - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm7 ; pred[ 0- 7] - punpckhbw xmm1, xmm7 ; pred[ 8-15] - - ; modifier *= pixel_value - pmullw xmm0, xmm2 - pmullw xmm1, xmm3 - - ; expand to double words - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm7 ; [ 0- 3] - punpckhwd xmm2, xmm7 ; [ 4- 7] - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm7 ; [ 8-11] - 
punpckhwd xmm3, xmm7 ; [12-15] - - ; accumulator - movdqa xmm4, [rdi] - movdqa xmm5, [rdi+16] - movdqa xmm6, [rdi+32] - movdqa xmm7, [rdi+48] - ; += modifier - paddd xmm4, xmm0 - paddd xmm5, xmm2 - paddd xmm6, xmm1 - paddd xmm7, xmm3 - ; write back - movdqa [rdi], xmm4 - movdqa [rdi+16], xmm5 - movdqa [rdi+32], xmm6 - movdqa [rdi+48], xmm7 - lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int)) - - cmp rdx, rcx - je .temporal_filter_apply_epilog - pxor xmm7, xmm7 ; zero for extraction - cmp dword ptr [rsp + block_width], 16 - je .temporal_filter_apply_load_16 - jmp .temporal_filter_apply_load_8 - -.temporal_filter_apply_epilog: - ; begin epilog - mov rbp, [rsp + rbp_backup] - add rsp, stack_size - pop rsp - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -_const_3w: - times 8 dw 3 -align 16 -_const_top_bit: - times 8 dw 1<<15 -align 16 -_const_16w: - times 8 dw 16 diff --git a/libaom/av1/encoder/x86/temporal_filter_constants.h b/libaom/av1/encoder/x86/temporal_filter_constants.h new file mode 100644 index 0000000..b3a10dd --- /dev/null +++ b/libaom/av1/encoder/x86/temporal_filter_constants.h @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#ifndef AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#define AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ + +// Division using multiplication and shifting. 
The C implementation does: +// modifier *= 3; +// modifier /= index; +// where 'modifier' is a set of summed values and 'index' is the number of +// summed values. +// +// This equation works out to (m * 3) / i which reduces to: +// m * 3/4 +// m * 1/2 +// m * 1/3 +// +// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): +// m * C / 65536 +// we can create a C to replicate the division. +// +// m * 49152 / 65536 = m * 3/4 +// m * 32758 / 65536 = m * 1/2 +// m * 21846 / 65536 = m * 0.3333 +// +// These are loaded using an instruction expecting int16_t values but are used +// with _mm_mulhi_epu16(), which treats them as unsigned. +#define NEIGHBOR_CONSTANT_4 (int16_t)49152 +#define NEIGHBOR_CONSTANT_5 (int16_t)39322 +#define NEIGHBOR_CONSTANT_6 (int16_t)32768 +#define NEIGHBOR_CONSTANT_7 (int16_t)28087 +#define NEIGHBOR_CONSTANT_8 (int16_t)24576 +#define NEIGHBOR_CONSTANT_9 (int16_t)21846 +#define NEIGHBOR_CONSTANT_10 (int16_t)19661 +#define NEIGHBOR_CONSTANT_11 (int16_t)17874 +#define NEIGHBOR_CONSTANT_13 (int16_t)15124 + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + 
NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, + NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, 
MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, 
NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static 
const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4 +}; + +#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U +#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U +#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U +#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U +#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U +#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U +#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U +#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U +#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 
+}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + 
HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, 
HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = + { HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 + }; + +static const uint32_t *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = + { HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 }; + +static const uint32_t *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = + { HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4 }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4 + }; + +static const uint32_t *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = + { HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4 }; + +#define DIST_STRIDE ((BW) + 2) +#endif // AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ diff --git a/libaom/av1/encoder/x86/temporal_filter_sse4.c b/libaom/av1/encoder/x86/temporal_filter_sse4.c new file mode 100644 index 0000000..556d00c --- /dev/null +++ b/libaom/av1/encoder/x86/temporal_filter_sse4.c @@ -0,0 +1,1006 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "config/av1_rtcd.h" +#include "aom/aom_integer.h" +#include "av1/encoder/encoder.h" +#include "av1/encoder/temporal_filter.h" +#include "av1/encoder/x86/temporal_filter_constants.h" + +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + + __m128i dist_first; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + + _mm_storeu_si128((__m128i *)dst, dist_first); +} + +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); + + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero); + + __m128i dist_first, dist_second; + + dist_first = _mm_sub_epi16(a_first, b_first); + dist_second = _mm_sub_epi16(a_second, b_second); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + dist_second = _mm_mullo_epi16(dist_second, dist_second); + + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 8), dist_second); +} + +static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) { + *dist_reg = 
_mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first, + __m128i *reg_second) { + read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight. +static __m128i average_8(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const int weight) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, *mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +static __m128i average_4_4(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const int weight_0, const int weight_1) { + // _mm_srl_epi16 uses the lower 64 bit value for the shift. 
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = + _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1, + weight_1, weight_1); + const __m128i sixteen = _mm_set1_epi16(16); + + // modifier * 3 / index; + sum = _mm_mulhi_epu16(sum, *mul_constants); + + sum = _mm_adds_epu16(sum, rounding_u16); + sum = _mm_srl_epi16(sum, strength_u128); + + // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 + // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 + // So this needs to use the epu16 version which did not come until SSE4. + sum = _mm_min_epu16(sum, sixteen); + + sum = _mm_sub_epi16(sixteen, sum); + + return _mm_mullo_epi16(sum, weight_u16); +} + +static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { + const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u16 = _mm_set1_epi16(rounding); + const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i sixteen = _mm_set1_epi16(16); + __m128i input_0, input_1; + + input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0); + input_0 = _mm_adds_epu16(input_0, rounding_u16); + + input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1); + input_1 = _mm_adds_epu16(input_1, rounding_u16); + + input_0 = _mm_srl_epi16(input_0, strength_u128); + input_1 = _mm_srl_epi16(input_1, strength_u128); + + input_0 = _mm_min_epu16(input_0, sixteen); + input_1 = _mm_min_epu16(input_1, sixteen); + input_0 = _mm_sub_epi16(sixteen, input_0); + input_1 = _mm_sub_epi16(sixteen, input_1); + + *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16); + *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16); +} + +// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' 
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, + uint16_t *count, uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8); + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { + const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); + const __m128i zero = _mm_setzero_si128(); + __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), + count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8)); + __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8), + pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero); + __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32; + __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32; + + count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16); + _mm_storeu_si128((__m128i *)count, count_0_u16); + + count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16); + _mm_storeu_si128((__m128i *)(count + 8), count_1_u16); + + pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16); + pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16); + 
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero); + pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16); + pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8)); + accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32); + accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); + _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32); + _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); +} + +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. +static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)y_dist); + dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1)); + + *sum = _mm_adds_epu16(dist_reg, dist_left); + *sum = _mm_adds_epu16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first, + __m128i *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values corresponds to a row of 16 luma values. 
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + __m128i *u_first, __m128i *u_second, + __m128i *v_first, + __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 16 entries from chroma. + read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + __m128i u_reg, v_reg; + + read_dist_8(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi16(u_reg, u_reg); + *u_second = _mm_unpackhi_epi16(u_reg, u_reg); + + read_dist_8(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi16(v_reg, v_reg); + *v_second = _mm_unpackhi_epi16(v_reg, v_reg); + } +} + +// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit +// int in dst. +static INLINE void hadd_epu16(__m128i *src, __m128i *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i shift_right = _mm_srli_si128(*src, 2); + + const __m128i odd = _mm_blend_epi16(shift_right, zero, 170); + const __m128i even = _mm_blend_epi16(*src, zero, 170); + + *dst = _mm_add_epi32(even, odd); +} + +// Add a row of luma distortion to 8 corresponding chroma mods. 
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + __m128i *u_mod, + __m128i *v_mod) { + __m128i y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + __m128i y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = _mm_adds_epu16(y_reg, y_tmp); + } + } else { + __m128i y_first, y_second; + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + __m128i y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = _mm_adds_epu16(y_first, y_tmp_0); + y_second = _mm_adds_epu16(y_second, y_tmp_1); + } + + hadd_epu16(&y_first, &y_first); + hadd_epu16(&y_second, &y_second); + + y_reg = _mm_packus_epi32(y_first, y_second); + } + + *u_mod = _mm_adds_epu16(*u_mod, y_reg); + *v_mod = _mm_adds_epu16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. 
+static void av1_apply_temporal_filter_luma_16( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, + uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist, + const uint16_t *v_dist, const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + + (void)block_width; + + // First row + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + 
sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, + strength, rounding, weight); + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += 
uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, + strength, rounding, weight); + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + if (blk_fw) { + sum_row_first = + average_8(sum_row_first, 
&mul_first, strength, rounding, blk_fw[0]); + sum_row_second = + average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); + } else { + average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, + strength, rounding, weight); + } + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void av1_apply_temporal_filter_luma( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist, + const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The blockwidth is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usualy left-midle-right + // pattern. We also don't support splitting now. 
+ neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + av1_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } else { + av1_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw); + } + + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + av1_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + av1_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, 
uv_pre_stride, 16, block_height, + ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + av1_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, + ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + av1_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. 
+static void av1_apply_temporal_filter_chroma_8( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int uv_block_width, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul; + + __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3; + __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3; + + __m128i u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul = _mm_loadu_si128((const __m128i *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = + average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + v_sum_row = + average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); + } + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += 
uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = _mm_loadu_si128((const __m128i *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], + blk_fw[1]); + v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], + blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); + } + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + 
y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = _mm_loadu_si128((const __m128i *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + if (blk_fw) { + u_sum_row = + average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + v_sum_row = + average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); + } else { + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); + } + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. 
+static void av1_apply_temporal_filter_chroma( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. 
+ assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + av1_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } else { + av1_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + av1_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || 
ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + av1_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + av1_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + av1_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, 
v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void av1_apply_temporal_filter_sse4_1( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + 
"subblock filter weight must be less than 2"); + + // Precompute the difference sqaured + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + av1_apply_temporal_filter_luma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw_ptr, use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + av1_apply_temporal_filter_chroma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/libaom/build/cmake/aom_config_defaults.cmake b/libaom/build/cmake/aom_config_defaults.cmake index feb9b5e..f498acd 100644 --- a/libaom/build/cmake/aom_config_defaults.cmake +++ b/libaom/build/cmake/aom_config_defaults.cmake @@ -101,8 +101,6 @@ set_aom_config_var(CONFIG_DENOISE 1 NUMBER "Denoise/noise modeling support in encoder.") set_aom_config_var(CONFIG_FILEOPTIONS 1 NUMBER "Enables encoder config file support.") -set_aom_config_var(CONFIG_FIX_GF_LENGTH 1 NUMBER - "Fix the GF length if 
possible") set_aom_config_var(CONFIG_INSPECTION 0 NUMBER "Enables bitstream inspection.") set_aom_config_var(CONFIG_INTERNAL_STATS 0 NUMBER "Enables internal encoder stats.") @@ -112,34 +110,29 @@ set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2 NUMBER "Max profile to support decoding.") set_aom_config_var(CONFIG_NORMAL_TILE_MODE 0 NUMBER "Only enables normal tile mode.") -set_aom_config_var( - CONFIG_REDUCED_ENCODER_BORDER 0 NUMBER - "Enable reduced border extention for encoder. \ - Disables superres and resize support." - ) set_aom_config_var(CONFIG_SIZE_LIMIT 0 NUMBER "Limit max decode width/height.") set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 NUMBER "Spatial resampling.") set_aom_config_var(DECODE_HEIGHT_LIMIT 0 NUMBER "Set limit for decode height.") set_aom_config_var(DECODE_WIDTH_LIMIT 0 NUMBER "Set limit for decode width.") -set_aom_config_var(CONFIG_GLOBAL_MOTION_SEARCH 1 NUMBER - "Global motion search flag.") # AV1 experiment flags. -set_aom_config_var(CONFIG_COLLECT_INTER_MODE_RD_STATS 1 NUMBER - "AV1 experiment flag.") +set_aom_config_var(CONFIG_SPEED_STATS 0 NUMBER "AV1 experiment flag.") set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 NUMBER "AV1 experiment flag.") set_aom_config_var(CONFIG_DIST_8X8 0 NUMBER "AV1 experiment flag.") set_aom_config_var(CONFIG_ENTROPY_STATS 0 NUMBER "AV1 experiment flag.") -set_aom_config_var(CONFIG_FP_MB_STATS 0 NUMBER "AV1 experiment flag.") set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 NUMBER "AV1 experiment flag.") set_aom_config_var(CONFIG_RD_DEBUG 0 NUMBER "AV1 experiment flag.") -set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL 1 NUMBER +set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1 NUMBER + "AV1 experiment flag.") +set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3 NUMBER "AV1 experiment flag.") set_aom_config_var(CONFIG_SHARP_SETTINGS 0 NUMBER "AV1 experiment flag.") -set_aom_config_var(CONFIG_ONE_PASS_SVM 0 NUMBER "AV1 experiment flag.") 
set_aom_config_var(CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1 NUMBER - "Disable full_pixel_motion_search_based_split on BLOCK_8X8") - + "Disable full_pixel_motion_search_based_split on BLOCK_8X8.") +set_aom_config_var(CONFIG_COLLECT_PARTITION_STATS 0 NUMBER + "Collect stats on partition decisions.") +set_aom_config_var(CONFIG_COLLECT_COMPONENT_TIMING 0 NUMBER + "Collect encoding component timing information.") # # Variables in this section control optional features of the build system. # diff --git a/libaom/build/cmake/aom_experiment_deps.cmake b/libaom/build/cmake/aom_experiment_deps.cmake index 0688704..2e36157 100644 --- a/libaom/build/cmake/aom_experiment_deps.cmake +++ b/libaom/build/cmake/aom_experiment_deps.cmake @@ -21,10 +21,6 @@ macro(fix_experiment_configs) change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER) endif() - if(CONFIG_RD_DEBUG) - change_config_and_warn(CONFIG_RD_DEBUG 0 CONFIG_JNT_COMP) - endif() - if(CONFIG_DIST_8X8 AND CONFIG_MULTITHREAD) change_config_and_warn(CONFIG_DIST_8X8 0 CONFIG_MULTITHREAD) endif() diff --git a/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake b/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake index b5b2ff1..bfeac92 100644 --- a/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake +++ b/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake @@ -27,6 +27,3 @@ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer) # No runtime cpu detect for arm64-mingw-gcc. set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "") - -# Disable the use of the gtest's CMake support. 
-set(AOM_DISABLE_GTEST_CMAKE 1) diff --git a/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake b/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake index 7d3d630..6cbc2a8 100644 --- a/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake +++ b/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake @@ -28,16 +28,13 @@ endif() set(CMAKE_C_COMPILER ${CROSS}gcc) set(CMAKE_CXX_COMPILER ${CROSS}g++) set(AS_EXECUTABLE ${CROSS}as) -set(CMAKE_C_COMPILER_ARG1 - "-march=armv7-a -mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}") -set(CMAKE_CXX_COMPILER_ARG1 - "-march=armv7-a -mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}") +set(CMAKE_C_COMPILER_ARG1 "-march=armv7-a ${AOM_EXTRA_TOOLCHAIN_FLAGS}") +set(CMAKE_CXX_COMPILER_ARG1 "-march=armv7-a ${AOM_EXTRA_TOOLCHAIN_FLAGS}") set(AOM_AS_FLAGS --defsym ARCHITECTURE=7 -march=armv7-a -mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}) set(CMAKE_SYSTEM_PROCESSOR "armv7") -# No intrinsics flag required for armv7-linux-gcc. -set(AOM_NEON_INTRIN_FLAG "") +set(AOM_NEON_INTRIN_FLAG "-mfpu=neon") # No runtime cpu detect for armv7-linux-gcc. set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "") diff --git a/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake b/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake index cf06a11..eb488ec 100644 --- a/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake +++ b/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake @@ -27,6 +27,3 @@ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer) # No runtime cpu detect for armv7-mingw-gcc. set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "") - -# Disable the use of the gtest's CMake support. 
-set(AOM_DISABLE_GTEST_CMAKE 1) diff --git a/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake b/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake index c986c4e..4839c9d 100644 --- a/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake +++ b/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake @@ -26,6 +26,3 @@ set(CMAKE_C_COMPILER ${CROSS}gcc) set(CMAKE_CXX_COMPILER ${CROSS}g++) set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver) set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer) - -# Disable the use of the gtest's CMake support. -set(AOM_DISABLE_GTEST_CMAKE 1) diff --git a/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake b/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake index 00d94d5..4b2d28d 100644 --- a/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake +++ b/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake @@ -24,6 +24,3 @@ set(CMAKE_C_COMPILER ${CROSS}gcc) set(CMAKE_CXX_COMPILER ${CROSS}g++) set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver) set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer) - -# Disable the use of the gtest's CMake support. 
-set(AOM_DISABLE_GTEST_CMAKE 1) diff --git a/libaom/common/av1_config.c b/libaom/common/av1_config.c index e8decf7..90955fb 100644 --- a/libaom/common/av1_config.c +++ b/libaom/common/av1_config.c @@ -322,7 +322,7 @@ static int parse_sequence_header(const uint8_t *const buffer, size_t length, AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_height_minus_1, frame_height_bits_minus_1 + 1); - int frame_id_numbers_present = 0; + uint8_t frame_id_numbers_present = 0; if (!reduced_still_picture_header) { AV1C_READ_BIT_OR_RETURN_ERROR(frame_id_numbers_present_flag); frame_id_numbers_present = frame_id_numbers_present_flag; @@ -345,7 +345,7 @@ static int parse_sequence_header(const uint8_t *const buffer, size_t length, AV1C_READ_BIT_OR_RETURN_ERROR(enable_order_hint); if (enable_order_hint) { - AV1C_READ_BIT_OR_RETURN_ERROR(enable_jnt_comp); + AV1C_READ_BIT_OR_RETURN_ERROR(enable_dist_wtd_comp); AV1C_READ_BIT_OR_RETURN_ERROR(enable_ref_frame_mvs); } diff --git a/libaom/common/rawenc.c b/libaom/common/rawenc.c index 5a2731d..b72132c 100644 --- a/libaom/common/rawenc.c +++ b/libaom/common/rawenc.c @@ -9,36 +9,88 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include <stdbool.h> #include "common/rawenc.h" -void raw_write_image_file(const aom_image_t *img, const int *planes, - const int num_planes, FILE *file) { - const int bytes_per_sample = ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); - for (int i = 0; i < num_planes; ++i) { - const int plane = planes[i]; - const unsigned char *buf = img->planes[plane]; - const int stride = img->stride[plane]; - const int w = aom_img_plane_width(img, plane); - const int h = aom_img_plane_height(img, plane); - for (int y = 0; y < h; ++y) { - fwrite(buf, bytes_per_sample, w, file); - buf += stride; +#define BATCH_SIZE 8 +// When writing greyscale color, batch 8 writes for low bit-depth, 4 writes +// for high bit-depth. 
+static const uint8_t batched[BATCH_SIZE] = { 128, 128, 128, 128, + 128, 128, 128, 128 }; +static const uint8_t batched_hbd[BATCH_SIZE] = { + 0, 128, 0, 128, 0, 128, 0, 128 +}; + +// Interface to writing to either a file or MD5Context. Takes a pointer to +// either the file or MD5Context, the buffer, the size of each element, and +// number of elements to write. Note that size and nmemb (last two args) must +// be unsigned int, as the interface to MD5Update requires that. +typedef void (*WRITER)(void *, const uint8_t *, unsigned int, unsigned int); + +static void write_file(void *fp, const uint8_t *buffer, unsigned int size, + unsigned int nmemb) { + fwrite(buffer, size, nmemb, (FILE *)fp); +} + +static void write_md5(void *md5, const uint8_t *buffer, unsigned int size, + unsigned int nmemb) { + MD5Update((MD5Context *)md5, buffer, size * nmemb); +} + +// Writes out n greyscale values. +static void write_greyscale(const bool high_bitdepth, int n, WRITER writer_func, + void *file_or_md5) { + const uint8_t *b = batched; + if (high_bitdepth) { + b = batched_hbd; + } + const int num_batched_writes = + high_bitdepth ? n / (BATCH_SIZE / 2) : n / BATCH_SIZE; + for (int i = 0; i < num_batched_writes; ++i) { + writer_func(file_or_md5, b, sizeof(uint8_t), BATCH_SIZE); + } + const int remaining = high_bitdepth ? n % (BATCH_SIZE / 2) : n % BATCH_SIZE; + for (int i = 0; i < remaining; ++i) { + if (high_bitdepth) { + writer_func(file_or_md5, batched_hbd, sizeof(uint8_t), 2); + } else { + writer_func(file_or_md5, batched, sizeof(uint8_t), 1); } } } -void raw_update_image_md5(const aom_image_t *img, const int *planes, - const int num_planes, MD5Context *md5) { +// Encapsulates the logic for writing raw data to either an image file or +// to an MD5 context. 
+static void raw_write_image_file_or_md5(const aom_image_t *img, + const int *planes, const int num_planes, + void *file_or_md5, WRITER writer_func) { + const bool high_bitdepth = img->fmt & AOM_IMG_FMT_HIGHBITDEPTH; + const int bytes_per_sample = high_bitdepth ? 2 : 1; for (int i = 0; i < num_planes; ++i) { const int plane = planes[i]; + const int w = aom_img_plane_width(img, plane); + const int h = aom_img_plane_height(img, plane); + // If we're on a color plane and the output is monochrome, write a greyscale + // value. Since there are only YUV planes, compare against Y. + if (img->monochrome && plane != AOM_PLANE_Y) { + write_greyscale(high_bitdepth, w * h, writer_func, file_or_md5); + continue; + } const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = aom_img_plane_width(img, plane) * - ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1); - const int h = aom_img_plane_height(img, plane); for (int y = 0; y < h; ++y) { - MD5Update(md5, buf, w); + writer_func(file_or_md5, buf, bytes_per_sample, w); buf += stride; } } } + +void raw_write_image_file(const aom_image_t *img, const int *planes, + const int num_planes, FILE *file) { + raw_write_image_file_or_md5(img, planes, num_planes, file, write_file); +} + +void raw_update_image_md5(const aom_image_t *img, const int *planes, + const int num_planes, MD5Context *md5) { + raw_write_image_file_or_md5(img, planes, num_planes, md5, write_md5); +} diff --git a/libaom/common/tools_common.c b/libaom/common/tools_common.c index 2e32f61..51c1c52 100644 --- a/libaom/common/tools_common.c +++ b/libaom/common/tools_common.c @@ -149,6 +149,11 @@ const AvxInterface *get_aom_encoder_by_name(const char *name) { return NULL; } + +// large scale tile encoding +static const AvxInterface aom_lst_encoder = { "av1", LST_FOURCC, + &aom_codec_av1_cx }; +const AvxInterface *get_aom_lst_encoder(void) { return &aom_lst_encoder; } #endif // CONFIG_AV1_ENCODER #if CONFIG_AV1_DECODER diff --git 
a/libaom/common/tools_common.h b/libaom/common/tools_common.h index df3b62b..d9a68f0 100644 --- a/libaom/common/tools_common.h +++ b/libaom/common/tools_common.h @@ -18,6 +18,7 @@ #include "aom/aom_codec.h" #include "aom/aom_image.h" #include "aom/aom_integer.h" +#include "aom_ports/mem.h" #include "aom_ports/msvc.h" #if CONFIG_AV1_ENCODER @@ -78,11 +79,14 @@ enum VideoFileType { }; // Used in lightfield example. -typedef enum OUTPUT_FORMAT { +enum { YUV1D, // 1D tile output for conformance test. YUV, // Tile output in YUV format. NV12, // Tile output in NV12 format. -} OUTPUT_FORMAT; +} UENUM1BYTE(OUTPUT_FORMAT); + +// The fourcc for large_scale_tile encoding is "LSTC". +#define LST_FOURCC 0x4354534c struct FileTypeDetectionBuffer { char buf[4]; @@ -149,6 +153,7 @@ typedef struct AvxInterface { int get_aom_encoder_count(void); const AvxInterface *get_aom_encoder_by_index(int i); const AvxInterface *get_aom_encoder_by_name(const char *name); +const AvxInterface *get_aom_lst_encoder(void); int get_aom_decoder_count(void); const AvxInterface *get_aom_decoder_by_index(int i); diff --git a/libaom/common/video_reader.c b/libaom/common/video_reader.c index 47ad6e1..7b021bc 100644 --- a/libaom/common/video_reader.c +++ b/libaom/common/video_reader.c @@ -121,3 +121,7 @@ FILE *aom_video_reader_get_file(AvxVideoReader *reader) { const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) { return &reader->info; } + +void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc) { + reader->info.codec_fourcc = fourcc; +} diff --git a/libaom/common/video_reader.h b/libaom/common/video_reader.h index 903deae..9ab439e 100644 --- a/libaom/common/video_reader.h +++ b/libaom/common/video_reader.h @@ -50,6 +50,9 @@ FILE *aom_video_reader_get_file(AvxVideoReader *reader); // Fills AvxVideoInfo with information from opened video file. const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader); +// Set fourcc. 
+void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libaom/common/video_writer.c b/libaom/common/video_writer.c index a7ec309..2b42e36 100644 --- a/libaom/common/video_writer.c +++ b/libaom/common/video_writer.c @@ -75,3 +75,7 @@ int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer, return 1; } + +void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc) { + writer->info.codec_fourcc = fourcc; +} diff --git a/libaom/common/video_writer.h b/libaom/common/video_writer.h index 3e2b655..8712d47 100644 --- a/libaom/common/video_writer.h +++ b/libaom/common/video_writer.h @@ -14,7 +14,7 @@ #include "common/video_common.h" -typedef enum { kContainerIVF } AvxContainer; +enum { kContainerIVF } UENUM1BYTE(AvxContainer); struct AvxVideoWriterStruct; typedef struct AvxVideoWriterStruct AvxVideoWriter; @@ -37,6 +37,8 @@ void aom_video_writer_close(AvxVideoWriter *writer); // Writes frame bytes to the file. int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer, size_t size, int64_t pts); +// Set fourcc. +void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc); #ifdef __cplusplus } // extern "C" diff --git a/libaom/common/webmenc.h b/libaom/common/webmenc.h index 4cdfd68..a4aa992 100644 --- a/libaom/common/webmenc.h +++ b/libaom/common/webmenc.h @@ -30,13 +30,13 @@ struct WebmOutputContext { }; /* Stereo 3D packed frame format */ -typedef enum stereo_format { +enum { STEREO_FORMAT_MONO = 0, STEREO_FORMAT_LEFT_RIGHT = 1, STEREO_FORMAT_BOTTOM_TOP = 2, STEREO_FORMAT_TOP_BOTTOM = 3, STEREO_FORMAT_RIGHT_LEFT = 11 -} stereo_format_t; +} UENUM1BYTE(stereo_format_t); // The following functions wrap libwebm's mkvmuxer. All functions return 0 upon // success, or -1 upon failure. 
diff --git a/libaom/examples/analyzer.cc b/libaom/examples/analyzer.cc index 6a42eca..261d085 100644 --- a/libaom/examples/analyzer.cc +++ b/libaom/examples/analyzer.cc @@ -162,7 +162,7 @@ bool AV1Decoder::setInspectionCallback() { void AV1Decoder::inspect(void *pbi, void *data) { AV1Decoder *decoder = (AV1Decoder *)data; - ifd_inspect(&decoder->frame_data, pbi); + ifd_inspect(&decoder->frame_data, pbi, 0); } #define MIN_ZOOM (1) diff --git a/libaom/examples/av1_dec_fuzzer.cc b/libaom/examples/av1_dec_fuzzer.cc new file mode 100644 index 0000000..96d16a8 --- /dev/null +++ b/libaom/examples/av1_dec_fuzzer.cc @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +/* + * See build_av1_dec_fuzzer.sh for building instructions. 
+ */ + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <memory> + +#include "config/aom_config.h" +#include "aom/aom_decoder.h" +#include "aom/aomdx.h" +#include "aom_ports/mem_ops.h" +#include "common/ivfdec.h" + +static void close_file(FILE *file) { fclose(file); } + +extern "C" void usage_exit(void) { exit(EXIT_FAILURE); } + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + std::unique_ptr<FILE, decltype(&close_file)> file( + fmemopen((void *)data, size, "rb"), &close_file); + if (file == nullptr) { + return 0; + } + + char header[32]; + if (fread(header, 1, 32, file.get()) != 32) { + return 0; + } + const AvxInterface *decoder = get_aom_decoder_by_name("av1"); + if (decoder == nullptr) { + return 0; + } + + aom_codec_ctx_t codec; + // Set thread count in the range [1, 64]. + const unsigned int threads = (header[0] & 0x3f) + 1; + aom_codec_dec_cfg_t cfg = { threads, 0, 0, CONFIG_LOWBITDEPTH }; + if (aom_codec_dec_init(&codec, decoder->codec_interface(), &cfg, 0)) { + return 0; + } + + uint8_t *buffer = nullptr; + size_t buffer_size = 0; + size_t frame_size = 0; + while (!ivf_read_frame(file.get(), &buffer, &frame_size, &buffer_size, + nullptr)) { + const aom_codec_err_t err = + aom_codec_decode(&codec, buffer, frame_size, nullptr); + static_cast<void>(err); + aom_codec_iter_t iter = nullptr; + aom_image_t *img = nullptr; + while ((img = aom_codec_get_frame(&codec, &iter)) != nullptr) { + } + } + aom_codec_destroy(&codec); + free(buffer); + return 0; +} diff --git a/libaom/examples/build_av1_dec_fuzzer.sh b/libaom/examples/build_av1_dec_fuzzer.sh new file mode 100755 index 0000000..86992a0 --- /dev/null +++ b/libaom/examples/build_av1_dec_fuzzer.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# +# Copyright (c) 2019, Alliance for Open Media. All rights reserved +# +# This source code is subject to the terms of the BSD 2 Clause License and +# the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License +# was not distributed with this source code in the LICENSE file, you can +# obtain it at www.aomedia.org/license/software. If the Alliance for Open +# Media Patent License 1.0 was not distributed with this source code in the +# PATENTS file, you can obtain it at www.aomedia.org/license/patent. +# +############################################################################### +# Fuzzer for libaom decoder. +# ========================== +# Requirements +# --------------------- +# Clang6.0 or above (must support -fsanitize=fuzzer) +# +# References: +# --------------------- +# http://llvm.org/docs/LibFuzzer.html +# https://github.com/google/oss-fuzz +# +# Steps to build / run +# --------------------- + +set -eu + +# Have a copy of AOM and a build directory ready. +if [[ $# -ne 2 ]]; then + echo "Pass in the AOM source tree as first argument, and a build directory " + echo "as the second argument. The AOM source tree can be obtained via: " + echo " git clone https://aomedia.googlesource.com/aom" + exit 2 +fi +if [[ -z "$CC" ]]; then + echo "Set the CC environment variable to point to your C compiler." + exit 2 +fi +if [[ -z "$CXX" ]]; then + echo "Set the CXX environment variable to point to your C++ compiler." + exit 2 +fi + +AOM_DIR=$1 +BUILD_DIR=$2 +# Run CMake with address sanitizer enabled and build the codec. +# Enable DO_RANGE_CHECK_CLAMP to suppress the noise of integer overflows +# in the transform functions. Also set memory limits. +EXTRA_C_FLAGS='-DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824' +cd "${BUILD_DIR}" +cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \ + -DCONFIG_SCALABILITY=0 -DCONFIG_LOWBITDEPTH=1 -DCONFIG_AV1_ENCODER=0 \ + -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 -DCONFIG_SIZE_LIMIT=1 \ + -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \ + -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \ + -DAOM_EXTRA_CXX_FLAGS="${EXTRA_C_FLAGS}" -DSANITIZE=address + +# Build the codec. 
+make -j$(nproc) + +# Build some libaom utils that are not part of the core lib. +$CC -std=c99 -c -I${AOM_DIR} -I${BUILD_DIR} \ + ${AOM_DIR}/common/ivfdec.c -o ${BUILD_DIR}/ivfdec.o + +$CC -std=c99 -c -I${AOM_DIR} -I${BUILD_DIR} \ + ${AOM_DIR}/common/tools_common.c -o ${BUILD_DIR}/tools_common.o + +# Build the av1 fuzzer +$CXX -std=c++11 -DDECODER=av1 -I${AOM_DIR} -I${BUILD_DIR} \ + -fsanitize=fuzzer -Wl,--start-group \ + ${AOM_DIR}/examples/av1_dec_fuzzer.cc -o ${BUILD_DIR}/av1_dec_fuzzer \ + ${BUILD_DIR}/libaom.a ${BUILD_DIR}/ivfdec.o ${BUILD_DIR}/tools_common.o \ + -Wl,--end-group + +echo "Fuzzer built at ${BUILD_DIR}/av1_dec_fuzzer." +echo "Create a corpus directory, copy IVF files in there, and run:" +echo " av1_dec_fuzzer CORPUS_DIR" diff --git a/libaom/examples/inspect.c b/libaom/examples/inspect.c index 7b7b3cd..9ca2a02 100644 --- a/libaom/examples/inspect.c +++ b/libaom/examples/inspect.c @@ -62,7 +62,10 @@ typedef enum { SEGMENT_ID_LAYER = 1 << 14, MOTION_MODE_LAYER = 1 << 15, COMPOUND_TYPE_LAYER = 1 << 16, - ALL_LAYERS = (1 << 17) - 1 + INTRABC_LAYER = 1 << 17, + PALETTE_LAYER = 1 << 18, + UV_PALETTE_LAYER = 1 << 19, + ALL_LAYERS = (1 << 20) - 1 } LayerType; static LayerType layers = 0; @@ -106,7 +109,20 @@ static const arg_def_t dump_delta_q_arg = ARG_DEF("dq", "delta_q", 0, "Dump QIndex"); static const arg_def_t dump_seg_id_arg = ARG_DEF("si", "seg_id", 0, "Dump Segment ID"); +static const arg_def_t dump_intrabc_arg = + ARG_DEF("ibc", "intrabc", 0, "Dump If IntraBC Is Used"); +static const arg_def_t dump_palette_arg = + ARG_DEF("plt", "palette", 0, "Dump Palette Size"); +static const arg_def_t dump_uv_palette_arg = + ARG_DEF("uvp", "uv_palette", 0, "Dump UV Palette Size"); static const arg_def_t usage_arg = ARG_DEF("h", "help", 0, "Help"); +static const arg_def_t skip_non_transform_arg = ARG_DEF( + "snt", "skip_non_transform", 1, "Skip is counted as a non transform."); +static const arg_def_t combined_arg = + ARG_DEF("comb", "combined", 1, "combinining 
parameters into one output."); + +int combined_parm_list[15]; +int combined_parm_count = 0; static const arg_def_t *main_args[] = { &limit_arg, &dump_all_arg, @@ -130,7 +146,12 @@ static const arg_def_t *main_args[] = { &limit_arg, &dump_motion_vectors_arg, &dump_delta_q_arg, &dump_seg_id_arg, + &dump_intrabc_arg, + &dump_palette_arg, + &dump_uv_palette_arg, &usage_arg, + &skip_non_transform_arg, + &combined_arg, NULL }; #define ENUM(name) \ { #name, name } @@ -158,6 +179,8 @@ const map_entry block_size_map[] = { ENUM(BLOCK_64X16), LAST_ENUM }; +#define TX_SKIP -1 + const map_entry tx_size_map[] = { ENUM(TX_4X4), ENUM(TX_8X8), ENUM(TX_16X16), ENUM(TX_32X32), ENUM(TX_64X64), ENUM(TX_4X8), ENUM(TX_8X4), ENUM(TX_8X16), @@ -225,10 +248,57 @@ const map_entry uv_prediction_mode_map[] = { const map_entry skip_map[] = { ENUM(SKIP), ENUM(NO_SKIP), LAST_ENUM }; +const map_entry intrabc_map[] = { + { "INTRABC", 1 }, { "NO_INTRABC", 0 }, LAST_ENUM +}; + +const map_entry palette_map[] = { + { "ZERO_COLORS", 0 }, { "TWO_COLORS", 2 }, { "THREE_COLORS", 3 }, + { "FOUR_COLORS", 4 }, { "FIVE_COLORS", 5 }, { "SIX_COLORS", 6 }, + { "SEVEN_COLORS", 7 }, { "EIGHT_COLORS", 8 }, LAST_ENUM +}; + const map_entry config_map[] = { ENUM(MI_SIZE), LAST_ENUM }; static const char *exec_name; +struct parm_offset { + char parm[60]; + char offset; +}; +struct parm_offset parm_offsets[] = { + { "blockSize", offsetof(insp_mi_data, sb_type) }, + { "transformSize", offsetof(insp_mi_data, tx_size) }, + { "transformType", offsetof(insp_mi_data, tx_type) }, + { "dualFilterType", offsetof(insp_mi_data, dual_filter_type) }, + { "mode", offsetof(insp_mi_data, mode) }, + { "uv_mode", offsetof(insp_mi_data, uv_mode) }, + { "motion_mode", offsetof(insp_mi_data, motion_mode) }, + { "compound_type", offsetof(insp_mi_data, compound_type) }, + { "referenceFrame", offsetof(insp_mi_data, ref_frame) }, + { "skip", offsetof(insp_mi_data, skip) }, +}; +int parm_count = sizeof(parm_offsets) / sizeof(parm_offsets[0]); + 
+int convert_to_indices(char *str, int *indices, int maxCount, int *count) { + *count = 0; + do { + char *comma = strchr(str, ','); + int length = (comma ? (int)(comma - str) : (int)strlen(str)); + int i; + for (i = 0; i < parm_count; ++i) { + if (!strncmp(str, parm_offsets[i].parm, length)) { + break; + } + } + if (i == parm_count) return 0; + indices[(*count)++] = i; + if (*count > maxCount) return 0; + str += length + 1; + } while (strlen(str) > 0); + return 1; +} + insp_frame_data frame_data; int frame_count = 0; int decoded_frame_count = 0; @@ -399,6 +469,38 @@ int put_motion_vectors(char *buffer) { return (int)(buf - buffer); } +int put_combined(char *buffer) { + const int mi_rows = frame_data.mi_rows; + const int mi_cols = frame_data.mi_cols; + char *buf = buffer; + int r, c, p; + buf += put_str(buf, " \""); + for (p = 0; p < combined_parm_count; ++p) { + if (p) buf += put_str(buf, "&"); + buf += put_str(buf, parm_offsets[combined_parm_list[p]].parm); + } + buf += put_str(buf, "\": ["); + for (r = 0; r < mi_rows; ++r) { + *(buf++) = '['; + for (c = 0; c < mi_cols; ++c) { + insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c]; + *(buf++) = '['; + for (p = 0; p < combined_parm_count; ++p) { + if (p) *(buf++) = ','; + int16_t *v = (int16_t *)(((int8_t *)mi) + + parm_offsets[combined_parm_list[p]].offset); + buf += put_num(buf, 0, v[0], 0); + } + *(buf++) = ']'; + if (c < mi_cols - 1) *(buf++) = ','; + } + *(buf++) = ']'; + if (r < mi_rows - 1) *(buf++) = ','; + } + buf += put_str(buf, "],\n"); + return (int)(buf - buffer); +} + int put_block_info(char *buffer, const map_entry *map, const char *name, size_t offset, int len) { const int mi_rows = frame_data.mi_rows; @@ -507,9 +609,11 @@ int put_accounting(char *buffer) { } #endif +int skip_non_transform = 0; + void inspect(void *pbi, void *data) { /* Fetch frame data. 
*/ - ifd_inspect(&frame_data, pbi); + ifd_inspect(&frame_data, pbi, skip_non_transform); // Show existing frames just show a reference buffer we've already decoded. // There's no information to show. @@ -584,6 +688,19 @@ void inspect(void *pbi, void *data) { if (layers & MOTION_VECTORS_LAYER) { buf += put_motion_vectors(buf); } + if (layers & INTRABC_LAYER) { + buf += put_block_info(buf, intrabc_map, "intrabc", + offsetof(insp_mi_data, intrabc), 0); + } + if (layers & PALETTE_LAYER) { + buf += put_block_info(buf, palette_map, "palette", + offsetof(insp_mi_data, palette), 0); + } + if (layers & UV_PALETTE_LAYER) { + buf += put_block_info(buf, palette_map, "uv_palette", + offsetof(insp_mi_data, uv_palette), 0); + } + if (combined_parm_count > 0) buf += put_combined(buf); if (layers & REFERENCE_FRAME_LAYER) { buf += put_block_info(buf, refs_map, "referenceFrame", offsetof(insp_mi_data, ref_frame), 2); @@ -775,6 +892,12 @@ static void parse_args(char **argv) { layers |= Q_INDEX_LAYER; else if (arg_match(&arg, &dump_seg_id_arg, argi)) layers |= SEGMENT_ID_LAYER; + else if (arg_match(&arg, &dump_intrabc_arg, argi)) + layers |= INTRABC_LAYER; + else if (arg_match(&arg, &dump_palette_arg, argi)) + layers |= PALETTE_LAYER; + else if (arg_match(&arg, &dump_uv_palette_arg, argi)) + layers |= UV_PALETTE_LAYER; else if (arg_match(&arg, &dump_all_arg, argi)) layers |= ALL_LAYERS; else if (arg_match(&arg, &compress_arg, argi)) @@ -783,6 +906,13 @@ static void parse_args(char **argv) { usage_exit(); else if (arg_match(&arg, &limit_arg, argi)) stop_after = arg_parse_uint(&arg); + else if (arg_match(&arg, &skip_non_transform_arg, argi)) + skip_non_transform = arg_parse_uint(&arg); + else if (arg_match(&arg, &combined_arg, argi)) + convert_to_indices( + (char *)arg.val, combined_parm_list, + sizeof(combined_parm_list) / sizeof(combined_parm_list[0]), + &combined_parm_count); else argj++; } diff --git a/libaom/examples/lightfield_bitstream_parsing.c 
b/libaom/examples/lightfield_bitstream_parsing.c index 9c90671..afacf44 100644 --- a/libaom/examples/lightfield_bitstream_parsing.c +++ b/libaom/examples/lightfield_bitstream_parsing.c @@ -211,6 +211,8 @@ int main(int argc, char **argv) { num_references = (int)strtol(argv[3], NULL, 0); info = aom_video_reader_get_info(reader); + aom_video_reader_set_fourcc(reader, AV1_FOURCC); + // The writer to write out ivf file in tile list OBU, which can be decoded by // AV1 decoder. writer = aom_video_writer_open(argv[2], kContainerIVF, info); diff --git a/libaom/examples/lightfield_decoder.c b/libaom/examples/lightfield_decoder.c index 23dac98..7a445f0 100644 --- a/libaom/examples/lightfield_decoder.c +++ b/libaom/examples/lightfield_decoder.c @@ -188,8 +188,10 @@ int main(int argc, char **argv) { info = aom_video_reader_get_info(reader); - decoder = get_aom_decoder_by_fourcc(info->codec_fourcc); - if (!decoder) die("Unknown input codec."); + if (info->codec_fourcc == LST_FOURCC) + decoder = get_aom_decoder_by_fourcc(AV1_FOURCC); + else + die("Unknown input codec."); printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface())); if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0)) @@ -218,7 +220,7 @@ int main(int argc, char **argv) { // Allocate memory to store decoded references. Allocate memory with the // border so that it can be used as a reference. 
for (j = 0; j < num_references; j++) { - unsigned int border = AOM_BORDER_IN_PIXELS; + unsigned int border = AOM_DEC_BORDER_IN_PIXELS; if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, frame_res[0], frame_res[1], 32, 8, border)) { diff --git a/libaom/examples/lightfield_encoder.c b/libaom/examples/lightfield_encoder.c index e55cd5c..4dd71ca 100644 --- a/libaom/examples/lightfield_encoder.c +++ b/libaom/examples/lightfield_encoder.c @@ -275,9 +275,13 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name, aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420; if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH; // Allocate memory with the border so that it can be used as a reference. + int border_in_pixels = + (codec.config.enc->rc_resize_mode || codec.config.enc->rc_superres_mode) + ? AOM_BORDER_IN_PIXELS + : AOM_ENC_NO_SCALE_BORDER; for (i = 0; i < reference_image_num; i++) { if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, cfg->g_w, - cfg->g_h, 32, 8, AOM_BORDER_IN_PIXELS)) { + cfg->g_h, 32, 8, border_in_pixels)) { die("Failed to allocate image."); } } @@ -393,6 +397,10 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name, for (i = 0; i < reference_image_num; i++) aom_img_free(&reference_images[i]); if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec."); + + // Modify large_scale_file fourcc. + if (cfg->large_scale_tile == 1) + aom_video_writer_set_fourcc(writer, LST_FOURCC); aom_video_writer_close(writer); printf("\nSecond pass complete. Processed %d frames.\n", frame_count); diff --git a/libaom/examples/lightfield_tile_list_decoder.c b/libaom/examples/lightfield_tile_list_decoder.c index 4aabde1..87a8b43 100644 --- a/libaom/examples/lightfield_tile_list_decoder.c +++ b/libaom/examples/lightfield_tile_list_decoder.c @@ -160,7 +160,7 @@ int main(int argc, char **argv) { // Allocate memory to store decoded references. 
Allocate memory with the // border so that it can be used as a reference. for (j = 0; j < num_references; j++) { - unsigned int border = AOM_BORDER_IN_PIXELS; + unsigned int border = AOM_DEC_BORDER_IN_PIXELS; if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt, frame_res[0], frame_res[1], 32, 8, border)) { diff --git a/libaom/test/av1_convolve_2d_test.cc b/libaom/test/av1_convolve_2d_test.cc index 825cef2..b0cef81 100644 --- a/libaom/test/av1_convolve_2d_test.cc +++ b/libaom/test/av1_convolve_2d_test.cc @@ -19,6 +19,7 @@ using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DSrTest; using libaom_test::AV1HighbdConvolve2D::AV1HighbdJntConvolve2DTest; using ::testing::make_tuple; using ::testing::tuple; + namespace { TEST_P(AV1Convolve2DSrTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } @@ -89,72 +90,72 @@ INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1Convolve2DSrTest, TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } TEST_P(AV1JntConvolve2DTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } -INSTANTIATE_TEST_CASE_P( - C_COPY, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_copy_c, 0, 0)); +INSTANTIATE_TEST_CASE_P(C_COPY, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_copy_c, 0, 0)); INSTANTIATE_TEST_CASE_P( C_X, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_c, 1, 0)); + libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_x_c, 1, 0)); INSTANTIATE_TEST_CASE_P( C_Y, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_c, 0, 1)); + libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_y_c, 0, 1)); #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1JntConvolve2DTest, libaom_test::AV1Convolve2D::BuildParams( - av1_jnt_convolve_2d_copy_sse2, 0, 0)); -INSTANTIATE_TEST_CASE_P( - SSE2, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_sse2, 
1, 1)); + av1_dist_wtd_convolve_2d_copy_sse2, 0, 0)); +INSTANTIATE_TEST_CASE_P(SSE2, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_sse2, 1, 1)); -INSTANTIATE_TEST_CASE_P( - SSE2_X, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_sse2, 1, 0)); +INSTANTIATE_TEST_CASE_P(SSE2_X, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_x_sse2, 1, 0)); -INSTANTIATE_TEST_CASE_P( - SSE2_Y, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_sse2, 0, 1)); +INSTANTIATE_TEST_CASE_P(SSE2_Y, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_y_sse2, 0, 1)); #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( - SSSE3, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_ssse3, 1, 1)); +INSTANTIATE_TEST_CASE_P(SSSE3, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_ssse3, 1, 1)); #if HAVE_AVX2 INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1JntConvolve2DTest, libaom_test::AV1Convolve2D::BuildParams( - av1_jnt_convolve_2d_copy_avx2, 0, 0)); -INSTANTIATE_TEST_CASE_P( - AVX2_X, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_avx2, 1, 0)); + av1_dist_wtd_convolve_2d_copy_avx2, 0, 0)); +INSTANTIATE_TEST_CASE_P(AVX2_X, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_x_avx2, 1, 0)); -INSTANTIATE_TEST_CASE_P( - AVX2_Y, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_avx2, 0, 1)); +INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_y_avx2, 0, 1)); -INSTANTIATE_TEST_CASE_P( - AVX2, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_avx2, 1, 1)); +INSTANTIATE_TEST_CASE_P(AVX2, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + 
av1_dist_wtd_convolve_2d_avx2, 1, 1)); #endif // HAVE_AVX2 #endif // HAVE_SSSE3 #endif // HAVE_SSE2 #if HAVE_NEON INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1JntConvolve2DTest, libaom_test::AV1Convolve2D::BuildParams( - av1_jnt_convolve_2d_copy_neon, 0, 0)); + av1_dist_wtd_convolve_2d_copy_neon, 0, 0)); -INSTANTIATE_TEST_CASE_P( - NEON, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_neon, 1, 1)); -INSTANTIATE_TEST_CASE_P( - NEON_X, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_neon, 1, 0)); +INSTANTIATE_TEST_CASE_P(NEON, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_2d_neon, 1, 1)); +INSTANTIATE_TEST_CASE_P(NEON_X, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_x_neon, 1, 0)); -INSTANTIATE_TEST_CASE_P( - NEON_Y, AV1JntConvolve2DTest, - libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_neon, 0, 1)); +INSTANTIATE_TEST_CASE_P(NEON_Y, AV1JntConvolve2DTest, + libaom_test::AV1Convolve2D::BuildParams( + av1_dist_wtd_convolve_y_neon, 0, 1)); #endif // HAVE_NEON TEST_P(AV1HighbdConvolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); } @@ -213,41 +214,41 @@ TEST_P(AV1HighbdJntConvolve2DTest, DISABLED_Speed) { INSTANTIATE_TEST_CASE_P(C_X, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( - av1_highbd_jnt_convolve_x_c, 1, 0)); + av1_highbd_dist_wtd_convolve_x_c, 1, 0)); INSTANTIATE_TEST_CASE_P(C_Y, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( - av1_highbd_jnt_convolve_y_c, 0, 1)); + av1_highbd_dist_wtd_convolve_y_c, 0, 1)); INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( - av1_highbd_jnt_convolve_2d_copy_c, 0, 0)); + av1_highbd_dist_wtd_convolve_2d_copy_c, 0, 0)); #if HAVE_SSE4_1 INSTANTIATE_TEST_CASE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( - 
av1_highbd_jnt_convolve_2d_copy_sse4_1, 0, 0)); + av1_highbd_dist_wtd_convolve_2d_copy_sse4_1, 0, 0)); INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( - av1_highbd_jnt_convolve_2d_sse4_1, 1, 1)); + av1_highbd_dist_wtd_convolve_2d_sse4_1, 1, 1)); INSTANTIATE_TEST_CASE_P(SSE4_1_X, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( - av1_highbd_jnt_convolve_x_sse4_1, 1, 0)); + av1_highbd_dist_wtd_convolve_x_sse4_1, 1, 0)); INSTANTIATE_TEST_CASE_P(SSE4_1_Y, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( - av1_highbd_jnt_convolve_y_sse4_1, 0, 1)); + av1_highbd_dist_wtd_convolve_y_sse4_1, 0, 1)); #if HAVE_AVX2 INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( - av1_highbd_jnt_convolve_2d_copy_avx2, 0, 0)); + av1_highbd_dist_wtd_convolve_2d_copy_avx2, 0, 0)); INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( - av1_highbd_jnt_convolve_2d_avx2, 1, 1)); + av1_highbd_dist_wtd_convolve_2d_avx2, 1, 1)); INSTANTIATE_TEST_CASE_P(AVX2_X, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( - av1_highbd_jnt_convolve_x_avx2, 1, 0)); + av1_highbd_dist_wtd_convolve_x_avx2, 1, 0)); INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1HighbdJntConvolve2DTest, libaom_test::AV1HighbdConvolve2D::BuildParams( - av1_highbd_jnt_convolve_y_avx2, 0, 1)); + av1_highbd_dist_wtd_convolve_y_avx2, 0, 1)); #endif // HAVE_AVX2 #endif // HAVE_SSE4_1 } // namespace diff --git a/libaom/test/av1_convolve_2d_test_util.cc b/libaom/test/av1_convolve_2d_test_util.cc index 409fd23..9cfe3e6 100644 --- a/libaom/test/av1_convolve_2d_test_util.cc +++ b/libaom/test/av1_convolve_2d_test_util.cc @@ -200,9 +200,9 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) { ConvolveParams conv_params2 = get_conv_params_no_round(do_average, 0, output2, MAX_SB_SIZE, 1, 8); - // 
Test special case where jnt_comp_avg is not used - conv_params1.use_jnt_comp_avg = 0; - conv_params2.use_jnt_comp_avg = 0; + // Test special case where dist_wtd_comp_avg is not used + conv_params1.use_dist_wtd_comp_avg = 0; + conv_params2.use_dist_wtd_comp_avg = 0; const int subx_range = has_subx ? 16 : 1; const int suby_range = has_suby ? 16 : 1; @@ -211,9 +211,10 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) { // Choose random locations within the source block const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); - av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w, output8_1, - MAX_SB_SIZE, out_w, out_h, filter_params_x, - filter_params_y, subx, suby, &conv_params1); + av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w, + output8_1, MAX_SB_SIZE, out_w, out_h, + filter_params_x, filter_params_y, subx, + suby, &conv_params1); test_impl(input + offset_r * w + offset_c, w, output8_2, MAX_SB_SIZE, out_w, out_h, filter_params_x, filter_params_y, subx, suby, &conv_params2); @@ -222,7 +223,7 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) { for (int j = 0; j < out_w; ++j) { int idx = i * MAX_SB_SIZE + j; ASSERT_EQ(output1[idx], output2[idx]) - << "Mismatch at unit tests for av1_jnt_convolve_2d\n" + << "Mismatch at unit tests for av1_dist_wtd_convolve_2d\n" << out_w << "x" << out_h << " Pixel mismatch at index " << idx << " = (" << i << ", " << j << "), sub pixel offset = (" << suby << ", " << subx << ")"; @@ -247,8 +248,8 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) { // Test different combination of fwd and bck offset weights for (int k = 0; k < 2; ++k) { for (int l = 0; l < 4; ++l) { - conv_params1.use_jnt_comp_avg = 1; - conv_params2.use_jnt_comp_avg = 1; + conv_params1.use_dist_wtd_comp_avg = 1; + conv_params2.use_dist_wtd_comp_avg = 1; conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0]; 
conv_params1.bck_offset = quant_dist_lookup_table[k][l][1]; conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0]; @@ -259,10 +260,10 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) { // Choose random locations within the source block const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); - av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w, - output8_1, MAX_SB_SIZE, out_w, out_h, - filter_params_x, filter_params_y, subx, - suby, &conv_params1); + av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w, + output8_1, MAX_SB_SIZE, out_w, out_h, + filter_params_x, filter_params_y, + subx, suby, &conv_params1); test_impl(input + offset_r * w + offset_c, w, output8_2, MAX_SB_SIZE, out_w, out_h, filter_params_x, filter_params_y, subx, suby, &conv_params2); @@ -272,7 +273,7 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) { int idx = i * MAX_SB_SIZE + j; ASSERT_EQ(output1[idx], output2[idx]) << "Mismatch at unit tests for " - "av1_jnt_convolve_2d\n" + "av1_dist_wtd_convolve_2d\n" << out_w << "x" << out_h << " Pixel mismatch at index " << idx << " = (" << i << ", " << j << "), sub pixel offset = (" << suby << ", " << subx @@ -333,7 +334,7 @@ void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) { ConvolveParams conv_params = get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, 8); - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; // Choose random locations within the source block const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); @@ -540,8 +541,8 @@ void AV1HighbdJntConvolve2DTest::RunSpeedTest( ConvolveParams conv_params = get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, bd); - // Test special case where jnt_comp_avg is not used - conv_params.use_jnt_comp_avg = 0; + // Test special case where dist_wtd_comp_avg is not used + conv_params.use_dist_wtd_comp_avg = 0; subx = 
0; suby = 0; @@ -601,9 +602,9 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput( ConvolveParams conv_params2 = get_conv_params_no_round( do_average, 0, output2, MAX_SB_SIZE, 1, bd); - // Test special case where jnt_comp_avg is not used - conv_params1.use_jnt_comp_avg = 0; - conv_params2.use_jnt_comp_avg = 0; + // Test special case where dist_wtd_comp_avg is not used + conv_params1.use_dist_wtd_comp_avg = 0; + conv_params2.use_dist_wtd_comp_avg = 0; const int subx_range = has_subx ? 16 : 1; const int suby_range = has_suby ? 16 : 1; @@ -612,10 +613,10 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput( // Choose random locations within the source block const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); - av1_highbd_jnt_convolve_2d_c(input + offset_r * w + offset_c, w, - output16_1, MAX_SB_SIZE, out_w, out_h, - filter_params_x, filter_params_y, subx, - suby, &conv_params1, bd); + av1_highbd_dist_wtd_convolve_2d_c( + input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE, + out_w, out_h, filter_params_x, filter_params_y, subx, suby, + &conv_params1, bd); test_impl(input + offset_r * w + offset_c, w, output16_2, MAX_SB_SIZE, out_w, out_h, filter_params_x, filter_params_y, subx, suby, &conv_params2, bd); @@ -648,8 +649,8 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput( // Test different combination of fwd and bck offset weights for (int k = 0; k < 2; ++k) { for (int l = 0; l < 4; ++l) { - conv_params1.use_jnt_comp_avg = 1; - conv_params2.use_jnt_comp_avg = 1; + conv_params1.use_dist_wtd_comp_avg = 1; + conv_params2.use_dist_wtd_comp_avg = 1; conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0]; conv_params1.bck_offset = quant_dist_lookup_table[k][l][1]; conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0]; @@ -662,7 +663,7 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput( // Choose random locations within the source block const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7); 
const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7); - av1_highbd_jnt_convolve_2d_c( + av1_highbd_dist_wtd_convolve_2d_c( input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE, out_w, out_h, filter_params_x, filter_params_y, subx, suby, &conv_params1, bd); diff --git a/libaom/test/av1_convolve_scale_test.cc b/libaom/test/av1_convolve_scale_test.cc index 1929c49..a933fc9 100644 --- a/libaom/test/av1_convolve_scale_test.cc +++ b/libaom/test/av1_convolve_scale_test.cc @@ -286,13 +286,13 @@ class ConvolveScaleTestBase : public ::testing::Test { } void SetConvParamOffset(int i, int j, int is_compound, int do_average, - int use_jnt_comp_avg) { + int use_dist_wtd_comp_avg) { if (i == -1 && j == -1) { - convolve_params_.use_jnt_comp_avg = use_jnt_comp_avg; + convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; convolve_params_.is_compound = is_compound; convolve_params_.do_average = do_average; } else { - convolve_params_.use_jnt_comp_avg = use_jnt_comp_avg; + convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg; convolve_params_.fwd_offset = quant_dist_lookup_table[i][j][0]; convolve_params_.bck_offset = quant_dist_lookup_table[i][j][1]; convolve_params_.is_compound = is_compound; @@ -312,12 +312,12 @@ class ConvolveScaleTestBase : public ::testing::Test { is_compound = 1; for (int do_average = 0; do_average < 2; do_average++) { - for (int use_jnt_comp_avg = 0; use_jnt_comp_avg < 2; - use_jnt_comp_avg++) { + for (int use_dist_wtd_comp_avg = 0; use_dist_wtd_comp_avg < 2; + use_dist_wtd_comp_avg++) { for (int j = 0; j < 2; ++j) { for (int k = 0; k < 4; ++k) { SetConvParamOffset(j, k, is_compound, do_average, - use_jnt_comp_avg); + use_dist_wtd_comp_avg); Prep(&rnd); RunOne(true); RunOne(false); diff --git a/libaom/test/av1_fwd_txfm2d_test.cc b/libaom/test/av1_fwd_txfm2d_test.cc index c1b97f7..eb09cb1 100644 --- a/libaom/test/av1_fwd_txfm2d_test.cc +++ b/libaom/test/av1_fwd_txfm2d_test.cc @@ -288,6 +288,68 @@ void 
AV1FwdTxfm2dMatchTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) { } } +void AV1FwdTxfm2dSpeedTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) { + TxfmParam param; + memset(¶m, 0, sizeof(param)); + const int rows = tx_size_high[tx_size]; + const int cols = tx_size_wide[tx_size]; + const int num_loops = 1000000 / (rows * cols); + + for (int i = 0; i < 2; ++i) { + const int bd = 8; + for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) { + if (libaom_test::IsTxSizeTypeValid( + tx_size, static_cast<TX_TYPE>(tx_type)) == false) { + continue; + } + + FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size]; + if (ref_func != NULL) { + DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 }; + DECLARE_ALIGNED(32, int32_t, output[64 * 64]); + DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]); + int input_stride = 64; + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + for (int r = 0; r < rows; ++r) { + for (int c = 0; c < cols; ++c) { + input[r * input_stride + c] = rnd.Rand16() % (1 << bd); + } + } + + param.tx_type = (TX_TYPE)tx_type; + param.tx_size = (TX_SIZE)tx_size; + param.tx_set_type = EXT_TX_SET_ALL16; + param.bd = bd; + + aom_usec_timer ref_timer, test_timer; + + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_loops; ++i) { + ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_loops; ++i) { + target_func(input, output, input_stride, ¶m); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf( + "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t " + "gain=%d \n", + tx_size, tx_type, elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } + } + } +} + typedef ::testing::tuple<TX_SIZE, lowbd_fwd_txfm_func> 
LbdFwdTxfm2dParam; class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {}; @@ -295,7 +357,9 @@ class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {}; TEST_P(AV1FwdTxfm2dTest, match) { AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1)); } - +TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) { + AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1)); +} using ::testing::Combine; using ::testing::Values; using ::testing::ValuesIn; @@ -507,5 +571,12 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdFwdTxfm2dTest, Combine(ValuesIn(Highbd_fwd_txfm_for_sse4_1), Values(av1_highbd_fwd_txfm))); #endif // HAVE_SSE4_1 +#if HAVE_AVX2 +static TX_SIZE Highbd_fwd_txfm_for_avx2[] = { TX_8X8, TX_16X16, TX_32X32, + TX_64X64, TX_8X16, TX_16X8 }; +INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdFwdTxfm2dTest, + Combine(ValuesIn(Highbd_fwd_txfm_for_avx2), + Values(av1_highbd_fwd_txfm))); +#endif // HAVE_AVX2 } // namespace diff --git a/libaom/test/av1_highbd_iht_test.cc b/libaom/test/av1_highbd_iht_test.cc index 7f077b6..6d77cbf 100644 --- a/libaom/test/av1_highbd_iht_test.cc +++ b/libaom/test/av1_highbd_iht_test.cc @@ -308,7 +308,8 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvTxfm2d, ::testing::Values(av1_highbd_inv_txfm_add_sse4_1)); #endif -#if HAVE_AVX2 +// TODO(http://crbug.com/aomedia/2350): these cause test vector mismatches. 
+#if 0 // HAVE_AVX2 INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdInvTxfm2d, ::testing::Values(av1_highbd_inv_txfm_add_avx2)); #endif diff --git a/libaom/test/av1_round_shift_array_test.cc b/libaom/test/av1_round_shift_array_test.cc index 181a394..61dbed5 100644 --- a/libaom/test/av1_round_shift_array_test.cc +++ b/libaom/test/av1_round_shift_array_test.cc @@ -13,7 +13,7 @@ #include <stdio.h> #include <stdlib.h> -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" diff --git a/libaom/test/av1_txfm_test.h b/libaom/test/av1_txfm_test.h index a181647..5a56d28 100644 --- a/libaom/test/av1_txfm_test.h +++ b/libaom/test/av1_txfm_test.h @@ -29,14 +29,14 @@ #include "av1/common/enums.h" namespace libaom_test { -typedef enum { +enum { TYPE_DCT = 0, TYPE_ADST, TYPE_IDTX, TYPE_IDCT, TYPE_IADST, TYPE_LAST -} TYPE_TXFM; +} UENUM1BYTE(TYPE_TXFM); int get_txfm1d_size(TX_SIZE tx_size); diff --git a/libaom/test/comp_avg_pred_test.cc b/libaom/test/comp_avg_pred_test.cc index 9c6ed90..3e5632e 100644 --- a/libaom/test/comp_avg_pred_test.cc +++ b/libaom/test/comp_avg_pred_test.cc @@ -12,61 +12,65 @@ #include "test/comp_avg_pred_test.h" using libaom_test::ACMRandom; -using libaom_test::AV1JNTCOMPAVG::AV1HighBDJNTCOMPAVGTest; -using libaom_test::AV1JNTCOMPAVG::AV1HighBDJNTCOMPAVGUPSAMPLEDTest; -using libaom_test::AV1JNTCOMPAVG::AV1JNTCOMPAVGTest; -using libaom_test::AV1JNTCOMPAVG::AV1JNTCOMPAVGUPSAMPLEDTest; +using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest; +using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest; +using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest; +using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest; using ::testing::make_tuple; using ::testing::tuple; namespace { -TEST_P(AV1JNTCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } +TEST_P(AV1DISTWTDCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } -TEST_P(AV1JNTCOMPAVGTest, 
CheckOutput) { RunCheckOutput(GET_PARAM(0)); } +TEST_P(AV1DISTWTDCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P( - SSSE3, AV1JNTCOMPAVGTest, - libaom_test::AV1JNTCOMPAVG::BuildParams(aom_jnt_comp_avg_pred_ssse3)); +INSTANTIATE_TEST_CASE_P(SSSE3, AV1DISTWTDCOMPAVGTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_dist_wtd_comp_avg_pred_ssse3)); #endif -TEST_P(AV1JNTCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { +TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } -TEST_P(AV1JNTCOMPAVGUPSAMPLEDTest, CheckOutput) { +TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); } #if HAVE_SSSE3 -INSTANTIATE_TEST_CASE_P(SSSE3, AV1JNTCOMPAVGUPSAMPLEDTest, - libaom_test::AV1JNTCOMPAVG::BuildParams( - aom_jnt_comp_avg_upsampled_pred_ssse3)); +INSTANTIATE_TEST_CASE_P(SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_dist_wtd_comp_avg_upsampled_pred_ssse3)); #endif -TEST_P(AV1HighBDJNTCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(1)); } +TEST_P(AV1HighBDDISTWTDCOMPAVGTest, DISABLED_Speed) { + RunSpeedTest(GET_PARAM(1)); +} -TEST_P(AV1HighBDJNTCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); } +TEST_P(AV1HighBDDISTWTDCOMPAVGTest, CheckOutput) { + RunCheckOutput(GET_PARAM(1)); +} #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDJNTCOMPAVGTest, - libaom_test::AV1JNTCOMPAVG::BuildParams( - aom_highbd_jnt_comp_avg_pred_sse2, 1)); +INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDDISTWTDCOMPAVGTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_highbd_dist_wtd_comp_avg_pred_sse2, 1)); #endif -TEST_P(AV1HighBDJNTCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { +TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(1)); } -TEST_P(AV1HighBDJNTCOMPAVGUPSAMPLEDTest, CheckOutput) { +TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); } #if HAVE_SSE2 
-INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDJNTCOMPAVGUPSAMPLEDTest, - libaom_test::AV1JNTCOMPAVG::BuildParams( - aom_highbd_jnt_comp_avg_upsampled_pred_sse2)); +INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, + libaom_test::AV1DISTWTDCOMPAVG::BuildParams( + aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2)); #endif } // namespace diff --git a/libaom/test/comp_avg_pred_test.h b/libaom/test/comp_avg_pred_test.h index 65a0153..01ea35d 100644 --- a/libaom/test/comp_avg_pred_test.h +++ b/libaom/test/comp_avg_pred_test.h @@ -25,72 +25,73 @@ namespace libaom_test { const int kMaxSize = 128 + 32; // padding -namespace AV1JNTCOMPAVG { +namespace AV1DISTWTDCOMPAVG { -typedef void (*jntcompavg_func)(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param); +typedef void (*distwtdcompavg_func)(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param); -typedef void (*jntcompavgupsampled_func)( +typedef void (*distwtdcompavgupsampled_func)( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref, - int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search); + int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search); -typedef void (*highbdjntcompavgupsampled_func)( +typedef void (*highbddistwtdcompavgupsampled_func)( MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col, const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, - int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, + int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search); -typedef ::testing::tuple<jntcompavg_func, 
BLOCK_SIZE> JNTCOMPAVGParam; +typedef ::testing::tuple<distwtdcompavg_func, BLOCK_SIZE> DISTWTDCOMPAVGParam; -typedef ::testing::tuple<jntcompavgupsampled_func, BLOCK_SIZE> - JNTCOMPAVGUPSAMPLEDParam; +typedef ::testing::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE> + DISTWTDCOMPAVGUPSAMPLEDParam; -typedef ::testing::tuple<int, jntcompavg_func, BLOCK_SIZE> - HighbdJNTCOMPAVGParam; +typedef ::testing::tuple<int, distwtdcompavg_func, BLOCK_SIZE> + HighbdDISTWTDCOMPAVGParam; -typedef ::testing::tuple<int, highbdjntcompavgupsampled_func, BLOCK_SIZE> - HighbdJNTCOMPAVGUPSAMPLEDParam; +typedef ::testing::tuple<int, highbddistwtdcompavgupsampled_func, BLOCK_SIZE> + HighbdDISTWTDCOMPAVGUPSAMPLEDParam; -::testing::internal::ParamGenerator<JNTCOMPAVGParam> BuildParams( - jntcompavg_func filter) { +::testing::internal::ParamGenerator<DISTWTDCOMPAVGParam> BuildParams( + distwtdcompavg_func filter) { return ::testing::Combine(::testing::Values(filter), ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); } -::testing::internal::ParamGenerator<JNTCOMPAVGUPSAMPLEDParam> BuildParams( - jntcompavgupsampled_func filter) { +::testing::internal::ParamGenerator<DISTWTDCOMPAVGUPSAMPLEDParam> BuildParams( + distwtdcompavgupsampled_func filter) { return ::testing::Combine(::testing::Values(filter), ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); } -::testing::internal::ParamGenerator<HighbdJNTCOMPAVGParam> BuildParams( - jntcompavg_func filter, int is_hbd) { +::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGParam> BuildParams( + distwtdcompavg_func filter, int is_hbd) { (void)is_hbd; return ::testing::Combine(::testing::Range(8, 13, 2), ::testing::Values(filter), ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); } -::testing::internal::ParamGenerator<HighbdJNTCOMPAVGUPSAMPLEDParam> BuildParams( - highbdjntcompavgupsampled_func filter) { +::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> +BuildParams(highbddistwtdcompavgupsampled_func filter) { return 
::testing::Combine(::testing::Range(8, 13, 2), ::testing::Values(filter), ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL)); } -class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> { +class AV1DISTWTDCOMPAVGTest + : public ::testing::TestWithParam<DISTWTDCOMPAVGParam> { public: - ~AV1JNTCOMPAVGTest() {} + ~AV1DISTWTDCOMPAVGTest() {} void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } void TearDown() { libaom_test::ClearSystemState(); } protected: - void RunCheckOutput(jntcompavg_func test_impl) { + void RunCheckOutput(distwtdcompavg_func test_impl) { const int w = kMaxSize, h = kMaxSize; const int block_idx = GET_PARAM(1); @@ -107,27 +108,27 @@ class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> { const int in_w = block_size_wide[block_idx]; const int in_h = block_size_high[block_idx]; - JNT_COMP_PARAMS jnt_comp_params; - jnt_comp_params.use_jnt_comp_avg = 1; + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); - aom_jnt_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c, in_w, - in_h, ref8 + offset_r * w + offset_c, in_w, - &jnt_comp_params); + aom_dist_wtd_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c, + in_w, in_h, ref8 + offset_r * w + offset_c, + in_w, &dist_wtd_comp_params); test_impl(output2, pred8 + offset_r * w + offset_c, in_w, in_h, - ref8 + offset_r * w + offset_c, in_w, &jnt_comp_params); + ref8 + offset_r * w + offset_c, in_w, &dist_wtd_comp_params); for (int i = 0; i < in_h; ++i) { for (int j 
= 0; j < in_w; ++j) { int idx = i * in_w + j; ASSERT_EQ(output[idx], output2[idx]) - << "Mismatch at unit tests for AV1JNTCOMPAVGTest\n" + << "Mismatch at unit tests for AV1DISTWTDCOMPAVGTest\n" << in_w << "x" << in_h << " Pixel mismatch at index " << idx << " = (" << i << ", " << j << ")"; } @@ -135,7 +136,7 @@ class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> { } } } - void RunSpeedTest(jntcompavg_func test_impl) { + void RunSpeedTest(distwtdcompavg_func test_impl) { const int w = kMaxSize, h = kMaxSize; const int block_idx = GET_PARAM(1); @@ -152,49 +153,49 @@ class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> { const int in_w = block_size_wide[block_idx]; const int in_h = block_size_high[block_idx]; - JNT_COMP_PARAMS jnt_comp_params; - jnt_comp_params.use_jnt_comp_avg = 1; + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; - jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; const int num_loops = 1000000000 / (in_w + in_h); aom_usec_timer timer; aom_usec_timer_start(&timer); for (int i = 0; i < num_loops; ++i) - aom_jnt_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w, - &jnt_comp_params); + aom_dist_wtd_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w, + &dist_wtd_comp_params); aom_usec_timer_mark(&timer); const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); - printf("jntcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + printf("distwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, 1000.0 * elapsed_time / num_loops); aom_usec_timer timer1; aom_usec_timer_start(&timer1); for (int i = 0; i < num_loops; ++i) - test_impl(output2, pred8, in_w, in_h, ref8, in_w, &jnt_comp_params); + test_impl(output2, pred8, in_w, in_h, ref8, 
in_w, &dist_wtd_comp_params); aom_usec_timer_mark(&timer1); const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); - printf("jntcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + printf("distwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, 1000.0 * elapsed_time1 / num_loops); } libaom_test::ACMRandom rnd_; -}; // class AV1JNTCOMPAVGTest +}; // class AV1DISTWTDCOMPAVGTest -class AV1JNTCOMPAVGUPSAMPLEDTest - : public ::testing::TestWithParam<JNTCOMPAVGUPSAMPLEDParam> { +class AV1DISTWTDCOMPAVGUPSAMPLEDTest + : public ::testing::TestWithParam<DISTWTDCOMPAVGUPSAMPLEDParam> { public: - ~AV1JNTCOMPAVGUPSAMPLEDTest() {} + ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() {} void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } void TearDown() { libaom_test::ClearSystemState(); } protected: - void RunCheckOutput(jntcompavgupsampled_func test_impl) { + void RunCheckOutput(distwtdcompavgupsampled_func test_impl) { const int w = kMaxSize, h = kMaxSize; const int block_idx = GET_PARAM(1); @@ -211,8 +212,8 @@ class AV1JNTCOMPAVGUPSAMPLEDTest const int in_w = block_size_wide[block_idx]; const int in_h = block_size_high[block_idx]; - JNT_COMP_PARAMS jnt_comp_params; - jnt_comp_params.use_jnt_comp_avg = 1; + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; int sub_x_q3, sub_y_q3; int subpel_search; for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; @@ -221,28 +222,30 @@ class AV1JNTCOMPAVGUPSAMPLEDTest for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + dist_wtd_comp_params.fwd_offset = + quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = + quant_dist_lookup_table[ii][jj][1]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 
7); - aom_jnt_comp_avg_upsampled_pred_c( + aom_dist_wtd_comp_avg_upsampled_pred_c( NULL, NULL, 0, 0, NULL, output, pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, sub_y_q3, ref8 + offset_r * w + offset_c, in_w, - &jnt_comp_params, subpel_search); + &dist_wtd_comp_params, subpel_search); test_impl(NULL, NULL, 0, 0, NULL, output2, pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3, sub_y_q3, ref8 + offset_r * w + offset_c, in_w, - &jnt_comp_params, subpel_search); + &dist_wtd_comp_params, subpel_search); for (int i = 0; i < in_h; ++i) { for (int j = 0; j < in_w; ++j) { int idx = i * in_w + j; ASSERT_EQ(output[idx], output2[idx]) << "Mismatch at unit tests for " - "AV1JNTCOMPAVGUPSAMPLEDTest\n" + "AV1DISTWTDCOMPAVGUPSAMPLEDTest\n" << in_w << "x" << in_h << " Pixel mismatch at index " << idx << " = (" << i << ", " << j << "), sub pixel offset = (" << sub_y_q3 << ", " @@ -255,7 +258,7 @@ class AV1JNTCOMPAVGUPSAMPLEDTest } } } - void RunSpeedTest(jntcompavgupsampled_func test_impl) { + void RunSpeedTest(distwtdcompavgupsampled_func test_impl) { const int w = kMaxSize, h = kMaxSize; const int block_idx = GET_PARAM(1); @@ -272,11 +275,11 @@ class AV1JNTCOMPAVGUPSAMPLEDTest const int in_w = block_size_wide[block_idx]; const int in_h = block_size_high[block_idx]; - JNT_COMP_PARAMS jnt_comp_params; - jnt_comp_params.use_jnt_comp_avg = 1; + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; - jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; int sub_x_q3 = 0; int sub_y_q3 = 0; @@ -287,13 +290,13 @@ class AV1JNTCOMPAVGUPSAMPLEDTest int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. 
for (int i = 0; i < num_loops; ++i) - aom_jnt_comp_avg_upsampled_pred_c(NULL, NULL, 0, 0, NULL, output, pred8, - in_w, in_h, sub_x_q3, sub_y_q3, ref8, - in_w, &jnt_comp_params, subpel_search); + aom_dist_wtd_comp_avg_upsampled_pred_c( + NULL, NULL, 0, 0, NULL, output, pred8, in_w, in_h, sub_x_q3, sub_y_q3, + ref8, in_w, &dist_wtd_comp_params, subpel_search); aom_usec_timer_mark(&timer); const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); - printf("jntcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + printf("distwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h, 1000.0 * elapsed_time / num_loops); aom_usec_timer timer1; @@ -301,27 +304,27 @@ class AV1JNTCOMPAVGUPSAMPLEDTest for (int i = 0; i < num_loops; ++i) test_impl(NULL, NULL, 0, 0, NULL, output2, pred8, in_w, in_h, sub_x_q3, - sub_y_q3, ref8, in_w, &jnt_comp_params, subpel_search); + sub_y_q3, ref8, in_w, &dist_wtd_comp_params, subpel_search); aom_usec_timer_mark(&timer1); const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); - printf("jntcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + printf("distwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h, 1000.0 * elapsed_time1 / num_loops); } libaom_test::ACMRandom rnd_; -}; // class AV1JNTCOMPAVGUPSAMPLEDTest +}; // class AV1DISTWTDCOMPAVGUPSAMPLEDTest -class AV1HighBDJNTCOMPAVGTest - : public ::testing::TestWithParam<HighbdJNTCOMPAVGParam> { +class AV1HighBDDISTWTDCOMPAVGTest + : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGParam> { public: - ~AV1HighBDJNTCOMPAVGTest() {} + ~AV1HighBDDISTWTDCOMPAVGTest() {} void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } void TearDown() { libaom_test::ClearSystemState(); } protected: - void RunCheckOutput(jntcompavg_func test_impl) { + void RunCheckOutput(distwtdcompavg_func test_impl) { const int w = kMaxSize, h = kMaxSize; const int block_idx = GET_PARAM(2); const int bd = GET_PARAM(0); @@ -338,31 +341,31 @@ class 
AV1HighBDJNTCOMPAVGTest const int in_w = block_size_wide[block_idx]; const int in_h = block_size_high[block_idx]; - JNT_COMP_PARAMS jnt_comp_params; - jnt_comp_params.use_jnt_comp_avg = 1; + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); - aom_highbd_jnt_comp_avg_pred_c( + aom_highbd_dist_wtd_comp_avg_pred_c( CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, - &jnt_comp_params); + &dist_wtd_comp_params); test_impl(CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, - in_w, &jnt_comp_params); + in_w, &dist_wtd_comp_params); for (int i = 0; i < in_h; ++i) { for (int j = 0; j < in_w; ++j) { int idx = i * in_w + j; ASSERT_EQ(output[idx], output2[idx]) - << "Mismatch at unit tests for AV1HighBDJNTCOMPAVGTest\n" + << "Mismatch at unit tests for AV1HighBDDISTWTDCOMPAVGTest\n" << in_w << "x" << in_h << " Pixel mismatch at index " << idx << " = (" << i << ", " << j << ")"; } @@ -370,7 +373,7 @@ class AV1HighBDJNTCOMPAVGTest } } } - void RunSpeedTest(jntcompavg_func test_impl) { + void RunSpeedTest(distwtdcompavg_func test_impl) { const int w = kMaxSize, h = kMaxSize; const int block_idx = GET_PARAM(2); const int bd = GET_PARAM(0); @@ -387,24 +390,24 @@ class AV1HighBDJNTCOMPAVGTest const int in_w = block_size_wide[block_idx]; const int in_h = block_size_high[block_idx]; - 
JNT_COMP_PARAMS jnt_comp_params; - jnt_comp_params.use_jnt_comp_avg = 1; + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; - jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; const int num_loops = 1000000000 / (in_w + in_h); aom_usec_timer timer; aom_usec_timer_start(&timer); for (int i = 0; i < num_loops; ++i) - aom_highbd_jnt_comp_avg_pred_c( + aom_highbd_dist_wtd_comp_avg_pred_c( CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8), in_w, in_h, - CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params); + CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params); aom_usec_timer_mark(&timer); const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); - printf("highbdjntcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, + printf("highbddistwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h, 1000.0 * elapsed_time / num_loops); aom_usec_timer timer1; @@ -412,26 +415,26 @@ class AV1HighBDJNTCOMPAVGTest for (int i = 0; i < num_loops; ++i) test_impl(CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8), in_w, - in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params); + in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params); aom_usec_timer_mark(&timer1); const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); - printf("highbdjntcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, + printf("highbddistwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h, 1000.0 * elapsed_time1 / num_loops); } libaom_test::ACMRandom rnd_; -}; // class AV1HighBDJNTCOMPAVGTest +}; // class AV1HighBDDISTWTDCOMPAVGTest -class AV1HighBDJNTCOMPAVGUPSAMPLEDTest - : public ::testing::TestWithParam<HighbdJNTCOMPAVGUPSAMPLEDParam> { +class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest + : public 
::testing::TestWithParam<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> { public: - ~AV1HighBDJNTCOMPAVGUPSAMPLEDTest() {} + ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() {} void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } void TearDown() { libaom_test::ClearSystemState(); } protected: - void RunCheckOutput(highbdjntcompavgupsampled_func test_impl) { + void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) { const int w = kMaxSize, h = kMaxSize; const int block_idx = GET_PARAM(2); const int bd = GET_PARAM(0); @@ -448,8 +451,8 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest const int in_w = block_size_wide[block_idx]; const int in_h = block_size_high[block_idx]; - JNT_COMP_PARAMS jnt_comp_params; - jnt_comp_params.use_jnt_comp_avg = 1; + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; int sub_x_q3, sub_y_q3; int subpel_search; for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS; @@ -458,30 +461,32 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) { for (int ii = 0; ii < 2; ii++) { for (int jj = 0; jj < 4; jj++) { - jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; - jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; + dist_wtd_comp_params.fwd_offset = + quant_dist_lookup_table[ii][jj][0]; + dist_wtd_comp_params.bck_offset = + quant_dist_lookup_table[ii][jj][1]; const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7); const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7); - aom_highbd_jnt_comp_avg_upsampled_pred_c( + aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h, sub_x_q3, sub_y_q3, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd, - &jnt_comp_params, subpel_search); + &dist_wtd_comp_params, subpel_search); test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8) + offset_r * w + 
offset_c, in_w, in_h, sub_x_q3, sub_y_q3, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, - in_w, bd, &jnt_comp_params, subpel_search); + in_w, bd, &dist_wtd_comp_params, subpel_search); for (int i = 0; i < in_h; ++i) { for (int j = 0; j < in_w; ++j) { int idx = i * in_w + j; ASSERT_EQ(output[idx], output2[idx]) << "Mismatch at unit tests for " - "AV1HighBDJNTCOMPAVGUPSAMPLEDTest\n" + "AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest\n" << in_w << "x" << in_h << " Pixel mismatch at index " << idx << " = (" << i << ", " << j << "), sub pixel offset = (" << sub_y_q3 << ", " @@ -494,7 +499,7 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest } } } - void RunSpeedTest(highbdjntcompavgupsampled_func test_impl) { + void RunSpeedTest(highbddistwtdcompavgupsampled_func test_impl) { const int w = kMaxSize, h = kMaxSize; const int block_idx = GET_PARAM(2); const int bd = GET_PARAM(0); @@ -511,11 +516,11 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest const int in_w = block_size_wide[block_idx]; const int in_h = block_size_high[block_idx]; - JNT_COMP_PARAMS jnt_comp_params; - jnt_comp_params.use_jnt_comp_avg = 1; + DIST_WTD_COMP_PARAMS dist_wtd_comp_params; + dist_wtd_comp_params.use_dist_wtd_comp_avg = 1; - jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; - jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; + dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0]; + dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1]; int sub_x_q3 = 0; int sub_y_q3 = 0; const int num_loops = 1000000000 / (in_w + in_h); @@ -523,15 +528,16 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest aom_usec_timer_start(&timer); int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter. 
for (int i = 0; i < num_loops; ++i) - aom_highbd_jnt_comp_avg_upsampled_pred_c( + aom_highbd_dist_wtd_comp_avg_upsampled_pred_c( NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3, - CONVERT_TO_BYTEPTR(ref8), in_w, bd, &jnt_comp_params, subpel_search); + CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params, + subpel_search); aom_usec_timer_mark(&timer); const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); - printf("highbdjntcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h, - 1000.0 * elapsed_time / num_loops); + printf("highbddistwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, + in_h, 1000.0 * elapsed_time / num_loops); aom_usec_timer timer1; aom_usec_timer_start(&timer1); @@ -539,19 +545,19 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest for (int i = 0; i < num_loops; ++i) test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3, - CONVERT_TO_BYTEPTR(ref8), in_w, bd, &jnt_comp_params, + CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params, subpel_search); aom_usec_timer_mark(&timer1); const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1)); - printf("highbdjntcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, + printf("highbddistwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h, 1000.0 * elapsed_time1 / num_loops); } libaom_test::ACMRandom rnd_; -}; // class AV1HighBDJNTCOMPAVGUPSAMPLEDTest +}; // class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest -} // namespace AV1JNTCOMPAVG +} // namespace AV1DISTWTDCOMPAVG } // namespace libaom_test #endif // AOM_TEST_COMP_AVG_PRED_TEST_H_ diff --git a/libaom/test/corner_match_test.cc b/libaom/test/corner_match_test.cc index 58e3139..af2baa7 100644 --- a/libaom/test/corner_match_test.cc +++ b/libaom/test/corner_match_test.cc @@ -24,9 +24,13 @@ namespace AV1CornerMatch { using libaom_test::ACMRandom; +typedef double 
(*ComputeCrossCorrFunc)(unsigned char *im1, int stride1, int x1, + int y1, unsigned char *im2, int stride2, + int x2, int y2); + using ::testing::make_tuple; using ::testing::tuple; -typedef tuple<int> CornerMatchParam; +typedef tuple<int, ComputeCrossCorrFunc> CornerMatchParam; class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> { public: @@ -36,19 +40,24 @@ class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> { virtual void TearDown(); protected: - void RunCheckOutput(); + void RunCheckOutput(int run_times); + ComputeCrossCorrFunc target_func; libaom_test::ACMRandom rnd_; }; AV1CornerMatchTest::~AV1CornerMatchTest() {} -void AV1CornerMatchTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); } +void AV1CornerMatchTest::SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); + target_func = GET_PARAM(1); +} void AV1CornerMatchTest::TearDown() { libaom_test::ClearSystemState(); } -void AV1CornerMatchTest::RunCheckOutput() { +void AV1CornerMatchTest::RunCheckOutput(int run_times) { const int w = 128, h = 128; const int num_iters = 10000; int i, j; + aom_usec_timer ref_timer, test_timer; uint8_t *input1 = new uint8_t[w * h]; uint8_t *input2 = new uint8_t[w * h]; @@ -80,21 +89,54 @@ void AV1CornerMatchTest::RunCheckOutput() { double res_c = compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2); - double res_sse4 = - compute_cross_correlation_sse4_1(input1, w, x1, y1, input2, w, x2, y2); + double res_simd = target_func(input1, w, x1, y1, input2, w, x2, y2); - ASSERT_EQ(res_sse4, res_c); - } + if (run_times > 1) { + aom_usec_timer_start(&ref_timer); + for (j = 0; j < run_times; j++) { + compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); + aom_usec_timer_start(&test_timer); + for (j = 0; j < run_times; j++) { + target_func(input1, w, x1, y1, input2, w, x2, y2); + } + 
aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%d\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } else { + ASSERT_EQ(res_simd, res_c); + } + } delete[] input1; delete[] input2; } -TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(); } - -INSTANTIATE_TEST_CASE_P(SSE4_1, AV1CornerMatchTest, - ::testing::Values(make_tuple(0), make_tuple(1))); - +TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(1); } +TEST_P(AV1CornerMatchTest, DISABLED_Speed) { RunCheckOutput(100000); } + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_CASE_P( + SSE4_1, AV1CornerMatchTest, + ::testing::Values(make_tuple(0, compute_cross_correlation_sse4_1), + make_tuple(1, compute_cross_correlation_sse4_1))); +#endif + +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P( + AVX2, AV1CornerMatchTest, + ::testing::Values(make_tuple(0, compute_cross_correlation_avx2), + make_tuple(1, compute_cross_correlation_avx2))); +#endif } // namespace AV1CornerMatch } // namespace test_libaom diff --git a/libaom/test/dr_prediction_test.cc b/libaom/test/dr_prediction_test.cc index a64d39b..4be8489 100644 --- a/libaom/test/dr_prediction_test.cc +++ b/libaom/test/dr_prediction_test.cc @@ -59,7 +59,9 @@ typedef void (*Z1_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh, template <Z1_Lbd fn> void z1_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, - int /*upsample_left*/, int dx, int dy, int /*bd*/) { + int upsample_left, int dx, int dy, int bd) { + (void)bd; + (void)upsample_left; fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy); } @@ -69,7 +71,9 @@ typedef void (*Z2_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh, template <Z2_Lbd fn> void z2_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, - int 
upsample_left, int dx, int dy, int /*bd*/) { + int upsample_left, int dx, int dy, int bd) { + (void)bd; + (void)upsample_left; fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy); } @@ -78,9 +82,10 @@ typedef void (*Z3_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh, int upsample_left, int dx, int dy); template <Z3_Lbd fn> void z3_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh, - const uint8_t *above, const uint8_t *left, - int /*upsample_above*/, int upsample_left, int dx, int dy, - int /*bd*/) { + const uint8_t *above, const uint8_t *left, int upsample_above, + int upsample_left, int dx, int dy, int bd) { + (void)bd; + (void)upsample_above; fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy); } @@ -90,8 +95,10 @@ typedef void (*Z1_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh, template <Z1_Hbd fn> void z1_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, - int upsample_above, int /*upsample_left*/, int dx, int dy, + int upsample_above, int upsample_left, int dx, int dy, int bd) { + (void)bd; + (void)upsample_left; fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy, bd); } @@ -104,6 +111,7 @@ void z2_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd) { + (void)bd; fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy, bd); } @@ -114,8 +122,10 @@ typedef void (*Z3_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh, template <Z3_Hbd fn> void z3_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, - int /*upsample_above*/, int upsample_left, int dx, int dy, + int upsample_above, int upsample_left, int dx, int dy, int bd) { + (void)bd; + (void)upsample_above; fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy, bd); } @@ -135,7 +145,7 @@ struct DrPredFunc { 
template <typename Pixel, typename FuncType> class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { protected: - static const int kMaxNumTests = 100000; + static const int kMaxNumTests = 10000; static const int kIterations = 10; static const int kDstStride = 64; static const int kDstSize = kDstStride * kDstStride; @@ -171,6 +181,9 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { void Predict(bool speedtest, int tx) { const int kNumTests = speedtest ? kMaxNumTests : 1; aom_usec_timer timer; + int tst_time = 0; + + bd_ = params_.bit_depth; aom_usec_timer_start(&timer); for (int k = 0; k < kNumTests; ++k) { @@ -180,25 +193,27 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { aom_usec_timer_mark(&timer); const int ref_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); - aom_usec_timer_start(&timer); if (params_.tst_fn) { + aom_usec_timer_start(&timer); for (int k = 0; k < kNumTests; ++k) { ASM_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_, above_, left_, upsample_above_, upsample_left_, dx_, dy_, bd_)); } + aom_usec_timer_mark(&timer); + tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); } else { for (int i = 0; i < kDstSize; ++i) { dst_ref_[i] = dst_tst_[i]; } } - aom_usec_timer_mark(&timer); - const int tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); OutputTimes(kNumTests, ref_time, tst_time, tx); } void RunTest(bool speedtest, bool needsaturation, int p_angle) { + bd_ = params_.bit_depth; + if (needsaturation) { for (int i = 0; i < kBufSize; ++i) { above_data_[i] = left_data_[i] = (1 << bd_) - 1; @@ -290,8 +305,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > { class LowbdDrPredTest : public DrPredTest<uint8_t, DrPred> {}; TEST_P(LowbdDrPredTest, SaturatedValues) { - for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { - enable_upsample_ = iter & 1; + for (enable_upsample_ = 0; 
enable_upsample_ < 2; ++enable_upsample_) { for (int angle = start_angle_; angle < stop_angle_; ++angle) { dx_ = av1_get_dx(angle); dy_ = av1_get_dy(angle); @@ -300,20 +314,6 @@ TEST_P(LowbdDrPredTest, SaturatedValues) { } } -TEST_P(LowbdDrPredTest, DISABLED_Speed) { - const int angles[] = { 3, 45, 87 }; - for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { - for (int i = 0; i < 3; ++i) { - const int angle = angles[i] + start_angle_; - dx_ = av1_get_dx(angle); - dy_ = av1_get_dy(angle); - printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", - enable_upsample_, angle); - if (dx_ && dy_) RunTest(true, false, angle); - } - } -} - using ::testing::make_tuple; INSTANTIATE_TEST_CASE_P( @@ -328,8 +328,7 @@ INSTANTIATE_TEST_CASE_P( class HighbdDrPredTest : public DrPredTest<uint16_t, DrPred_Hbd> {}; TEST_P(HighbdDrPredTest, SaturatedValues) { - for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) { - enable_upsample_ = iter & 1; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { for (int angle = start_angle_; angle < stop_angle_; ++angle) { dx_ = av1_get_dx(angle); dy_ = av1_get_dy(angle); @@ -362,6 +361,46 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_AVX2 INSTANTIATE_TEST_CASE_P( + AVX2, LowbdDrPredTest, + ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>, + &z1_wrapper<av1_dr_prediction_z1_avx2>, + AOM_BITS_8, kZ1Start), + /* TODO(niva213@gmail.com): Re-enable this test after + fixing valgrind issue: https://crbug.com/aomedia/2316 + DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>, + &z2_wrapper<av1_dr_prediction_z2_avx2>, + AOM_BITS_8, kZ2Start), */ + DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>, + &z3_wrapper<av1_dr_prediction_z3_avx2>, + AOM_BITS_8, kZ3Start))); + +TEST_P(LowbdDrPredTest, DISABLED_Speed) { + const int angles[] = { 3, 45, 87 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int i = 0; i < 3; ++i) { + const int angle = angles[i] 
+ start_angle_; + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", + enable_upsample_, angle); + if (dx_ && dy_) RunTest(true, false, angle); + } + } +} + +TEST_P(LowbdDrPredTest, OperationCheck) { + if (params_.tst_fn == NULL) return; + // const int angles[] = { 3, 45, 81, 87, 93, 100, 145, 187, 199, 260 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int angle = start_angle_; angle < stop_angle_; ++angle) { + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + if (dx_ && dy_) RunTest(false, false, angle); + } + } +} + +INSTANTIATE_TEST_CASE_P( AVX2, HighbdDrPredTest, ::testing::Values(DrPredFunc<DrPred_Hbd>( &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, @@ -375,7 +414,9 @@ INSTANTIATE_TEST_CASE_P( &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>, &z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>, AOM_BITS_12, kZ1Start), - /*DrPredFunc<DrPred_Hbd>( + /* TODO(niva213@gmail.com): Re-enable these tests after + fixing valgrind issue: https://crbug.com/aomedia/2316 + DrPredFunc<DrPred_Hbd>( &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>, &z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>, AOM_BITS_8, kZ2Start), diff --git a/libaom/test/edge_detect_test.cc b/libaom/test/edge_detect_test.cc index 47466cb..77a731f 100644 --- a/libaom/test/edge_detect_test.cc +++ b/libaom/test/edge_detect_test.cc @@ -185,8 +185,9 @@ TEST_P(EdgeDetectBrightnessTest, DetectUniformBrightness) { const bool high_bd = GET_PARAM(3); const int bd = GET_PARAM(4); - ASSERT_EQ(0, av1_edge_exists(input_, stride_8tap(width), width, height, - high_bd, bd)); + ASSERT_EQ( + 0, av1_edge_exists(input_, stride_8tap(width), width, height, high_bd, bd) + .magnitude); } INSTANTIATE_TEST_CASE_P(ImageBrightnessTests, EdgeDetectBrightnessTest, @@ -245,9 +246,11 @@ TEST_P(EdgeDetectImageTest, BlackWhite) { free(orig); // Value should be between 556 and 560. 
ASSERT_LE(556, av1_edge_exists(padded, stride_8tap(width), width, height, - high_bd, bd)); + high_bd, bd) + .magnitude); ASSERT_GE(560, av1_edge_exists(padded, stride_8tap(width), width, height, - high_bd, bd)); + high_bd, bd) + .magnitude); free_pad_8tap(padded, width, high_bd); } diff --git a/libaom/test/encode_api_test.cc b/libaom/test/encode_api_test.cc index c26f572..235480a 100644 --- a/libaom/test/encode_api_test.cc +++ b/libaom/test/encode_api_test.cc @@ -50,7 +50,7 @@ TEST(EncodeAPI, InvalidParams) { EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, kCodecs[i], NULL, 0)); EXPECT_EQ(AOM_CODEC_INVALID_PARAM, - aom_codec_enc_config_default(kCodecs[i], &cfg, 1)); + aom_codec_enc_config_default(kCodecs[i], &cfg, 2)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(kCodecs[i], &cfg, 0)); EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, kCodecs[i], &cfg, 0)); diff --git a/libaom/test/end_to_end_test.cc b/libaom/test/end_to_end_test.cc index 9aa44c6..6ea09a6 100644 --- a/libaom/test/end_to_end_test.cc +++ b/libaom/test/end_to_end_test.cc @@ -53,6 +53,13 @@ typedef struct { unsigned int profile; } TestVideoParam; +std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) { + return os << "TestVideoParam { filename:" << test_arg.filename + << " input_bit_depth:" << test_arg.input_bit_depth + << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth + << " profile:" << test_arg.profile << "}"; +} + const TestVideoParam kTestVectors[] = { { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, { "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 }, diff --git a/libaom/test/error_block_test.cc b/libaom/test/error_block_test.cc index 353947c..3664ccf 100644 --- a/libaom/test/error_block_test.cc +++ b/libaom/test/error_block_test.cc @@ -156,6 +156,70 @@ TEST_P(ErrorBlockTest, ExtremeValues) { << "First failed at test case " << first_failure; } +TEST_P(ErrorBlockTest, DISABLED_Speed) { + ACMRandom 
rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, tran_low_t, coeff[4096]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]); + intptr_t block_size; + int64_t ssz; + int num_iters = 100000; + int64_t ref_ssz; + int k; + const int msb = bit_depth_ + 8 - 1; + for (int i = 0; i < 9; ++i) { + block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ..64x64 + for (k = 0; k < 9; k++) { + for (int j = 0; j < block_size; j++) { + if (k < 5) { + if (rnd(2)) { + // Positive number + coeff[j] = rnd(1 << msb); + dqcoeff[j] = rnd(1 << msb); + } else { + // Negative number + coeff[j] = -rnd(1 << msb); + dqcoeff[j] = -rnd(1 << msb); + } + } else { + if (rnd(2)) { + // Positive number + coeff[j] = rnd(1 << 14); + dqcoeff[j] = rnd(1 << 14); + } else { + // Negative number + coeff[j] = -rnd(1 << 14); + dqcoeff[j] = -rnd(1 << 14); + } + } + } + aom_usec_timer ref_timer, test_timer; + + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_iters; ++i) { + ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_iters; ++i) { + error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_); + } + aom_usec_timer_mark(&test_timer); + + const int elapsed_time_simd = + static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf( + " c_time=%d \t simd_time=%d \t " + "gain=%d \n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } + } +} + #if (HAVE_SSE2 || HAVE_AVX) using ::testing::make_tuple; @@ -168,4 +232,17 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c, AOM_BITS_8))); #endif // HAVE_SSE2 + +#if (HAVE_AVX2) +using ::testing::make_tuple; + +INSTANTIATE_TEST_CASE_P( + AVX2, ErrorBlockTest, + ::testing::Values(make_tuple(&av1_highbd_block_error_avx2, + &av1_highbd_block_error_c, 
AOM_BITS_10), + make_tuple(&av1_highbd_block_error_avx2, + &av1_highbd_block_error_c, AOM_BITS_12), + make_tuple(&av1_highbd_block_error_avx2, + &av1_highbd_block_error_c, AOM_BITS_8))); +#endif // HAVE_AVX2 } // namespace diff --git a/libaom/test/external_frame_buffer_test.cc b/libaom/test/external_frame_buffer_test.cc index 6fcd9e7..4938a64 100644 --- a/libaom/test/external_frame_buffer_test.cc +++ b/libaom/test/external_frame_buffer_test.cc @@ -58,7 +58,7 @@ class ExternalFrameBufferList { // Searches the frame buffer list for a free frame buffer. Makes sure // that the frame buffer is at least |min_size| in bytes. Marks that the - // frame buffer is in use by libvpx. Finally sets |fb| to point to the + // frame buffer is in use by libaom. Finally sets |fb| to point to the // external frame buffer. Returns < 0 on an error. int GetFreeFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) { EXPECT_TRUE(fb != NULL); @@ -114,9 +114,9 @@ class ExternalFrameBufferList { return 0; } - // Checks that the ximage data is contained within the external frame buffer - // private data passed back in the ximage. - void CheckXImageFrameBuffer(const aom_image_t *img) { + // Checks that the aom_image_t data is contained within the external frame + // buffer private data passed back in the aom_image_t. + void CheckImageFrameBuffer(const aom_image_t *img) { if (img->fb_priv != NULL) { const struct ExternalFrameBuffer *const ext_fb = reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv); @@ -158,7 +158,7 @@ class ExternalFrameBufferList { #if CONFIG_WEBM_IO -// Callback used by libvpx to request the application to return a frame +// Callback used by libaom to request the application to return a frame // buffer of at least |min_size| in bytes. 
int get_aom_frame_buffer(void *user_priv, size_t min_size, aom_codec_frame_buffer_t *fb) { @@ -167,7 +167,7 @@ int get_aom_frame_buffer(void *user_priv, size_t min_size, return fb_list->GetFreeFrameBuffer(min_size, fb); } -// Callback used by libvpx to tell the application that |fb| is not needed +// Callback used by libaom to tell the application that |fb| is not needed // anymore. int release_aom_frame_buffer(void *user_priv, aom_codec_frame_buffer_t *fb) { ExternalFrameBufferList *const fb_list = @@ -218,7 +218,7 @@ class ExternalFrameBufferMD5Test const libaom_test::CompressedVideoSource &video, libaom_test::Decoder *decoder) { if (num_buffers_ > 0 && video.frame_number() == 0) { - // Have libvpx use frame buffers we create. + // Have libaom use frame buffers we create. ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_)); ASSERT_EQ(AOM_CODEC_OK, decoder->SetFrameBufferFunctions(GetAV1FrameBuffer, @@ -299,7 +299,7 @@ class ExternalFrameBufferMD5Test const char kAV1TestFile[] = "av1-1-b8-03-sizeup.mkv"; const char kAV1NonRefTestFile[] = "av1-1-b8-01-size-226x226.ivf"; -// Class for testing passing in external frame buffers to libvpx. +// Class for testing passing in external frame buffers to libaom. class ExternalFrameBufferTest : public ::testing::Test { protected: ExternalFrameBufferTest() : video_(NULL), decoder_(NULL), num_buffers_(0) {} @@ -322,7 +322,7 @@ class ExternalFrameBufferTest : public ::testing::Test { video_ = NULL; } - // Passes the external frame buffer information to libvpx. + // Passes the external frame buffer information to libaom. 
aom_codec_err_t SetFrameBufferFunctions( int num_buffers, aom_get_frame_buffer_cb_fn_t cb_get, aom_release_frame_buffer_cb_fn_t cb_release) { @@ -359,7 +359,7 @@ class ExternalFrameBufferTest : public ::testing::Test { // Get decompressed data while ((img = dec_iter.Next()) != NULL) { - fb_list_.CheckXImageFrameBuffer(img); + fb_list_.CheckImageFrameBuffer(img); } } @@ -390,7 +390,7 @@ class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { #endif // CONFIG_WEBM_IO // This test runs through the set of test vectors, and decodes them. -// Libvpx will call into the application to allocate a frame buffer when +// Libaom will call into the application to allocate a frame buffer when // needed. The md5 checksums are computed for each frame in the video file. // If md5 checksums match the correct md5 data, then the test is passed. // Otherwise, the test failed. diff --git a/libaom/test/fwd_kf_test.cc b/libaom/test/fwd_kf_test.cc new file mode 100644 index 0000000..6c428d9 --- /dev/null +++ b/libaom/test/fwd_kf_test.cc @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +typedef struct { + const int max_kf_dist; + const double psnr_thresh; +} FwdKfTestParam; + +const FwdKfTestParam kTestParams[] = { + { 4, 37.3 }, { 6, 36.5 }, { 8, 35.8 }, + { 12, 34.3 }, { 16, 34.3 }, { 18, 33.7 } +}; + +// Params: encoding mode and index into the kMaxKfDists array to control +// kf-max-dist +class ForwardKeyTest + : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, + public ::libaom_test::EncoderTest { + protected: + ForwardKeyTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + kf_max_dist_ind_(GET_PARAM(2)) {} + virtual ~ForwardKeyTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + const aom_rational timebase = { 1, 30 }; + cfg_.g_timebase = timebase; + cpu_used_ = 2; + kf_max_dist_ = kTestParams[kf_max_dist_ind_].max_kf_dist; + psnr_threshold_ = kTestParams[kf_max_dist_ind_].psnr_thresh; + cfg_.rc_end_usage = AOM_VBR; + cfg_.rc_target_bitrate = 200; + cfg_.g_lag_in_frames = 10; + cfg_.fwd_kf_enabled = 1; + cfg_.kf_max_dist = kf_max_dist_; + cfg_.g_threads = 0; + init_flags_ = AOM_CODEC_USE_PSNR; + } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + if (encoding_mode_ != ::libaom_test::kRealTime) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / 
nframes_; + return 0.0; + } + + double GetPsnrThreshold() { return psnr_threshold_; } + + ::libaom_test::TestMode encoding_mode_; + const int kf_max_dist_ind_; + double psnr_threshold_; + int kf_max_dist_; + int cpu_used_; + int nframes_; + double psnr_; +}; + +TEST_P(ForwardKeyTest, ForwardKeyEncodeTest) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + cfg_.g_timebase.den, cfg_.g_timebase.num, + 0, 20); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(sarahparker) Add functionality to assert the minimum number of + // keyframes were placed. + EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold()) + << "kf max dist = " << kf_max_dist_; +} + +AV1_INSTANTIATE_TEST_CASE( + ForwardKeyTest, ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::Range(0, static_cast<int>(GTEST_ARRAY_SIZE_(kTestParams)))); +} // namespace diff --git a/libaom/test/gf_max_pyr_height_test.cc b/libaom/test/gf_max_pyr_height_test.cc new file mode 100644 index 0000000..2d78493 --- /dev/null +++ b/libaom/test/gf_max_pyr_height_test.cc @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" + +namespace { + +static const struct GFMaxPyrHeightTestParam { + int gf_max_pyr_height; + double psnr_thresh; +} kTestParams[] = { + { 0, 34.75 }, { 1, 34.75 }, { 2, 35.25 }, { 3, 35.50 }, { 4, 35.50 }, +}; + +// Compiler may decide to add some padding to the struct above for alignment, +// which the gtest may try to print (on error for example). This would cause +// valgrind to complain that the padding is uninitialized. To avoid that, we +// provide our own function to print the struct. +// This also makes '--gtest_list_tests' output more understandable. +std::ostream &operator<<(std::ostream &os, const GFMaxPyrHeightTestParam &p) { + os << "GFMaxPyrHeightTestParam { " + << "gf_max_pyr_height = " << p.gf_max_pyr_height << ", " + << "psnr_thresh = " << p.psnr_thresh << " }"; + return os; +} + +// Params: encoding mode and GFMaxPyrHeightTestParam object. 
+class GFMaxPyrHeightTest + : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, + GFMaxPyrHeightTestParam>, + public ::libaom_test::EncoderTest { + protected: + GFMaxPyrHeightTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)) { + gf_max_pyr_height_ = GET_PARAM(2).gf_max_pyr_height; + psnr_threshold_ = GET_PARAM(2).psnr_thresh; + } + virtual ~GFMaxPyrHeightTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + const aom_rational timebase = { 1, 30 }; + cfg_.g_timebase = timebase; + cpu_used_ = 4; + cfg_.rc_end_usage = AOM_VBR; + cfg_.rc_target_bitrate = 200; + cfg_.g_lag_in_frames = 19; + cfg_.g_threads = 0; + init_flags_ = AOM_CODEC_USE_PSNR; + } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + if (encoding_mode_ != ::libaom_test::kRealTime) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + encoder->Control(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, gf_max_pyr_height_); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrThreshold() { return psnr_threshold_; } + + ::libaom_test::TestMode encoding_mode_; + double psnr_threshold_; + int gf_max_pyr_height_; + int cpu_used_; + int nframes_; + double psnr_; +}; + +TEST_P(GFMaxPyrHeightTest, EncodeAndVerifyPSNR) { + libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + cfg_.g_timebase.den, cfg_.g_timebase.num, + 0, 32); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold()) + << "GF Max Pyramid Height = " << 
gf_max_pyr_height_; +} + +AV1_INSTANTIATE_TEST_CASE(GFMaxPyrHeightTest, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kTestParams)); +} // namespace diff --git a/libaom/test/hiprec_convolve_test_util.cc b/libaom/test/hiprec_convolve_test_util.cc index f5bf56e..2672bce 100644 --- a/libaom/test/hiprec_convolve_test_util.cc +++ b/libaom/test/hiprec_convolve_test_util.cc @@ -31,7 +31,7 @@ static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel, hkernel[2] = hkernel[4] = WIENER_FILT_TAP2_MINV + rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV); - hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]); + hkernel[3] = -(hkernel[0] + hkernel[1] + hkernel[2]); hkernel[7] = 0; vkernel[0] = vkernel[6] = @@ -43,7 +43,7 @@ static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel, vkernel[2] = vkernel[4] = WIENER_FILT_TAP2_MINV + rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV); - vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]); + vkernel[3] = -(vkernel[0] + vkernel[1] + vkernel[2]); vkernel[7] = 0; } diff --git a/libaom/test/horz_superres_test.cc b/libaom/test/horz_superres_test.cc index 1627684..f2c2115 100644 --- a/libaom/test/horz_superres_test.cc +++ b/libaom/test/horz_superres_test.cc @@ -28,13 +28,8 @@ using ::testing::tuple; /* TESTING PARAMETERS */ -#define NUM_TEST_VIDEOS 3 - const int kBitrate = 40; -// PSNR thresholds found by experiment -const double kPSNRThresholds[] = { 26.0, 28.0, 20.0 }; - typedef struct { const char *filename; aom_img_fmt fmt; @@ -42,18 +37,20 @@ typedef struct { unsigned int profile; unsigned int limit; unsigned int screen_content; + double psnr_threshold; } TestVideoParam; const TestVideoParam kTestVideoVectors[] = { - { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0 }, - { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0 }, - { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1 }, + { 
"park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 26.0 }, + { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 28.0 }, + { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 20.0 }, + // Image coding (single frame). + { "niklas_1280_720_30.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 1, 0, 32.0 }, }; -// Superres modes tested -// SUPERRES_QTHRESH is not included, as it has its own test -const SUPERRES_MODE kSuperresModesNotQThresh[] = { SUPERRES_FIXED, - SUPERRES_RANDOM }; +// Modes with extra params have their own tests. +const SUPERRES_MODE kSuperresModesWithoutParams[] = { SUPERRES_RANDOM, + SUPERRES_AUTO }; // Superres denominators and superres kf denominators to be tested typedef tuple<int, int> SuperresDenominatorPair; @@ -74,10 +71,8 @@ const SuperresQThresholdPair kSuperresQThresholds[] = { /* END (TESTING PARAMETERS) */ // Test parameter list: -// <[needed for EncoderTest], test_video_idx_, superres_mode_, -// tuple(superres_denom_, superres_kf_denom_)> -typedef tuple<const libaom_test::CodecFactory *, int, SUPERRES_MODE, - SuperresDenominatorPair> +// <[needed for EncoderTest], test_video_param_, superres_mode_> +typedef tuple<const libaom_test::CodecFactory *, TestVideoParam, SUPERRES_MODE> HorzSuperresTestParam; class HorzSuperresEndToEndTest @@ -85,16 +80,113 @@ class HorzSuperresEndToEndTest public ::libaom_test::EncoderTest { protected: HorzSuperresEndToEndTest() - : EncoderTest(GET_PARAM(0)), test_video_idx_(GET_PARAM(1)), - superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) { - test_video_param_ = kTestVideoVectors[test_video_idx_]; + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), + superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {} + + virtual ~HorzSuperresEndToEndTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(::libaom_test::kTwoPassGood); + cfg_.g_lag_in_frames = 5; + cfg_.rc_end_usage = AOM_Q; + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 
0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = AOM_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH; + + // Set superres parameters + cfg_.rc_superres_mode = superres_mode_; + } + + virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + frame_count_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + frame_count_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, 4); + + // Set cpu-used = 8 for speed + encoder->Control(AOME_SET_CPUUSED, 8); + + // Test screen coding tools + if (test_video_param_.screen_content) + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN); + else + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT); + + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + } - SuperresDenominatorPair denoms = GET_PARAM(3); + double GetAveragePsnr() const { + if (frame_count_) return psnr_ / frame_count_; + return 0.0; + } + + void DoTest() { + std::unique_ptr<libaom_test::VideoSource> video; + video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, + test_video_param_.limit)); + ASSERT_TRUE(video.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, test_video_param_.psnr_threshold) + << "superres_mode_ = " << superres_mode_; + + EXPECT_EQ(test_video_param_.limit, frame_count_) + << "superres_mode_ = " << superres_mode_; + } + + TestVideoParam test_video_param_; + SUPERRES_MODE superres_mode_; + + private: + double psnr_; + unsigned int 
frame_count_; +}; + +TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest, + ::testing::ValuesIn(kTestVideoVectors), + ::testing::ValuesIn(kSuperresModesWithoutParams)); + +// Test parameter list: +// <[needed for EncoderTest], test_video_param_, tuple(superres_denom_, +// superres_kf_denom_)> +typedef tuple<const libaom_test::CodecFactory *, TestVideoParam, + SuperresDenominatorPair> + HorzSuperresFixedTestParam; + +class HorzSuperresFixedEndToEndTest + : public ::testing::TestWithParam<HorzSuperresFixedTestParam>, + public ::libaom_test::EncoderTest { + protected: + HorzSuperresFixedEndToEndTest() + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), + superres_mode_(SUPERRES_FIXED), psnr_(0.0), frame_count_(0) { + SuperresDenominatorPair denoms = GET_PARAM(2); superres_denom_ = ::testing::get<0>(denoms); superres_kf_denom_ = ::testing::get<1>(denoms); } - virtual ~HorzSuperresEndToEndTest() {} + virtual ~HorzSuperresFixedEndToEndTest() {} virtual void SetUp() { InitializeConfig(); @@ -151,8 +243,6 @@ class HorzSuperresEndToEndTest return 0.0; } - double GetPsnrThreshold() { return kPSNRThresholds[test_video_idx_]; } - void DoTest() { std::unique_ptr<libaom_test::VideoSource> video; video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, @@ -161,7 +251,7 @@ class HorzSuperresEndToEndTest ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); const double psnr = GetAveragePsnr(); - EXPECT_GT(psnr, GetPsnrThreshold()) + EXPECT_GT(psnr, test_video_param_.psnr_threshold) << "superres_mode_ = " << superres_mode_ << ", superres_denom_ = " << superres_denom_ << ", superres_kf_denom_ = " << superres_kf_denom_; @@ -172,7 +262,6 @@ class HorzSuperresEndToEndTest << ", superres_kf_denom_ = " << superres_kf_denom_; } - int test_video_idx_; TestVideoParam test_video_param_; SUPERRES_MODE superres_mode_; int superres_denom_; @@ -183,17 +272,16 @@ class HorzSuperresEndToEndTest 
unsigned int frame_count_; }; -TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); } +TEST_P(HorzSuperresFixedEndToEndTest, HorzSuperresFixedTestParam) { DoTest(); } -AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest, - ::testing::Range(0, NUM_TEST_VIDEOS), - ::testing::ValuesIn(kSuperresModesNotQThresh), +AV1_INSTANTIATE_TEST_CASE(HorzSuperresFixedEndToEndTest, + ::testing::ValuesIn(kTestVideoVectors), ::testing::ValuesIn(kSuperresDenominators)); // Test parameter list: -// <[needed for EncoderTest], test_video_idx_, tuple(superres_denom_, -// superres_kf_denom_), tuple(superres_qthresh_,superres_kf_qthresh_)> -typedef tuple<const libaom_test::CodecFactory *, int, SuperresDenominatorPair, +// <[needed for EncoderTest], test_video_param_, +// tuple(superres_qthresh_,superres_kf_qthresh_)> +typedef tuple<const libaom_test::CodecFactory *, TestVideoParam, SuperresQThresholdPair> HorzSuperresQThreshTestParam; @@ -202,15 +290,9 @@ class HorzSuperresQThreshEndToEndTest public ::libaom_test::EncoderTest { protected: HorzSuperresQThreshEndToEndTest() - : EncoderTest(GET_PARAM(0)), test_video_idx_(GET_PARAM(1)), + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), superres_mode_(SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) { - test_video_param_ = kTestVideoVectors[test_video_idx_]; - - SuperresDenominatorPair denoms = GET_PARAM(2); - superres_denom_ = ::testing::get<0>(denoms); - superres_kf_denom_ = ::testing::get<1>(denoms); - - SuperresQThresholdPair qthresholds = GET_PARAM(3); + SuperresQThresholdPair qthresholds = GET_PARAM(2); superres_qthresh_ = ::testing::get<0>(qthresholds); superres_kf_qthresh_ = ::testing::get<1>(qthresholds); } @@ -232,8 +314,6 @@ class HorzSuperresQThreshEndToEndTest // Set superres parameters cfg_.rc_superres_mode = superres_mode_; - cfg_.rc_superres_denominator = superres_denom_; - cfg_.rc_superres_kf_denominator = superres_kf_denom_; cfg_.rc_superres_qthresh = superres_qthresh_; 
cfg_.rc_superres_kf_qthresh = superres_kf_qthresh_; } @@ -274,8 +354,6 @@ class HorzSuperresQThreshEndToEndTest return 0.0; } - double GetPsnrThreshold() { return kPSNRThresholds[test_video_idx_]; } - void DoTest() { std::unique_ptr<libaom_test::VideoSource> video; video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, @@ -284,26 +362,19 @@ class HorzSuperresQThreshEndToEndTest ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); const double psnr = GetAveragePsnr(); - EXPECT_GT(psnr, GetPsnrThreshold()) + EXPECT_GT(psnr, test_video_param_.psnr_threshold) << "superres_mode_ = " << superres_mode_ - << ", superres_denom_ = " << superres_denom_ - << ", superres_kf_denom_ = " << superres_kf_denom_ << ", superres_qthresh_ = " << superres_qthresh_ << ", superres_kf_qthresh_ = " << superres_kf_qthresh_; EXPECT_EQ(test_video_param_.limit, frame_count_) << "superres_mode_ = " << superres_mode_ - << ", superres_denom_ = " << superres_denom_ - << ", superres_kf_denom_ = " << superres_kf_denom_ << ", superres_qthresh_ = " << superres_qthresh_ << ", superres_kf_qthresh_ = " << superres_kf_qthresh_; } - int test_video_idx_; TestVideoParam test_video_param_; SUPERRES_MODE superres_mode_; - int superres_denom_; - int superres_kf_denom_; int superres_qthresh_; int superres_kf_qthresh_; @@ -317,8 +388,7 @@ TEST_P(HorzSuperresQThreshEndToEndTest, HorzSuperresQThreshEndToEndPSNRTest) { } AV1_INSTANTIATE_TEST_CASE(HorzSuperresQThreshEndToEndTest, - ::testing::Range(0, NUM_TEST_VIDEOS), - ::testing::ValuesIn(kSuperresDenominators), + ::testing::ValuesIn(kTestVideoVectors), ::testing::ValuesIn(kSuperresQThresholds)); } // namespace diff --git a/libaom/test/level_test.cc b/libaom/test/level_test.cc new file mode 100644 index 0000000..e3b0ef1 --- /dev/null +++ b/libaom/test/level_test.cc @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. 
All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#include <memory> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" + +namespace { +// Speed settings tested +static const int kCpuUsedVectors[] = { + 1, + 2, + 3, + 4, +}; + +class LevelTest + : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>, + public ::libaom_test::EncoderTest { + protected: + LevelTest() + : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), target_level_(31) {} + + virtual ~LevelTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(encoding_mode_); + if (encoding_mode_ != ::libaom_test::kRealTime) { + cfg_.g_lag_in_frames = 5; + cfg_.rc_end_usage = AOM_VBR; + } else { + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = AOM_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level_); + if (encoding_mode_ != ::libaom_test::kRealTime) { + encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1); + encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7); + encoder->Control(AOME_SET_ARNR_STRENGTH, 5); + } + } + } + + libaom_test::TestMode 
encoding_mode_; + int cpu_used_; + int target_level_; +}; + +TEST_P(LevelTest, TestTargetLevelApi) { + static const aom_codec_iface_t *codec = &aom_codec_av1_cx_algo; + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(codec, &cfg, 0)); + EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, codec, &cfg, 0)); + for (int operating_point = 0; operating_point <= 32; ++operating_point) { + for (int level = 0; level <= 32; ++level) { + const int target_level = operating_point * 100 + level; + if ((level >= 0 && level <= 23) || level == 31 || operating_point > 31) { + EXPECT_EQ(AOM_CODEC_OK, + aom_codec_control(&enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, + target_level)); + } else { + EXPECT_EQ(AOM_CODEC_INVALID_PARAM, + aom_codec_control(&enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX, + target_level)); + } + } + } + EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc)); +} + +TEST_P(LevelTest, TestTargetLevel19) { + std::unique_ptr<libaom_test::VideoSource> video; + video.reset(new libaom_test::Y4mVideoSource("park_joy_90p_8_420.y4m", 0, 10)); + ASSERT_TRUE(video.get() != NULL); + // Level index 19 corresponding to level 6.3. 
+ target_level_ = 19; + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); +} + +AV1_INSTANTIATE_TEST_CASE(LevelTest, + ::testing::Values(::libaom_test::kTwoPassGood), + ::testing::ValuesIn(kCpuUsedVectors)); +} // namespace diff --git a/libaom/test/quantize_func_test.cc b/libaom/test/quantize_func_test.cc index 8dee864..067a981 100644 --- a/libaom/test/quantize_func_test.cc +++ b/libaom/test/quantize_func_test.cc @@ -63,7 +63,7 @@ void highbd_quan64x64_wrapper(QUAN_PARAM_LIST) { HBD_QUAN_FUNC; } -typedef enum { TYPE_B, TYPE_DC, TYPE_FP } QuantType; +enum { TYPE_B, TYPE_DC, TYPE_FP } UENUM1BYTE(QuantType); using ::testing::tuple; typedef tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, QuantType, aom_bit_depth_t> @@ -191,6 +191,13 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> { } } + void FillCoeffRandomRows(int num) { + FillCoeffZero(); + for (int i = 0; i < num; ++i) { + coeff_[i] = GetRandomCoeff(); + } + } + void FillCoeffZero() { FillCoeff(0); } void FillCoeffConstant() { @@ -287,28 +294,31 @@ TEST_P(QuantizeTest, DISABLED_Speed) { const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q]; const int kNumTests = 5000000; aom_usec_timer timer, simd_timer; + int rows = tx_size_high[tx_size_]; + int cols = tx_size_wide[tx_size_]; + for (int cnt = 0; cnt <= rows; cnt++) { + FillCoeffRandomRows(cnt * cols); + + aom_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, + qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan); + } + aom_usec_timer_mark(&timer); - FillCoeffRandom(); - - aom_usec_timer_start(&timer); - for (int n = 0; n < kNumTests; ++n) { - quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, - qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan); - } - aom_usec_timer_mark(&timer); + aom_usec_timer_start(&simd_timer); + for (int n = 0; n < kNumTests; ++n) { + quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff, + 
dqcoeff, dequant, eob, sc->scan, sc->iscan); + } + aom_usec_timer_mark(&simd_timer); - aom_usec_timer_start(&simd_timer); - for (int n = 0; n < kNumTests; ++n) { - quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff, - dqcoeff, dequant, eob, sc->scan, sc->iscan); + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + const int simd_elapsed_time = + static_cast<int>(aom_usec_timer_elapsed(&simd_timer)); + printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time, + simd_elapsed_time, (elapsed_time / simd_elapsed_time)); } - aom_usec_timer_mark(&simd_timer); - - const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); - const int simd_elapsed_time = - static_cast<int>(aom_usec_timer_elapsed(&simd_timer)); - printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time, - simd_elapsed_time, (elapsed_time / simd_elapsed_time)); } using ::testing::make_tuple; @@ -398,6 +408,24 @@ const QuantizeParam kQParamArraySSE2[] = { TX_32X32, TYPE_B, AOM_BITS_10), make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2, TX_32X32, TYPE_B, AOM_BITS_12), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + TX_64X64, TYPE_B, AOM_BITS_8), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + TX_64X64, TYPE_B, AOM_BITS_10), + make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2, + TX_64X64, TYPE_B, AOM_BITS_12), + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, + TX_16X16, TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, TX_8X8, + TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, TX_4X4, + TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_adaptive_c, + &aom_quantize_b_32x32_adaptive_sse2, TX_32X16, TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_adaptive_c, + 
&aom_quantize_b_32x32_adaptive_sse2, TX_16X32, TYPE_B, AOM_BITS_8), + make_tuple(&aom_quantize_b_32x32_adaptive_c, + &aom_quantize_b_32x32_adaptive_sse2, TX_32X32, TYPE_B, AOM_BITS_8) }; INSTANTIATE_TEST_CASE_P(SSE2, QuantizeTest, @@ -411,6 +439,9 @@ INSTANTIATE_TEST_CASE_P( TX_16X16, TYPE_B, AOM_BITS_8), make_tuple(&aom_quantize_b_32x32_c, &aom_quantize_b_32x32_ssse3, TX_32X32, TYPE_B, + AOM_BITS_8), + make_tuple(&aom_quantize_b_64x64_c, + &aom_quantize_b_64x64_ssse3, TX_64X64, TYPE_B, AOM_BITS_8))); #endif // HAVE_SSSE3 && ARCH_X86_64 diff --git a/libaom/test/resize_test.cc b/libaom/test/resize_test.cc index b270b83..39e7d1b 100644 --- a/libaom/test/resize_test.cc +++ b/libaom/test/resize_test.cc @@ -297,7 +297,7 @@ class ResizeInternalTestLarge : public ResizeTest { virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; - EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.5); + EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 3.0); } #if WRITE_COMPRESSED_STREAM @@ -374,6 +374,7 @@ class ResizeRealtimeTest if (video->frame() == 0) { encoder->Control(AV1E_SET_AQ_MODE, 3); encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); } if (change_bitrate_ && video->frame() == 120) { diff --git a/libaom/test/rt_end_to_end_test.cc b/libaom/test/rt_end_to_end_test.cc new file mode 100644 index 0000000..9c3e96b --- /dev/null +++ b/libaom/test/rt_end_to_end_test.cc @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. 
If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <memory> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "test/yuv_video_source.h" + +namespace { + +const unsigned int kFrames = 10; +const int kBitrate = 500; + +// List of psnr thresholds for speed settings 0-8 +const double kPsnrThreshold[9] = { 36.9, 36.9, 36.85, 36.8, 36.6, + 36.4, 36.0, 35.5, 35.0 }; + +typedef struct { + const char *filename; + unsigned int input_bit_depth; + aom_img_fmt fmt; + aom_bit_depth_t bit_depth; + unsigned int profile; +} TestVideoParam; + +std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) { + return os << "TestVideoParam { filename:" << test_arg.filename + << " input_bit_depth:" << test_arg.input_bit_depth + << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth + << " profile:" << test_arg.profile << "}"; +} + +// TODO(kyslov): Add more test vectors +const TestVideoParam kTestVectors[] = { + { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 }, +}; + +// Speed settings tested +const int kCpuUsedVectors[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 }; + +class RTEndToEndTest + : public ::libaom_test::CodecTestWith2Params<TestVideoParam, int>, + public ::libaom_test::EncoderTest { + protected: + RTEndToEndTest() + : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), + cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {} + + virtual ~RTEndToEndTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(::libaom_test::kRealTime); + + cfg_.g_usage = 1; // TODO(kyslov): Move it to encode_test_driver.cc + cfg_.rc_end_usage = AOM_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + } + + 
virtual void BeginPassHook(unsigned int) { + psnr_ = 0.0; + nframes_ = 0; + } + + virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) { + psnr_ += pkt->data.psnr.psnr[0]; + nframes_++; + } + + virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video, + ::libaom_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + encoder->Control(AV1E_SET_TILE_COLUMNS, 1); + encoder->Control(AOME_SET_CPUUSED, cpu_used_); + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT); + } + } + + double GetAveragePsnr() const { + if (nframes_) return psnr_ / nframes_; + return 0.0; + } + + double GetPsnrThreshold() { return kPsnrThreshold[cpu_used_]; } + + void DoTest() { + cfg_.rc_target_bitrate = kBitrate; + cfg_.g_error_resilient = 0; + cfg_.g_profile = test_video_param_.profile; + cfg_.g_input_bit_depth = test_video_param_.input_bit_depth; + cfg_.g_bit_depth = test_video_param_.bit_depth; + init_flags_ = AOM_CODEC_USE_PSNR; + if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH; + + std::unique_ptr<libaom_test::VideoSource> video; + video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0, + kFrames)); + ASSERT_TRUE(video.get() != NULL); + + ASSERT_NO_FATAL_FAILURE(RunLoop(video.get())); + const double psnr = GetAveragePsnr(); + EXPECT_GT(psnr, GetPsnrThreshold()) << "cpu used = " << cpu_used_; + } + + TestVideoParam test_video_param_; + int cpu_used_; + + private: + double psnr_; + unsigned int nframes_; +}; + +class RTEndToEndTestLarge : public RTEndToEndTest {}; + +TEST_P(RTEndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); } + +TEST_P(RTEndToEndTest, EndtoEndPSNRTest) { DoTest(); } + +AV1_INSTANTIATE_TEST_CASE(RTEndToEndTestLarge, + ::testing::ValuesIn(kTestVectors), + ::testing::ValuesIn(kCpuUsedVectors)); + +AV1_INSTANTIATE_TEST_CASE(RTEndToEndTest, ::testing::Values(kTestVectors[0]), + ::testing::Values(kCpuUsedVectors[8])); +} // namespace diff --git 
a/libaom/test/sad_test.cc b/libaom/test/sad_test.cc index 845fe79..87dbb33 100644 --- a/libaom/test/sad_test.cc +++ b/libaom/test/sad_test.cc @@ -35,22 +35,25 @@ typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *second_pred); typedef ::testing::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam; -typedef void (*JntCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred, - int width, int height, const uint8_t *ref, - int ref_stride, - const JNT_COMP_PARAMS *jcp_param); -typedef ::testing::tuple<int, int, JntCompAvgFunc, int> JntCompAvgParam; - -typedef unsigned int (*JntSadMxhFunc)(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int width, int height); -typedef ::testing::tuple<int, int, JntSadMxhFunc, int> JntSadMxhParam; - -typedef uint32_t (*JntSadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const JNT_COMP_PARAMS *jcp_param); -typedef ::testing::tuple<int, int, JntSadMxNAvgFunc, int> JntSadMxNAvgParam; +typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred, + int width, int height, const uint8_t *ref, + int ref_stride, + const DIST_WTD_COMP_PARAMS *jcp_param); +typedef ::testing::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam; + +typedef unsigned int (*DistWtdSadMxhFunc)(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int width, + int height); +typedef ::testing::tuple<int, int, DistWtdSadMxhFunc, int> DistWtdSadMxhParam; + +typedef uint32_t (*DistWtdSadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const DIST_WTD_COMP_PARAMS *jcp_param); +typedef ::testing::tuple<int, int, DistWtdSadMxNAvgFunc, int> + DistWtdSadMxNAvgParam; typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[], int ref_stride, @@ -203,7 +206,7 @@ 
class SADTestBase : public ::testing::Test { return sad; } - void ReferenceJntCompAvg(int block_idx) { + void ReferenceDistWtdCompAvg(int block_idx) { const uint8_t *const reference8 = GetReference(block_idx); const uint8_t *const second_pred8 = second_pred_; uint8_t *const comp_pred8 = comp_pred_; @@ -228,7 +231,7 @@ class SADTestBase : public ::testing::Test { } } - unsigned int ReferenceJntSADavg(int block_idx) { + unsigned int ReferenceDistWtdSADavg(int block_idx) { unsigned int sad = 0; const uint8_t *const reference8 = GetReference(block_idx); const uint8_t *const source8 = source_data_; @@ -305,7 +308,7 @@ class SADTestBase : public ::testing::Test { static uint8_t *comp_pred_test_; static uint8_t *comp_pred8_test_; static uint16_t *comp_pred16_test_; - JNT_COMP_PARAMS jcp_param_; + DIST_WTD_COMP_PARAMS jcp_param_; ACMRandom rnd_; }; @@ -391,13 +394,15 @@ class SADavgTest : public ::testing::WithParamInterface<SadMxNAvgParam>, } }; -class JntCompAvgTest : public ::testing::WithParamInterface<JntCompAvgParam>, - public SADTestBase { +class DistWtdCompAvgTest + : public ::testing::WithParamInterface<DistWtdCompAvgParam>, + public SADTestBase { public: - JntCompAvgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + DistWtdCompAvgTest() + : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} protected: - void jnt_comp_avg(int block_idx) { + void dist_wtd_comp_avg(int block_idx) { const uint8_t *const reference = GetReference(block_idx); ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_, @@ -411,8 +416,8 @@ class JntCompAvgTest : public ::testing::WithParamInterface<JntCompAvgParam>, jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0]; jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1]; - ReferenceJntCompAvg(0); - jnt_comp_avg(0); + ReferenceDistWtdCompAvg(0); + dist_wtd_comp_avg(0); for (int y = 0; y < height_; ++y) for (int x = 0; x < width_; ++x) @@ -423,10 +428,10 @@ class JntCompAvgTest : public 
::testing::WithParamInterface<JntCompAvgParam>, } }; -class JntSADTest : public ::testing::WithParamInterface<JntSadMxhParam>, - public SADTestBase { +class DistWtdSADTest : public ::testing::WithParamInterface<DistWtdSadMxhParam>, + public SADTestBase { public: - JntSADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + DistWtdSADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} protected: unsigned int SAD(int block_idx) { @@ -455,13 +460,14 @@ class JntSADTest : public ::testing::WithParamInterface<JntSadMxhParam>, } }; -class JntSADavgTest : public ::testing::WithParamInterface<JntSadMxNAvgParam>, - public SADTestBase { +class DistWtdSADavgTest + : public ::testing::WithParamInterface<DistWtdSadMxNAvgParam>, + public SADTestBase { public: - JntSADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} + DistWtdSADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {} protected: - unsigned int jnt_SAD_avg(int block_idx) { + unsigned int dist_wtd_SAD_avg(int block_idx) { unsigned int ret; const uint8_t *const reference = GetReference(block_idx); @@ -477,8 +483,8 @@ class JntSADavgTest : public ::testing::WithParamInterface<JntSadMxNAvgParam>, jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0]; jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1]; - const unsigned int reference_sad = ReferenceJntSADavg(0); - const unsigned int exp_sad = jnt_SAD_avg(0); + const unsigned int reference_sad = ReferenceDistWtdSADavg(0); + const unsigned int exp_sad = dist_wtd_SAD_avg(0); ASSERT_EQ(reference_sad, exp_sad); } @@ -608,19 +614,19 @@ TEST_P(SADavgTest, ShortSrc) { source_stride_ = tmp_stride; } -TEST_P(JntCompAvgTest, MaxRef) { +TEST_P(DistWtdCompAvgTest, MaxRef) { FillConstant(reference_data_, reference_stride_, mask_); FillConstant(second_pred_, width_, 0); CheckCompAvg(); } -TEST_P(JntCompAvgTest, MaxSecondPred) { +TEST_P(DistWtdCompAvgTest, MaxSecondPred) { FillConstant(reference_data_, 
reference_stride_, 0); FillConstant(second_pred_, width_, mask_); CheckCompAvg(); } -TEST_P(JntCompAvgTest, ShortRef) { +TEST_P(DistWtdCompAvgTest, ShortRef) { const int tmp_stride = reference_stride_; reference_stride_ >>= 1; FillRandom(reference_data_, reference_stride_); @@ -629,7 +635,7 @@ TEST_P(JntCompAvgTest, ShortRef) { reference_stride_ = tmp_stride; } -TEST_P(JntCompAvgTest, UnalignedRef) { +TEST_P(DistWtdCompAvgTest, UnalignedRef) { // The reference frame, but not the source frame, may be unaligned for // certain types of searches. const int tmp_stride = reference_stride_; @@ -640,19 +646,19 @@ TEST_P(JntCompAvgTest, UnalignedRef) { reference_stride_ = tmp_stride; } -TEST_P(JntSADTest, MaxRef) { +TEST_P(DistWtdSADTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(reference_data_, reference_stride_, mask_); CheckSAD(); } -TEST_P(JntSADTest, MaxSrc) { +TEST_P(DistWtdSADTest, MaxSrc) { FillConstant(source_data_, source_stride_, mask_); FillConstant(reference_data_, reference_stride_, 0); CheckSAD(); } -TEST_P(JntSADTest, ShortRef) { +TEST_P(DistWtdSADTest, ShortRef) { const int tmp_stride = reference_stride_; reference_stride_ >>= 1; FillRandom(source_data_, source_stride_); @@ -661,7 +667,7 @@ TEST_P(JntSADTest, ShortRef) { reference_stride_ = tmp_stride; } -TEST_P(JntSADTest, UnalignedRef) { +TEST_P(DistWtdSADTest, UnalignedRef) { // The reference frame, but not the source frame, may be unaligned for // certain types of searches. 
const int tmp_stride = reference_stride_; @@ -672,7 +678,7 @@ TEST_P(JntSADTest, UnalignedRef) { reference_stride_ = tmp_stride; } -TEST_P(JntSADTest, ShortSrc) { +TEST_P(DistWtdSADTest, ShortSrc) { const int tmp_stride = source_stride_; source_stride_ >>= 1; int test_count = 2000; @@ -685,20 +691,20 @@ TEST_P(JntSADTest, ShortSrc) { source_stride_ = tmp_stride; } -TEST_P(JntSADavgTest, MaxRef) { +TEST_P(DistWtdSADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(reference_data_, reference_stride_, mask_); FillConstant(second_pred_, width_, 0); CheckSAD(); } -TEST_P(JntSADavgTest, MaxSrc) { +TEST_P(DistWtdSADavgTest, MaxSrc) { FillConstant(source_data_, source_stride_, mask_); FillConstant(reference_data_, reference_stride_, 0); FillConstant(second_pred_, width_, 0); CheckSAD(); } -TEST_P(JntSADavgTest, ShortRef) { +TEST_P(DistWtdSADavgTest, ShortRef) { const int tmp_stride = reference_stride_; reference_stride_ >>= 1; FillRandom(source_data_, source_stride_); @@ -708,7 +714,7 @@ TEST_P(JntSADavgTest, ShortRef) { reference_stride_ = tmp_stride; } -TEST_P(JntSADavgTest, UnalignedRef) { +TEST_P(DistWtdSADavgTest, UnalignedRef) { // The reference frame, but not the source frame, may be unaligned for // certain types of searches. 
const int tmp_stride = reference_stride_; @@ -720,7 +726,7 @@ TEST_P(JntSADavgTest, UnalignedRef) { reference_stride_ = tmp_stride; } -TEST_P(JntSADavgTest, ShortSrc) { +TEST_P(DistWtdSADavgTest, ShortSrc) { const int tmp_stride = source_stride_; source_stride_ >>= 1; int test_count = 2000; @@ -947,47 +953,48 @@ const SadMxNAvgParam avg_c_tests[] = { INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests)); // TODO(chengchen): add highbd tests -const JntCompAvgParam jnt_comp_avg_c_tests[] = { - make_tuple(128, 128, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(128, 64, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(64, 128, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(64, 64, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(64, 32, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(32, 64, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(32, 32, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(32, 16, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(16, 32, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(16, 16, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(16, 8, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(8, 16, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(8, 8, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(8, 4, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(4, 8, &aom_jnt_comp_avg_pred_c, -1), - make_tuple(4, 4, &aom_jnt_comp_avg_pred_c, -1), +const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(16, 8, 
&aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1), + make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1), }; -INSTANTIATE_TEST_CASE_P(C, JntCompAvgTest, - ::testing::ValuesIn(jnt_comp_avg_c_tests)); - -const JntSadMxNAvgParam jnt_avg_c_tests[] = { - make_tuple(128, 128, &aom_jnt_sad128x128_avg_c, -1), - make_tuple(128, 64, &aom_jnt_sad128x64_avg_c, -1), - make_tuple(64, 128, &aom_jnt_sad64x128_avg_c, -1), - make_tuple(64, 64, &aom_jnt_sad64x64_avg_c, -1), - make_tuple(64, 32, &aom_jnt_sad64x32_avg_c, -1), - make_tuple(32, 64, &aom_jnt_sad32x64_avg_c, -1), - make_tuple(32, 32, &aom_jnt_sad32x32_avg_c, -1), - make_tuple(32, 16, &aom_jnt_sad32x16_avg_c, -1), - make_tuple(16, 32, &aom_jnt_sad16x32_avg_c, -1), - make_tuple(16, 16, &aom_jnt_sad16x16_avg_c, -1), - make_tuple(16, 8, &aom_jnt_sad16x8_avg_c, -1), - make_tuple(8, 16, &aom_jnt_sad8x16_avg_c, -1), - make_tuple(8, 8, &aom_jnt_sad8x8_avg_c, -1), - make_tuple(8, 4, &aom_jnt_sad8x4_avg_c, -1), - make_tuple(4, 8, &aom_jnt_sad4x8_avg_c, -1), - make_tuple(4, 4, &aom_jnt_sad4x4_avg_c, -1), +INSTANTIATE_TEST_CASE_P(C, DistWtdCompAvgTest, + ::testing::ValuesIn(dist_wtd_comp_avg_c_tests)); + +const DistWtdSadMxNAvgParam dist_wtd_avg_c_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_c, -1), + make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_c, -1), + make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_c, -1), + make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_c, -1), + make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_c, -1), + make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_c, -1), + make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_c, -1), + make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_c, -1), + make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_c, -1), + make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_c, -1), + make_tuple(16, 
8, &aom_dist_wtd_sad16x8_avg_c, -1), + make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_c, -1), + make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_c, -1), + make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_c, -1), + make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_c, -1), + make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_c, -1), }; -INSTANTIATE_TEST_CASE_P(C, JntSADavgTest, ::testing::ValuesIn(jnt_avg_c_tests)); +INSTANTIATE_TEST_CASE_P(C, DistWtdSADavgTest, + ::testing::ValuesIn(dist_wtd_avg_c_tests)); const SadMxNx4Param x4d_c_tests[] = { make_tuple(128, 128, &aom_sad128x128x4d_c, -1), @@ -1251,7 +1258,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); #if HAVE_SSSE3 // Note: These are named sse2, but part of ssse3 file and only built and linked // when ssse3 is enabled. -const JntSadMxhParam jnt_sad_sse2_tests[] = { +const DistWtdSadMxhParam dist_wtd_sad_sse2_tests[] = { make_tuple(4, 4, &aom_sad4xh_sse2, -1), make_tuple(4, 8, &aom_sad4xh_sse2, -1), make_tuple(8, 4, &aom_sad8xh_sse2, -1), @@ -1275,8 +1282,8 @@ const JntSadMxhParam jnt_sad_sse2_tests[] = { make_tuple(16, 64, &aom_sad16xh_sse2, -1), make_tuple(64, 16, &aom_sad64xh_sse2, -1), }; -INSTANTIATE_TEST_CASE_P(SSE2, JntSADTest, - ::testing::ValuesIn(jnt_sad_sse2_tests)); +INSTANTIATE_TEST_CASE_P(SSE2, DistWtdSADTest, + ::testing::ValuesIn(dist_wtd_sad_sse2_tests)); #endif // HAVE_SSSE3 @@ -1285,49 +1292,49 @@ INSTANTIATE_TEST_CASE_P(SSE2, JntSADTest, #endif // HAVE_SSE3 #if HAVE_SSSE3 -const JntCompAvgParam jnt_comp_avg_ssse3_tests[] = { - make_tuple(128, 128, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(128, 64, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(64, 128, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(64, 64, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(64, 32, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(32, 64, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(32, 32, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(32, 16, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(16, 32, 
&aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(16, 16, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(16, 8, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(8, 16, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(8, 8, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(8, 4, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(4, 8, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(4, 4, &aom_jnt_comp_avg_pred_ssse3, -1), - make_tuple(16, 16, &aom_jnt_comp_avg_pred_ssse3, -1), +const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1), + make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1), }; -INSTANTIATE_TEST_CASE_P(SSSE3, JntCompAvgTest, - ::testing::ValuesIn(jnt_comp_avg_ssse3_tests)); - -const JntSadMxNAvgParam jnt_avg_ssse3_tests[] = { - make_tuple(128, 128, &aom_jnt_sad128x128_avg_ssse3, -1), - make_tuple(128, 64, &aom_jnt_sad128x64_avg_ssse3, -1), - make_tuple(64, 128, &aom_jnt_sad64x128_avg_ssse3, -1), - make_tuple(64, 64, &aom_jnt_sad64x64_avg_ssse3, -1), - make_tuple(64, 32, &aom_jnt_sad64x32_avg_ssse3, 
-1), - make_tuple(32, 64, &aom_jnt_sad32x64_avg_ssse3, -1), - make_tuple(32, 32, &aom_jnt_sad32x32_avg_ssse3, -1), - make_tuple(32, 16, &aom_jnt_sad32x16_avg_ssse3, -1), - make_tuple(16, 32, &aom_jnt_sad16x32_avg_ssse3, -1), - make_tuple(16, 16, &aom_jnt_sad16x16_avg_ssse3, -1), - make_tuple(16, 8, &aom_jnt_sad16x8_avg_ssse3, -1), - make_tuple(8, 16, &aom_jnt_sad8x16_avg_ssse3, -1), - make_tuple(8, 8, &aom_jnt_sad8x8_avg_ssse3, -1), - make_tuple(8, 4, &aom_jnt_sad8x4_avg_ssse3, -1), - make_tuple(4, 8, &aom_jnt_sad4x8_avg_ssse3, -1), - make_tuple(4, 4, &aom_jnt_sad4x4_avg_ssse3, -1), +INSTANTIATE_TEST_CASE_P(SSSE3, DistWtdCompAvgTest, + ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests)); + +const DistWtdSadMxNAvgParam dist_wtd_avg_ssse3_tests[] = { + make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_ssse3, -1), + make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_ssse3, -1), + make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_ssse3, -1), + make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_ssse3, -1), + make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_ssse3, -1), + make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_ssse3, -1), + make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_ssse3, -1), + make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_ssse3, -1), + make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_ssse3, -1), + make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_ssse3, -1), + make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_ssse3, -1), + make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_ssse3, -1), + make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_ssse3, -1), + make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_ssse3, -1), + make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_ssse3, -1), + make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_ssse3, -1), }; -INSTANTIATE_TEST_CASE_P(SSSE3, JntSADavgTest, - ::testing::ValuesIn(jnt_avg_ssse3_tests)); +INSTANTIATE_TEST_CASE_P(SSSE3, DistWtdSADavgTest, + ::testing::ValuesIn(dist_wtd_avg_ssse3_tests)); #endif // HAVE_SSSE3 #if HAVE_SSE4_1 diff --git a/libaom/test/sum_squares_test.cc 
b/libaom/test/sum_squares_test.cc index cb518c8..f26a646 100644 --- a/libaom/test/sum_squares_test.cc +++ b/libaom/test/sum_squares_test.cc @@ -255,7 +255,7 @@ class SSETest : public ::testing::TestWithParam<SSETestParam> { aom_free(src_); aom_free(ref_); } - void RunTest(int isRandom, int width, int height); + void RunTest(int isRandom, int width, int height, int run_times); void GenRandomData(int width, int height, int stride) { uint16_t *pSrc = (uint16_t *)src_; @@ -298,8 +298,9 @@ class SSETest : public ::testing::TestWithParam<SSETestParam> { ACMRandom rnd_; }; -void SSETest::RunTest(int isRandom, int width, int height) { +void SSETest::RunTest(int isRandom, int width, int height, int run_times) { int failed = 0; + aom_usec_timer ref_timer, test_timer; for (int k = 0; k < 3; k++) { int stride = 4 << rnd_(7); // Up to 256 stride while (stride < width) { // Make sure it's valid @@ -326,31 +327,58 @@ void SSETest::RunTest(int isRandom, int width, int height) { pRef = CONVERT_TO_BYTEPTR(ref_); } res_ref = params_.ref_func(pSrc, stride, pRef, stride, width, height); + res_tst = params_.tst_func(pSrc, stride, pRef, stride, width, height); + if (run_times > 1) { + aom_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(pSrc, stride, pRef, stride, width, height); + } + aom_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(aom_usec_timer_elapsed(&ref_timer)); - ASM_REGISTER_STATE_CHECK( - res_tst = params_.tst_func(pSrc, stride, pRef, stride, width, height)); - - if (!failed) { - failed = res_ref != res_tst; - EXPECT_EQ(res_ref, res_tst) - << "Error:" << (isHbd_ ? 
"hbd " : " ") << k << " SSE Test [" << width - << "x" << height << "] C output does not match optimized output."; + aom_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(pSrc, stride, pRef, stride, width, height); + } + aom_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(aom_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%d\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } else { + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error:" << (isHbd_ ? "hbd " : " ") << k << " SSE Test [" + << width << "x" << height + << "] C output does not match optimized output."; + } } } } TEST_P(SSETest, OperationCheck) { for (int height = 4; height <= 128; height += 4) { - RunTest(1, width_, height); // GenRandomData + RunTest(1, width_, height, 1); // GenRandomData } } TEST_P(SSETest, ExtremeValues) { for (int height = 4; height <= 128; height += 4) { - RunTest(0, width_, height); + RunTest(0, width_, height, 1); } } +TEST_P(SSETest, DISABLED_Speed) { + for (int height = 4; height <= 128; height += 4) { + RunTest(1, width_, height, 100); + } +} #if HAVE_SSE4_1 TestSSEFuncs sse_sse4[] = { TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1), TestSSEFuncs(&aom_highbd_sse_c, diff --git a/libaom/test/test-data.sha1 b/libaom/test/test-data.sha1 index 95342a8..bd63206 100644 --- a/libaom/test/test-data.sha1 +++ b/libaom/test/test-data.sha1 @@ -532,3 +532,9 @@ e94687eb0e90179b3800b6d5e11eb7e9bfb34eec *av1-1-b8-22-svc-L1T2.ivf 2bc12b16385ea14323bc79607fb8dfbd7edaf8ef *av1-1-b8-22-svc-L1T2.ivf.md5 32ef2f14ee9cb11a24a22934f4c065e926e5d236 *av1-1-b8-22-svc-L2T2.ivf f476a10ff06d750129f8229755d51e17ff141b2a *av1-1-b8-22-svc-L2T2.ivf.md5 +afca5502a489692b0a3c120370b0f43b8fc572a1 *av1-1-b8-04-cdfupdate.ivf +13b9423155a08d5e3a2fd9ae4a973bb046718cdf *av1-1-b8-04-cdfupdate.ivf.md5 +f064290d7fcd3b3de19020e8aec6c43c88d3a505 
*av1-1-b8-05-mv.ivf +bff316e63ded5559116bdc2fa4aa97ad7b1a1761 *av1-1-b8-05-mv.ivf.md5 +b48a717c7c003b8dd23c3c2caed1ac673380fdb3 *av1-1-b8-06-mfmv.ivf +1424e3cb53e00eb56b94f4c725826274212c42b6 *av1-1-b8-06-mfmv.ivf.md5 diff --git a/libaom/test/test.cmake b/libaom/test/test.cmake index 12f2319..a44737a 100644 --- a/libaom/test/test.cmake +++ b/libaom/test/test.cmake @@ -64,10 +64,14 @@ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES "${AOM_ROOT}/test/encode_test_driver.cc" "${AOM_ROOT}/test/encode_test_driver.h" "${AOM_ROOT}/test/end_to_end_test.cc" + "${AOM_ROOT}/test/fwd_kf_test.cc" + "${AOM_ROOT}/test/gf_max_pyr_height_test.cc" + "${AOM_ROOT}/test/rt_end_to_end_test.cc" "${AOM_ROOT}/test/error_resilience_test.cc" "${AOM_ROOT}/test/frame_size_tests.cc" "${AOM_ROOT}/test/horz_superres_test.cc" "${AOM_ROOT}/test/i420_video_source.h" + "${AOM_ROOT}/test/level_test.cc" "${AOM_ROOT}/test/lossless_test.cc" "${AOM_ROOT}/test/monochrome_test.cc" "${AOM_ROOT}/test/qm_test.cc" @@ -120,7 +124,8 @@ if(NOT BUILD_SHARED_LIBS) "${AOM_ROOT}/test/film_grain_table_test.cc" "${AOM_ROOT}/test/segment_binarization_sync.cc" "${AOM_ROOT}/test/superframe_test.cc" - "${AOM_ROOT}/test/tile_independence_test.cc") + "${AOM_ROOT}/test/tile_independence_test.cc" + "${AOM_ROOT}/test/yuv_temporal_filter_test.cc") endif() list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON @@ -233,13 +238,6 @@ if(ENABLE_TESTS) "make sure it's in your PATH.") endif() - if(MSVC) # Force static run time to avoid collisions with googletest. - include("${AOM_ROOT}/build/cmake/msvc_runtime.cmake") - if(BUILD_SHARED_LIBS) - set(AOM_DISABLE_GTEST_CMAKE 1) - endif() - endif() - if(BUILD_SHARED_LIBS AND APPLE) # Silence an RPATH warning. 
set(CMAKE_MACOSX_RPATH 1) endif() @@ -247,15 +245,16 @@ if(ENABLE_TESTS) include_directories( "${AOM_ROOT}/third_party/googletest/src/googletest/include") - if(AOM_DISABLE_GTEST_CMAKE) - include_directories("${AOM_ROOT}/third_party/googletest/src/googletest") - add_library( - gtest - STATIC - "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc") + include_directories("${AOM_ROOT}/third_party/googletest/src/googletest") + add_library( + aom_gtest + STATIC "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc") + if(MSVC OR WIN32) + target_compile_definitions(aom_gtest PRIVATE GTEST_OS_WINDOWS=1) + elseif(CONFIG_MULTITHREAD AND CMAKE_USE_PTHREADS_INIT) + target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=1) else() - add_subdirectory("${AOM_ROOT}/third_party/googletest/src/googletest" - EXCLUDE_FROM_ALL) + target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=0) endif() endif() @@ -307,12 +306,12 @@ function(setup_aom_test_targets) add_executable(test_intra_pred_speed ${AOM_TEST_INTRA_PRED_SPEED_SOURCES} $<TARGET_OBJECTS:aom_common_app_util>) target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom - gtest) + aom_gtest) list(APPEND AOM_APP_TARGETS test_intra_pred_speed) endif() endif() - target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom gtest) + target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom aom_gtest) if(CONFIG_LIBYUV) target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:yuv>) diff --git a/libaom/test/test_data_util.cmake b/libaom/test/test_data_util.cmake index 6d684cb..c3c86aa 100644 --- a/libaom/test/test_data_util.cmake +++ b/libaom/test/test_data_util.cmake @@ -500,6 +500,12 @@ if(CONFIG_AV1_DECODER) "av1-1-b8-03-sizeup.mkv.md5" "av1-1-b8-03-sizedown.mkv" "av1-1-b8-03-sizedown.mkv.md5" + "av1-1-b8-04-cdfupdate.ivf" + "av1-1-b8-04-cdfupdate.ivf.md5" + "av1-1-b8-05-mv.ivf" + "av1-1-b8-05-mv.ivf.md5" + "av1-1-b8-06-mfmv.ivf" + "av1-1-b8-06-mfmv.ivf.md5" "av1-1-b8-22-svc-L2T1.ivf" 
"av1-1-b8-22-svc-L2T1.ivf.md5" "av1-1-b8-22-svc-L1T2.ivf" diff --git a/libaom/test/test_vectors.cc b/libaom/test/test_vectors.cc index d2f333f..d2cd901 100644 --- a/libaom/test/test_vectors.cc +++ b/libaom/test/test_vectors.cc @@ -16,125 +16,243 @@ namespace libaom_test { #define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0])) #if CONFIG_AV1_DECODER -const char *const kAV1TestVectors[] = { - "av1-1-b8-00-quantizer-00.ivf", "av1-1-b8-00-quantizer-01.ivf", - "av1-1-b8-00-quantizer-02.ivf", "av1-1-b8-00-quantizer-03.ivf", - "av1-1-b8-00-quantizer-04.ivf", "av1-1-b8-00-quantizer-05.ivf", - "av1-1-b8-00-quantizer-06.ivf", "av1-1-b8-00-quantizer-07.ivf", - "av1-1-b8-00-quantizer-08.ivf", "av1-1-b8-00-quantizer-09.ivf", - "av1-1-b8-00-quantizer-10.ivf", "av1-1-b8-00-quantizer-11.ivf", - "av1-1-b8-00-quantizer-12.ivf", "av1-1-b8-00-quantizer-13.ivf", - "av1-1-b8-00-quantizer-14.ivf", "av1-1-b8-00-quantizer-15.ivf", - "av1-1-b8-00-quantizer-16.ivf", "av1-1-b8-00-quantizer-17.ivf", - "av1-1-b8-00-quantizer-18.ivf", "av1-1-b8-00-quantizer-19.ivf", - "av1-1-b8-00-quantizer-20.ivf", "av1-1-b8-00-quantizer-21.ivf", - "av1-1-b8-00-quantizer-22.ivf", "av1-1-b8-00-quantizer-23.ivf", - "av1-1-b8-00-quantizer-24.ivf", "av1-1-b8-00-quantizer-25.ivf", - "av1-1-b8-00-quantizer-26.ivf", "av1-1-b8-00-quantizer-27.ivf", - "av1-1-b8-00-quantizer-28.ivf", "av1-1-b8-00-quantizer-29.ivf", - "av1-1-b8-00-quantizer-30.ivf", "av1-1-b8-00-quantizer-31.ivf", - "av1-1-b8-00-quantizer-32.ivf", "av1-1-b8-00-quantizer-33.ivf", - "av1-1-b8-00-quantizer-34.ivf", "av1-1-b8-00-quantizer-35.ivf", - "av1-1-b8-00-quantizer-36.ivf", "av1-1-b8-00-quantizer-37.ivf", - "av1-1-b8-00-quantizer-38.ivf", "av1-1-b8-00-quantizer-39.ivf", - "av1-1-b8-00-quantizer-40.ivf", "av1-1-b8-00-quantizer-41.ivf", - "av1-1-b8-00-quantizer-42.ivf", "av1-1-b8-00-quantizer-43.ivf", - "av1-1-b8-00-quantizer-44.ivf", "av1-1-b8-00-quantizer-45.ivf", - "av1-1-b8-00-quantizer-46.ivf", "av1-1-b8-00-quantizer-47.ivf", - 
"av1-1-b8-00-quantizer-48.ivf", "av1-1-b8-00-quantizer-49.ivf", - "av1-1-b8-00-quantizer-50.ivf", "av1-1-b8-00-quantizer-51.ivf", - "av1-1-b8-00-quantizer-52.ivf", "av1-1-b8-00-quantizer-53.ivf", - "av1-1-b8-00-quantizer-54.ivf", "av1-1-b8-00-quantizer-55.ivf", - "av1-1-b8-00-quantizer-56.ivf", "av1-1-b8-00-quantizer-57.ivf", - "av1-1-b8-00-quantizer-58.ivf", "av1-1-b8-00-quantizer-59.ivf", - "av1-1-b8-00-quantizer-60.ivf", "av1-1-b8-00-quantizer-61.ivf", - "av1-1-b8-00-quantizer-62.ivf", "av1-1-b8-00-quantizer-63.ivf", - "av1-1-b10-00-quantizer-00.ivf", "av1-1-b10-00-quantizer-01.ivf", - "av1-1-b10-00-quantizer-02.ivf", "av1-1-b10-00-quantizer-03.ivf", - "av1-1-b10-00-quantizer-04.ivf", "av1-1-b10-00-quantizer-05.ivf", - "av1-1-b10-00-quantizer-06.ivf", "av1-1-b10-00-quantizer-07.ivf", - "av1-1-b10-00-quantizer-08.ivf", "av1-1-b10-00-quantizer-09.ivf", - "av1-1-b10-00-quantizer-10.ivf", "av1-1-b10-00-quantizer-11.ivf", - "av1-1-b10-00-quantizer-12.ivf", "av1-1-b10-00-quantizer-13.ivf", - "av1-1-b10-00-quantizer-14.ivf", "av1-1-b10-00-quantizer-15.ivf", - "av1-1-b10-00-quantizer-16.ivf", "av1-1-b10-00-quantizer-17.ivf", - "av1-1-b10-00-quantizer-18.ivf", "av1-1-b10-00-quantizer-19.ivf", - "av1-1-b10-00-quantizer-20.ivf", "av1-1-b10-00-quantizer-21.ivf", - "av1-1-b10-00-quantizer-22.ivf", "av1-1-b10-00-quantizer-23.ivf", - "av1-1-b10-00-quantizer-24.ivf", "av1-1-b10-00-quantizer-25.ivf", - "av1-1-b10-00-quantizer-26.ivf", "av1-1-b10-00-quantizer-27.ivf", - "av1-1-b10-00-quantizer-28.ivf", "av1-1-b10-00-quantizer-29.ivf", - "av1-1-b10-00-quantizer-30.ivf", "av1-1-b10-00-quantizer-31.ivf", - "av1-1-b10-00-quantizer-32.ivf", "av1-1-b10-00-quantizer-33.ivf", - "av1-1-b10-00-quantizer-34.ivf", "av1-1-b10-00-quantizer-35.ivf", - "av1-1-b10-00-quantizer-36.ivf", "av1-1-b10-00-quantizer-37.ivf", - "av1-1-b10-00-quantizer-38.ivf", "av1-1-b10-00-quantizer-39.ivf", - "av1-1-b10-00-quantizer-40.ivf", "av1-1-b10-00-quantizer-41.ivf", - "av1-1-b10-00-quantizer-42.ivf", 
"av1-1-b10-00-quantizer-43.ivf", - "av1-1-b10-00-quantizer-44.ivf", "av1-1-b10-00-quantizer-45.ivf", - "av1-1-b10-00-quantizer-46.ivf", "av1-1-b10-00-quantizer-47.ivf", - "av1-1-b10-00-quantizer-48.ivf", "av1-1-b10-00-quantizer-49.ivf", - "av1-1-b10-00-quantizer-50.ivf", "av1-1-b10-00-quantizer-51.ivf", - "av1-1-b10-00-quantizer-52.ivf", "av1-1-b10-00-quantizer-53.ivf", - "av1-1-b10-00-quantizer-54.ivf", "av1-1-b10-00-quantizer-55.ivf", - "av1-1-b10-00-quantizer-56.ivf", "av1-1-b10-00-quantizer-57.ivf", - "av1-1-b10-00-quantizer-58.ivf", "av1-1-b10-00-quantizer-59.ivf", - "av1-1-b10-00-quantizer-60.ivf", "av1-1-b10-00-quantizer-61.ivf", - "av1-1-b10-00-quantizer-62.ivf", "av1-1-b10-00-quantizer-63.ivf", - "av1-1-b8-01-size-16x16.ivf", "av1-1-b8-01-size-16x18.ivf", - "av1-1-b8-01-size-16x32.ivf", "av1-1-b8-01-size-16x34.ivf", - "av1-1-b8-01-size-16x64.ivf", "av1-1-b8-01-size-16x66.ivf", - "av1-1-b8-01-size-18x16.ivf", "av1-1-b8-01-size-18x18.ivf", - "av1-1-b8-01-size-18x32.ivf", "av1-1-b8-01-size-18x34.ivf", - "av1-1-b8-01-size-18x64.ivf", "av1-1-b8-01-size-18x66.ivf", - "av1-1-b8-01-size-196x196.ivf", "av1-1-b8-01-size-196x198.ivf", - "av1-1-b8-01-size-196x200.ivf", "av1-1-b8-01-size-196x202.ivf", - "av1-1-b8-01-size-196x208.ivf", "av1-1-b8-01-size-196x210.ivf", - "av1-1-b8-01-size-196x224.ivf", "av1-1-b8-01-size-196x226.ivf", - "av1-1-b8-01-size-198x196.ivf", "av1-1-b8-01-size-198x198.ivf", - "av1-1-b8-01-size-198x200.ivf", "av1-1-b8-01-size-198x202.ivf", - "av1-1-b8-01-size-198x208.ivf", "av1-1-b8-01-size-198x210.ivf", - "av1-1-b8-01-size-198x224.ivf", "av1-1-b8-01-size-198x226.ivf", - "av1-1-b8-01-size-200x196.ivf", "av1-1-b8-01-size-200x198.ivf", - "av1-1-b8-01-size-200x200.ivf", "av1-1-b8-01-size-200x202.ivf", - "av1-1-b8-01-size-200x208.ivf", "av1-1-b8-01-size-200x210.ivf", - "av1-1-b8-01-size-200x224.ivf", "av1-1-b8-01-size-200x226.ivf", - "av1-1-b8-01-size-202x196.ivf", "av1-1-b8-01-size-202x198.ivf", - "av1-1-b8-01-size-202x200.ivf", 
"av1-1-b8-01-size-202x202.ivf", - "av1-1-b8-01-size-202x208.ivf", "av1-1-b8-01-size-202x210.ivf", - "av1-1-b8-01-size-202x224.ivf", "av1-1-b8-01-size-202x226.ivf", - "av1-1-b8-01-size-208x196.ivf", "av1-1-b8-01-size-208x198.ivf", - "av1-1-b8-01-size-208x200.ivf", "av1-1-b8-01-size-208x202.ivf", - "av1-1-b8-01-size-208x208.ivf", "av1-1-b8-01-size-208x210.ivf", - "av1-1-b8-01-size-208x224.ivf", "av1-1-b8-01-size-208x226.ivf", - "av1-1-b8-01-size-210x196.ivf", "av1-1-b8-01-size-210x198.ivf", - "av1-1-b8-01-size-210x200.ivf", "av1-1-b8-01-size-210x202.ivf", - "av1-1-b8-01-size-210x208.ivf", "av1-1-b8-01-size-210x210.ivf", - "av1-1-b8-01-size-210x224.ivf", "av1-1-b8-01-size-210x226.ivf", - "av1-1-b8-01-size-224x196.ivf", "av1-1-b8-01-size-224x198.ivf", - "av1-1-b8-01-size-224x200.ivf", "av1-1-b8-01-size-224x202.ivf", - "av1-1-b8-01-size-224x208.ivf", "av1-1-b8-01-size-224x210.ivf", - "av1-1-b8-01-size-224x224.ivf", "av1-1-b8-01-size-224x226.ivf", - "av1-1-b8-01-size-226x196.ivf", "av1-1-b8-01-size-226x198.ivf", - "av1-1-b8-01-size-226x200.ivf", "av1-1-b8-01-size-226x202.ivf", - "av1-1-b8-01-size-226x208.ivf", "av1-1-b8-01-size-226x210.ivf", - "av1-1-b8-01-size-226x224.ivf", "av1-1-b8-01-size-226x226.ivf", - "av1-1-b8-01-size-32x16.ivf", "av1-1-b8-01-size-32x18.ivf", - "av1-1-b8-01-size-32x32.ivf", "av1-1-b8-01-size-32x34.ivf", - "av1-1-b8-01-size-32x64.ivf", "av1-1-b8-01-size-32x66.ivf", - "av1-1-b8-01-size-34x16.ivf", "av1-1-b8-01-size-34x18.ivf", - "av1-1-b8-01-size-34x32.ivf", "av1-1-b8-01-size-34x34.ivf", - "av1-1-b8-01-size-34x64.ivf", "av1-1-b8-01-size-34x66.ivf", - "av1-1-b8-01-size-64x16.ivf", "av1-1-b8-01-size-64x18.ivf", - "av1-1-b8-01-size-64x32.ivf", "av1-1-b8-01-size-64x34.ivf", - "av1-1-b8-01-size-64x64.ivf", "av1-1-b8-01-size-64x66.ivf", - "av1-1-b8-01-size-66x16.ivf", "av1-1-b8-01-size-66x18.ivf", - "av1-1-b8-01-size-66x32.ivf", "av1-1-b8-01-size-66x34.ivf", - "av1-1-b8-01-size-66x64.ivf", "av1-1-b8-01-size-66x66.ivf", - "av1-1-b8-02-allintra.ivf", 
"av1-1-b8-03-sizedown.mkv", - "av1-1-b8-03-sizeup.mkv", "av1-1-b8-22-svc-L1T2.ivf", - "av1-1-b8-22-svc-L2T1.ivf", "av1-1-b8-22-svc-L2T2.ivf" -}; +const char *const kAV1TestVectors[] = { "av1-1-b8-00-quantizer-00.ivf", + "av1-1-b8-00-quantizer-01.ivf", + "av1-1-b8-00-quantizer-02.ivf", + "av1-1-b8-00-quantizer-03.ivf", + "av1-1-b8-00-quantizer-04.ivf", + "av1-1-b8-00-quantizer-05.ivf", + "av1-1-b8-00-quantizer-06.ivf", + "av1-1-b8-00-quantizer-07.ivf", + "av1-1-b8-00-quantizer-08.ivf", + "av1-1-b8-00-quantizer-09.ivf", + "av1-1-b8-00-quantizer-10.ivf", + "av1-1-b8-00-quantizer-11.ivf", + "av1-1-b8-00-quantizer-12.ivf", + "av1-1-b8-00-quantizer-13.ivf", + "av1-1-b8-00-quantizer-14.ivf", + "av1-1-b8-00-quantizer-15.ivf", + "av1-1-b8-00-quantizer-16.ivf", + "av1-1-b8-00-quantizer-17.ivf", + "av1-1-b8-00-quantizer-18.ivf", + "av1-1-b8-00-quantizer-19.ivf", + "av1-1-b8-00-quantizer-20.ivf", + "av1-1-b8-00-quantizer-21.ivf", + "av1-1-b8-00-quantizer-22.ivf", + "av1-1-b8-00-quantizer-23.ivf", + "av1-1-b8-00-quantizer-24.ivf", + "av1-1-b8-00-quantizer-25.ivf", + "av1-1-b8-00-quantizer-26.ivf", + "av1-1-b8-00-quantizer-27.ivf", + "av1-1-b8-00-quantizer-28.ivf", + "av1-1-b8-00-quantizer-29.ivf", + "av1-1-b8-00-quantizer-30.ivf", + "av1-1-b8-00-quantizer-31.ivf", + "av1-1-b8-00-quantizer-32.ivf", + "av1-1-b8-00-quantizer-33.ivf", + "av1-1-b8-00-quantizer-34.ivf", + "av1-1-b8-00-quantizer-35.ivf", + "av1-1-b8-00-quantizer-36.ivf", + "av1-1-b8-00-quantizer-37.ivf", + "av1-1-b8-00-quantizer-38.ivf", + "av1-1-b8-00-quantizer-39.ivf", + "av1-1-b8-00-quantizer-40.ivf", + "av1-1-b8-00-quantizer-41.ivf", + "av1-1-b8-00-quantizer-42.ivf", + "av1-1-b8-00-quantizer-43.ivf", + "av1-1-b8-00-quantizer-44.ivf", + "av1-1-b8-00-quantizer-45.ivf", + "av1-1-b8-00-quantizer-46.ivf", + "av1-1-b8-00-quantizer-47.ivf", + "av1-1-b8-00-quantizer-48.ivf", + "av1-1-b8-00-quantizer-49.ivf", + "av1-1-b8-00-quantizer-50.ivf", + "av1-1-b8-00-quantizer-51.ivf", + "av1-1-b8-00-quantizer-52.ivf", + 
"av1-1-b8-00-quantizer-53.ivf", + "av1-1-b8-00-quantizer-54.ivf", + "av1-1-b8-00-quantizer-55.ivf", + "av1-1-b8-00-quantizer-56.ivf", + "av1-1-b8-00-quantizer-57.ivf", + "av1-1-b8-00-quantizer-58.ivf", + "av1-1-b8-00-quantizer-59.ivf", + "av1-1-b8-00-quantizer-60.ivf", + "av1-1-b8-00-quantizer-61.ivf", + "av1-1-b8-00-quantizer-62.ivf", + "av1-1-b8-00-quantizer-63.ivf", + "av1-1-b10-00-quantizer-00.ivf", + "av1-1-b10-00-quantizer-01.ivf", + "av1-1-b10-00-quantizer-02.ivf", + "av1-1-b10-00-quantizer-03.ivf", + "av1-1-b10-00-quantizer-04.ivf", + "av1-1-b10-00-quantizer-05.ivf", + "av1-1-b10-00-quantizer-06.ivf", + "av1-1-b10-00-quantizer-07.ivf", + "av1-1-b10-00-quantizer-08.ivf", + "av1-1-b10-00-quantizer-09.ivf", + "av1-1-b10-00-quantizer-10.ivf", + "av1-1-b10-00-quantizer-11.ivf", + "av1-1-b10-00-quantizer-12.ivf", + "av1-1-b10-00-quantizer-13.ivf", + "av1-1-b10-00-quantizer-14.ivf", + "av1-1-b10-00-quantizer-15.ivf", + "av1-1-b10-00-quantizer-16.ivf", + "av1-1-b10-00-quantizer-17.ivf", + "av1-1-b10-00-quantizer-18.ivf", + "av1-1-b10-00-quantizer-19.ivf", + "av1-1-b10-00-quantizer-20.ivf", + "av1-1-b10-00-quantizer-21.ivf", + "av1-1-b10-00-quantizer-22.ivf", + "av1-1-b10-00-quantizer-23.ivf", + "av1-1-b10-00-quantizer-24.ivf", + "av1-1-b10-00-quantizer-25.ivf", + "av1-1-b10-00-quantizer-26.ivf", + "av1-1-b10-00-quantizer-27.ivf", + "av1-1-b10-00-quantizer-28.ivf", + "av1-1-b10-00-quantizer-29.ivf", + "av1-1-b10-00-quantizer-30.ivf", + "av1-1-b10-00-quantizer-31.ivf", + "av1-1-b10-00-quantizer-32.ivf", + "av1-1-b10-00-quantizer-33.ivf", + "av1-1-b10-00-quantizer-34.ivf", + "av1-1-b10-00-quantizer-35.ivf", + "av1-1-b10-00-quantizer-36.ivf", + "av1-1-b10-00-quantizer-37.ivf", + "av1-1-b10-00-quantizer-38.ivf", + "av1-1-b10-00-quantizer-39.ivf", + "av1-1-b10-00-quantizer-40.ivf", + "av1-1-b10-00-quantizer-41.ivf", + "av1-1-b10-00-quantizer-42.ivf", + "av1-1-b10-00-quantizer-43.ivf", + "av1-1-b10-00-quantizer-44.ivf", + "av1-1-b10-00-quantizer-45.ivf", + 
"av1-1-b10-00-quantizer-46.ivf", + "av1-1-b10-00-quantizer-47.ivf", + "av1-1-b10-00-quantizer-48.ivf", + "av1-1-b10-00-quantizer-49.ivf", + "av1-1-b10-00-quantizer-50.ivf", + "av1-1-b10-00-quantizer-51.ivf", + "av1-1-b10-00-quantizer-52.ivf", + "av1-1-b10-00-quantizer-53.ivf", + "av1-1-b10-00-quantizer-54.ivf", + "av1-1-b10-00-quantizer-55.ivf", + "av1-1-b10-00-quantizer-56.ivf", + "av1-1-b10-00-quantizer-57.ivf", + "av1-1-b10-00-quantizer-58.ivf", + "av1-1-b10-00-quantizer-59.ivf", + "av1-1-b10-00-quantizer-60.ivf", + "av1-1-b10-00-quantizer-61.ivf", + "av1-1-b10-00-quantizer-62.ivf", + "av1-1-b10-00-quantizer-63.ivf", + "av1-1-b8-01-size-16x16.ivf", + "av1-1-b8-01-size-16x18.ivf", + "av1-1-b8-01-size-16x32.ivf", + "av1-1-b8-01-size-16x34.ivf", + "av1-1-b8-01-size-16x64.ivf", + "av1-1-b8-01-size-16x66.ivf", + "av1-1-b8-01-size-18x16.ivf", + "av1-1-b8-01-size-18x18.ivf", + "av1-1-b8-01-size-18x32.ivf", + "av1-1-b8-01-size-18x34.ivf", + "av1-1-b8-01-size-18x64.ivf", + "av1-1-b8-01-size-18x66.ivf", + "av1-1-b8-01-size-196x196.ivf", + "av1-1-b8-01-size-196x198.ivf", + "av1-1-b8-01-size-196x200.ivf", + "av1-1-b8-01-size-196x202.ivf", + "av1-1-b8-01-size-196x208.ivf", + "av1-1-b8-01-size-196x210.ivf", + "av1-1-b8-01-size-196x224.ivf", + "av1-1-b8-01-size-196x226.ivf", + "av1-1-b8-01-size-198x196.ivf", + "av1-1-b8-01-size-198x198.ivf", + "av1-1-b8-01-size-198x200.ivf", + "av1-1-b8-01-size-198x202.ivf", + "av1-1-b8-01-size-198x208.ivf", + "av1-1-b8-01-size-198x210.ivf", + "av1-1-b8-01-size-198x224.ivf", + "av1-1-b8-01-size-198x226.ivf", + "av1-1-b8-01-size-200x196.ivf", + "av1-1-b8-01-size-200x198.ivf", + "av1-1-b8-01-size-200x200.ivf", + "av1-1-b8-01-size-200x202.ivf", + "av1-1-b8-01-size-200x208.ivf", + "av1-1-b8-01-size-200x210.ivf", + "av1-1-b8-01-size-200x224.ivf", + "av1-1-b8-01-size-200x226.ivf", + "av1-1-b8-01-size-202x196.ivf", + "av1-1-b8-01-size-202x198.ivf", + "av1-1-b8-01-size-202x200.ivf", + "av1-1-b8-01-size-202x202.ivf", + "av1-1-b8-01-size-202x208.ivf", + 
"av1-1-b8-01-size-202x210.ivf", + "av1-1-b8-01-size-202x224.ivf", + "av1-1-b8-01-size-202x226.ivf", + "av1-1-b8-01-size-208x196.ivf", + "av1-1-b8-01-size-208x198.ivf", + "av1-1-b8-01-size-208x200.ivf", + "av1-1-b8-01-size-208x202.ivf", + "av1-1-b8-01-size-208x208.ivf", + "av1-1-b8-01-size-208x210.ivf", + "av1-1-b8-01-size-208x224.ivf", + "av1-1-b8-01-size-208x226.ivf", + "av1-1-b8-01-size-210x196.ivf", + "av1-1-b8-01-size-210x198.ivf", + "av1-1-b8-01-size-210x200.ivf", + "av1-1-b8-01-size-210x202.ivf", + "av1-1-b8-01-size-210x208.ivf", + "av1-1-b8-01-size-210x210.ivf", + "av1-1-b8-01-size-210x224.ivf", + "av1-1-b8-01-size-210x226.ivf", + "av1-1-b8-01-size-224x196.ivf", + "av1-1-b8-01-size-224x198.ivf", + "av1-1-b8-01-size-224x200.ivf", + "av1-1-b8-01-size-224x202.ivf", + "av1-1-b8-01-size-224x208.ivf", + "av1-1-b8-01-size-224x210.ivf", + "av1-1-b8-01-size-224x224.ivf", + "av1-1-b8-01-size-224x226.ivf", + "av1-1-b8-01-size-226x196.ivf", + "av1-1-b8-01-size-226x198.ivf", + "av1-1-b8-01-size-226x200.ivf", + "av1-1-b8-01-size-226x202.ivf", + "av1-1-b8-01-size-226x208.ivf", + "av1-1-b8-01-size-226x210.ivf", + "av1-1-b8-01-size-226x224.ivf", + "av1-1-b8-01-size-226x226.ivf", + "av1-1-b8-01-size-32x16.ivf", + "av1-1-b8-01-size-32x18.ivf", + "av1-1-b8-01-size-32x32.ivf", + "av1-1-b8-01-size-32x34.ivf", + "av1-1-b8-01-size-32x64.ivf", + "av1-1-b8-01-size-32x66.ivf", + "av1-1-b8-01-size-34x16.ivf", + "av1-1-b8-01-size-34x18.ivf", + "av1-1-b8-01-size-34x32.ivf", + "av1-1-b8-01-size-34x34.ivf", + "av1-1-b8-01-size-34x64.ivf", + "av1-1-b8-01-size-34x66.ivf", + "av1-1-b8-01-size-64x16.ivf", + "av1-1-b8-01-size-64x18.ivf", + "av1-1-b8-01-size-64x32.ivf", + "av1-1-b8-01-size-64x34.ivf", + "av1-1-b8-01-size-64x64.ivf", + "av1-1-b8-01-size-64x66.ivf", + "av1-1-b8-01-size-66x16.ivf", + "av1-1-b8-01-size-66x18.ivf", + "av1-1-b8-01-size-66x32.ivf", + "av1-1-b8-01-size-66x34.ivf", + "av1-1-b8-01-size-66x64.ivf", + "av1-1-b8-01-size-66x66.ivf", + "av1-1-b8-02-allintra.ivf", + 
"av1-1-b8-03-sizedown.mkv", + "av1-1-b8-03-sizeup.mkv", + "av1-1-b8-04-cdfupdate.ivf", + "av1-1-b8-05-mv.ivf", + "av1-1-b8-06-mfmv.ivf", + "av1-1-b8-22-svc-L1T2.ivf", + "av1-1-b8-22-svc-L2T1.ivf", + "av1-1-b8-22-svc-L2T2.ivf" }; const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors); #endif // CONFIG_AV1_DECODER diff --git a/libaom/test/variance_test.cc b/libaom/test/variance_test.cc index 0df314b..1942de0 100644 --- a/libaom/test/variance_test.cc +++ b/libaom/test/variance_test.cc @@ -43,10 +43,10 @@ typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride, typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride); typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); -typedef unsigned int (*JntSubpixAvgVarMxNFunc)( +typedef unsigned int (*DistWtdSubpixAvgVarMxNFunc)( const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b, int b_stride, uint32_t *sse, const uint8_t *second_pred, - const JNT_COMP_PARAMS *jcp_param); + const DIST_WTD_COMP_PARAMS *jcp_param); typedef uint32_t (*ObmcSubpelVarFunc)(const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, @@ -216,10 +216,10 @@ static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src, return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h))); } -static uint32_t jnt_subpel_avg_variance_ref( +static uint32_t dist_wtd_subpel_avg_variance_ref( const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, int l2w, int l2h, int xoff, int yoff, uint32_t *sse_ptr, bool use_high_bit_depth, - aom_bit_depth_t bit_depth, JNT_COMP_PARAMS *jcp_param) { + aom_bit_depth_t bit_depth, DIST_WTD_COMP_PARAMS *jcp_param) { int64_t se = 0; uint64_t sse = 0; const int w = 1 << l2w; @@ -703,13 +703,14 @@ class SubpelVarianceTest protected: void RefTest(); void ExtremeRefTest(); + void SpeedTest(); ACMRandom rnd_; uint8_t *src_; uint8_t *ref_; uint8_t *sec_; 
TestParams<FunctionType> params_; - JNT_COMP_PARAMS jcp_param_; + DIST_WTD_COMP_PARAMS jcp_param_; // some relay helpers bool use_high_bit_depth() const { return params_.use_high_bit_depth; } @@ -785,6 +786,41 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() { } } +template <typename SubpelVarianceFunctionType> +void SubpelVarianceTest<SubpelVarianceFunctionType>::SpeedTest() { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { + src_[j] = rnd_.Rand8(); + } + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + ref_[j] = rnd_.Rand8(); + } + } else { + for (int j = 0; j < block_size(); j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + } + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); + } + } + + unsigned int sse1; + int run_time = 1000000000 / block_size(); + aom_usec_timer timer; + + aom_usec_timer_start(&timer); + for (int i = 0; i < run_time; ++i) { + int x = rnd_(8); + int y = rnd_(8); + params_.func(ref_, width() + 1, x, y, src_, width(), &sse1); + } + aom_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + printf("sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(), + params_.bit_depth, elapsed_time); +} + template <> void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() { for (int x = 0; x < 8; ++x) { @@ -820,7 +856,7 @@ void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() { } template <> -void SubpelVarianceTest<JntSubpixAvgVarMxNFunc>::RefTest() { +void SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { if (!use_high_bit_depth()) { @@ -849,7 +885,7 @@ void SubpelVarianceTest<JntSubpixAvgVarMxNFunc>::RefTest() { ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y, src_, width(), &sse1, sec_, &jcp_param_)); - var2 = jnt_subpel_avg_variance_ref( + var2 = 
dist_wtd_subpel_avg_variance_ref( ref_, src_, sec_, params_.log2width, params_.log2height, x, y, &sse2, use_high_bit_depth(), params_.bit_depth, &jcp_param_); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; @@ -1022,7 +1058,8 @@ typedef MainTestClass<VarianceMxNFunc> AvxMseTest; typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest; typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest; typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest; -typedef SubpelVarianceTest<JntSubpixAvgVarMxNFunc> AvxJntSubpelAvgVarianceTest; +typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc> + AvxDistWtdSubpelAvgVarianceTest; typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest; TEST_P(AvxSseTest, RefSse) { RefTestSse(); } @@ -1039,7 +1076,7 @@ TEST_P(SumOfSquaresTest, Ref) { RefTest(); } TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); } TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); } -TEST_P(AvxJntSubpelAvgVarianceTest, Ref) { RefTest(); } +TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); } TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); } TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); } @@ -1121,36 +1158,35 @@ INSTANTIATE_TEST_CASE_P( SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0), SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0))); -typedef TestParams<JntSubpixAvgVarMxNFunc> JntSubpelAvgVarianceParams; +typedef TestParams<DistWtdSubpixAvgVarMxNFunc> DistWtdSubpelAvgVarianceParams; INSTANTIATE_TEST_CASE_P( - C, AvxJntSubpelAvgVarianceTest, - ::testing::Values( - JntSubpelAvgVarianceParams(6, 6, &aom_jnt_sub_pixel_avg_variance64x64_c, - 0), - JntSubpelAvgVarianceParams(6, 5, &aom_jnt_sub_pixel_avg_variance64x32_c, - 0), - JntSubpelAvgVarianceParams(5, 6, &aom_jnt_sub_pixel_avg_variance32x64_c, - 0), - 
JntSubpelAvgVarianceParams(5, 5, &aom_jnt_sub_pixel_avg_variance32x32_c, - 0), - JntSubpelAvgVarianceParams(5, 4, &aom_jnt_sub_pixel_avg_variance32x16_c, - 0), - JntSubpelAvgVarianceParams(4, 5, &aom_jnt_sub_pixel_avg_variance16x32_c, - 0), - JntSubpelAvgVarianceParams(4, 4, &aom_jnt_sub_pixel_avg_variance16x16_c, - 0), - JntSubpelAvgVarianceParams(4, 3, &aom_jnt_sub_pixel_avg_variance16x8_c, - 0), - JntSubpelAvgVarianceParams(3, 4, &aom_jnt_sub_pixel_avg_variance8x16_c, - 0), - JntSubpelAvgVarianceParams(3, 3, &aom_jnt_sub_pixel_avg_variance8x8_c, - 0), - JntSubpelAvgVarianceParams(3, 2, &aom_jnt_sub_pixel_avg_variance8x4_c, - 0), - JntSubpelAvgVarianceParams(2, 3, &aom_jnt_sub_pixel_avg_variance4x8_c, - 0), - JntSubpelAvgVarianceParams(2, 2, &aom_jnt_sub_pixel_avg_variance4x4_c, - 0))); + C, AvxDistWtdSubpelAvgVarianceTest, + ::testing::Values(DistWtdSubpelAvgVarianceParams( + 6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0), + DistWtdSubpelAvgVarianceParams( + 6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_c, 0), + DistWtdSubpelAvgVarianceParams( + 5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_c, 0), + DistWtdSubpelAvgVarianceParams( + 5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_c, 0), + DistWtdSubpelAvgVarianceParams( + 5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_c, 0), + DistWtdSubpelAvgVarianceParams( + 4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_c, 0), + DistWtdSubpelAvgVarianceParams( + 4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_c, 0), + DistWtdSubpelAvgVarianceParams( + 4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_c, 0), + DistWtdSubpelAvgVarianceParams( + 3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_c, 0), + DistWtdSubpelAvgVarianceParams( + 3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_c, 0), + DistWtdSubpelAvgVarianceParams( + 3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_c, 0), + DistWtdSubpelAvgVarianceParams( + 2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0), + DistWtdSubpelAvgVarianceParams( + 2, 2, 
&aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0))); INSTANTIATE_TEST_CASE_P( C, AvxObmcSubpelVarianceTest, @@ -1188,6 +1224,7 @@ TEST_P(AvxHBDVarianceTest, OneQuarter) { OneQuarterTest(); } TEST_P(AvxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); } TEST_P(AvxHBDSubpelVarianceTest, Ref) { RefTest(); } TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); } +TEST_P(AvxHBDSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); } TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } /* TODO(debargha): This test does not support the highbd version @@ -1677,6 +1714,9 @@ INSTANTIATE_TEST_CASE_P(AVX2, AvxHBDVarianceTest, #endif // HAVE_AVX2 const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = { + SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_sse2, 12), + SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_sse2, 12), + SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_sse2, 12), SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_sse2, 12), SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_sse2, 12), SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_sse2, 12), @@ -1688,6 +1728,9 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = { SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_sse2, 12), SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_sse2, 12), SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_sse2, 12), + SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_sse2, 10), + SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_sse2, 10), + SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_sse2, 10), SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_sse2, 10), SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_sse2, 10), SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_sse2, 10), @@ -1699,6 +1742,9 @@ const 
SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = { SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_sse2, 10), SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_sse2, 10), SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_sse2, 10), + SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_sse2, 8), + SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_sse2, 8), + SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_sse2, 8), SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_sse2, 8), SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_sse2, 8), SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_sse2, 8), @@ -1711,7 +1757,6 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = { SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_sse2, 8), SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8) }; - INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDSubpelVarianceTest, ::testing::ValuesIn(kArrayHBDSubpelVariance_sse2)); @@ -1840,44 +1885,34 @@ INSTANTIATE_TEST_CASE_P( 0))); INSTANTIATE_TEST_CASE_P( - SSSE3, AvxJntSubpelAvgVarianceTest, + SSSE3, AvxDistWtdSubpelAvgVarianceTest, ::testing::Values( - JntSubpelAvgVarianceParams(6, 6, - &aom_jnt_sub_pixel_avg_variance64x64_ssse3, - 0), - JntSubpelAvgVarianceParams(6, 5, - &aom_jnt_sub_pixel_avg_variance64x32_ssse3, - 0), - JntSubpelAvgVarianceParams(5, 6, - &aom_jnt_sub_pixel_avg_variance32x64_ssse3, - 0), - JntSubpelAvgVarianceParams(5, 5, - &aom_jnt_sub_pixel_avg_variance32x32_ssse3, - 0), - JntSubpelAvgVarianceParams(5, 4, - &aom_jnt_sub_pixel_avg_variance32x16_ssse3, - 0), - JntSubpelAvgVarianceParams(4, 5, - &aom_jnt_sub_pixel_avg_variance16x32_ssse3, - 0), - JntSubpelAvgVarianceParams(4, 4, - &aom_jnt_sub_pixel_avg_variance16x16_ssse3, - 0), - JntSubpelAvgVarianceParams(4, 3, - &aom_jnt_sub_pixel_avg_variance16x8_ssse3, - 0), - JntSubpelAvgVarianceParams(3, 4, - 
&aom_jnt_sub_pixel_avg_variance8x16_ssse3, - 0), - JntSubpelAvgVarianceParams(3, 3, - &aom_jnt_sub_pixel_avg_variance8x8_ssse3, 0), - JntSubpelAvgVarianceParams(3, 2, - &aom_jnt_sub_pixel_avg_variance8x4_ssse3, 0), - JntSubpelAvgVarianceParams(2, 3, - &aom_jnt_sub_pixel_avg_variance4x8_ssse3, 0), - JntSubpelAvgVarianceParams(2, 2, - &aom_jnt_sub_pixel_avg_variance4x4_ssse3, - 0))); + DistWtdSubpelAvgVarianceParams( + 6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0), + DistWtdSubpelAvgVarianceParams( + 2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0))); #endif // HAVE_SSSE3 #if HAVE_SSE4_1 diff --git a/libaom/test/warp_filter_test_util.cc b/libaom/test/warp_filter_test_util.cc index 69b2ed4..9208af8 100644 --- a/libaom/test/warp_filter_test_util.cc +++ b/libaom/test/warp_filter_test_util.cc @@ -149,7 +149,7 @@ void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) { int do_average = 0; conv_params = 
get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd); - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; const int num_loops = 1000000000 / (out_w + out_h); aom_usec_timer timer; @@ -222,9 +222,9 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) { conv_params = get_conv_params(0, 0, bd); } if (jj >= 4) { - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; } else { - conv_params.use_jnt_comp_avg = 1; + conv_params.use_dist_wtd_comp_avg = 1; conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; } @@ -236,9 +236,9 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) { get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd); } if (jj >= 4) { - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; } else { - conv_params.use_jnt_comp_avg = 1; + conv_params.use_dist_wtd_comp_avg = 1; conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; } @@ -342,7 +342,7 @@ void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) { sub_x = 0; sub_y = 0; int do_average = 0; - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd); const int num_loops = 1000000000 / (out_w + out_h); @@ -419,9 +419,9 @@ void AV1HighbdWarpFilterTest::RunCheckOutput( conv_params = get_conv_params(0, 0, bd); } if (jj >= 4) { - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; } else { - conv_params.use_jnt_comp_avg = 1; + conv_params.use_dist_wtd_comp_avg = 1; conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; } @@ -436,9 +436,9 @@ void AV1HighbdWarpFilterTest::RunCheckOutput( get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd); } if (jj >= 4) 
{ - conv_params.use_jnt_comp_avg = 0; + conv_params.use_dist_wtd_comp_avg = 0; } else { - conv_params.use_jnt_comp_avg = 1; + conv_params.use_dist_wtd_comp_avg = 1; conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0]; conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1]; } diff --git a/libaom/test/yuv_temporal_filter_test.cc b/libaom/test/yuv_temporal_filter_test.cc new file mode 100644 index 0000000..fcaf0df --- /dev/null +++ b/libaom/test/yuv_temporal_filter_test.cc @@ -0,0 +1,726 @@ +/* + * Copyright (c) 2019, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/av1_rtcd.h" +#include "test/acm_random.h" +#include "test/register_state_check.h" +#include "aom_ports/aom_timer.h" +#include "aom_ports/mem.h" + +namespace { + +using ::libaom_test::ACMRandom; + +const int MAX_WIDTH = 32; +const int MAX_HEIGHT = 32; + +typedef void (*YUVTemporalFilterFunc)( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, + uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, + uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + +struct TemporalFilterWithBd { + TemporalFilterWithBd(YUVTemporalFilterFunc func, int bitdepth) + : temporal_filter(func), bd(bitdepth) {} + + YUVTemporalFilterFunc temporal_filter; + int bd; +}; + +std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) { + return os << "Bitdepth: " << tf.bd; +} + +int GetFilterWeight(unsigned int row, unsigned int col, + unsigned int block_height, unsigned int block_width, + const int *const blk_fw, int use_32x32) { + if (use_32x32) { + return blk_fw[0]; + } + + return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)]; +} + +template <typename PixelType> +int GetModIndex(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + int mod = sum_dist * 3 / index; + mod += rounding; + mod >>= strength; + + mod = AOMMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +// Lowbitdepth version +template <> +int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + unsigned int index_mult[14] = { + 0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 
0, 15124 + }; + + assert(index >= 0 && index <= 13); + assert(index_mult[index] != 0); + + int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16; + mod += rounding; + mod >>= strength; + + mod = AOMMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +// Highbitdepth version +template <> +int GetModIndex<uint16_t>(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + int64_t index_mult[14] = { 0U, 0U, 0U, 0U, + 3221225472U, 2576980378U, 2147483648U, 1840700270U, + 1610612736U, 1431655766U, 1288490189U, 1171354718U, + 0U, 991146300U }; + + assert(index >= 0 && index <= 13); + assert(index_mult[index] != 0); + + int mod = static_cast<int>((sum_dist * index_mult[index]) >> 32); + mod += rounding; + mod >>= strength; + + mod = AOMMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +template <typename PixelType> +void SetArray(PixelType *pixel_array, int width, int height, int stride, + int val) { + for (int row = 0; row < height; row++) { + for (int col = 0; col < width; col++) { + pixel_array[col] = val; + } + pixel_array += stride; + } +} + +template <typename PixelType> +void SetArray(PixelType *pixel_array, int width, int height, int stride, + ACMRandom *rnd, int low_val, int high_val) { + EXPECT_LE(low_val, high_val); + + for (int row = 0; row < height; row++) { + for (int col = 0; col < width; col++) { + const int val = + static_cast<int>((*rnd).PseudoUniform(high_val - low_val)); + pixel_array[col] = low_val + val; + } + pixel_array += stride; + } +} + +template <typename ValueType> +bool CheckArrayEqual(const ValueType *arr_1, const ValueType *arr_2, int width, + int height, int stride_1, int stride_2) { + for (int row = 0; row < height; row++) { + for (int col = 0; col < width; col++) { + if (arr_1[col] != arr_2[col]) { + return false; + } + } + arr_1 += stride_1; + arr_2 += stride_2; + } + return true; +} + +template <typename ValueType> +void PrintArrayDiff(const 
ValueType *arr_1, const ValueType *arr_2, int width, + int height, int stride_1, int stride_2) { + const ValueType *arr_1_start = arr_1, *arr_2_start = arr_2; + + printf("Array 1:\n"); + for (int row = 0; row < height; ++row) { + for (int col = 0; col < width; ++col) { + if (arr_1[col] != arr_2[col]) { + printf("*%3d", arr_1[col]); + } else { + printf("%4d", arr_1[col]); + } + } + printf("\n"); + arr_1 += stride_1; + arr_2 += stride_2; + } + + arr_1 = arr_1_start; + arr_2 = arr_2_start; + + printf("Array 2:\n"); + for (int row = 0; row < height; ++row) { + for (int col = 0; col < width; ++col) { + if (arr_1[col] != arr_2[col]) { + printf("*%3d", arr_2[col]); + } else { + printf("%4d", arr_2[col]); + } + } + printf("\n"); + arr_1 += stride_1; + arr_2 += stride_2; + } + + arr_1 = arr_1_start; + arr_2 = arr_2_start; + printf("Difference:\n"); + for (int row = 0; row < height; ++row) { + for (int col = 0; col < width; ++col) { + printf("%4d", arr_1[col] - arr_2[col]); + } + printf("\n"); + arr_1 += stride_1; + arr_2 += stride_2; + } +} + +template <typename PixelType> +void ApplyReferenceFilter(const PixelType *y_src, const PixelType *y_pre, + const PixelType *u_src, const PixelType *v_src, + const PixelType *u_pre, const PixelType *v_pre, + unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, + const int *const blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, + uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + const int uv_block_width = block_width >> ss_x, + uv_block_height = block_height >> ss_y; + const int y_src_stride = block_width, y_pre_stride = block_width; + const int uv_src_stride = uv_block_width, uv_pre_stride = uv_block_width; + const int y_diff_stride = block_width, uv_diff_stride = uv_block_width; + const int y_count_stride = block_width, u_count_stride = uv_block_width, + v_count_stride = uv_block_width; + const int y_accum_stride = block_width, u_accum_stride = 
uv_block_width, + v_accum_stride = uv_block_width; + + int y_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + int u_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + int v_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + + const int rounding = (1 << strength) >> 1; + + // Get the square diffs + for (int row = 0; row < (int)block_height; row++) { + for (int col = 0; col < (int)block_width; col++) { + const int diff = + y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col]; + y_dif[row * y_diff_stride + col] = diff * diff; + } + } + + for (int row = 0; row < (int)uv_block_height; row++) { + for (int col = 0; col < (int)uv_block_width; col++) { + const int u_diff = + u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col]; + const int v_diff = + v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col]; + u_dif[row * uv_diff_stride + col] = u_diff * u_diff; + v_dif[row * uv_diff_stride + col] = v_diff * v_diff; + } + } + + // Apply the filter to luma + for (int row = 0; row < (int)block_height; row++) { + for (int col = 0; col < (int)block_width; col++) { + const int uv_row = row >> ss_y; + const int uv_col = col >> ss_x; + const int filter_weight = GetFilterWeight(row, col, block_height, + block_width, blk_fw, use_32x32); + + // First we get the modifier for the current y pixel + const int y_pixel = y_pre[row * y_pre_stride + col]; + int y_num_used = 0; + int y_mod = 0; + + // Sum the neighboring 3x3 y pixels + for (int row_step = -1; row_step <= 1; row_step++) { + for (int col_step = -1; col_step <= 1; col_step++) { + const int sub_row = row + row_step; + const int sub_col = col + col_step; + + if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 && + sub_col < (int)block_width) { + y_mod += y_dif[sub_row * y_diff_stride + sub_col]; + y_num_used++; + } + } + } + + // Sum the corresponding uv pixels to the current y modifier + // Note we are rounding down instead of rounding to the nearest pixel. 
+ y_mod += u_dif[uv_row * uv_diff_stride + uv_col]; + y_mod += v_dif[uv_row * uv_diff_stride + uv_col]; + + y_num_used += 2; + + // Set the modifier + y_mod = GetModIndex<PixelType>(y_mod, y_num_used, rounding, strength, + filter_weight); + + // Accumulate the result + y_count[row * y_count_stride + col] += y_mod; + y_accum[row * y_accum_stride + col] += y_mod * y_pixel; + } + } + + // Apply the filter to chroma + for (int uv_row = 0; uv_row < (int)uv_block_height; uv_row++) { + for (int uv_col = 0; uv_col < (int)uv_block_width; uv_col++) { + const int y_row = uv_row << ss_y; + const int y_col = uv_col << ss_x; + const int filter_weight = GetFilterWeight( + uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32); + + const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col]; + const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col]; + + int uv_num_used = 0; + int u_mod = 0, v_mod = 0; + + // Sum the neighboring 3x3 chromal pixels to the chroma modifier + for (int row_step = -1; row_step <= 1; row_step++) { + for (int col_step = -1; col_step <= 1; col_step++) { + const int sub_row = uv_row + row_step; + const int sub_col = uv_col + col_step; + + if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 && + sub_col < uv_block_width) { + u_mod += u_dif[sub_row * uv_diff_stride + sub_col]; + v_mod += v_dif[sub_row * uv_diff_stride + sub_col]; + uv_num_used++; + } + } + } + + // Sum all the luma pixels associated with the current luma pixel + for (int row_step = 0; row_step < 1 + ss_y; row_step++) { + for (int col_step = 0; col_step < 1 + ss_x; col_step++) { + const int sub_row = y_row + row_step; + const int sub_col = y_col + col_step; + const int y_diff = y_dif[sub_row * y_diff_stride + sub_col]; + + u_mod += y_diff; + v_mod += y_diff; + uv_num_used++; + } + } + + // Set the modifier + u_mod = GetModIndex<PixelType>(u_mod, uv_num_used, rounding, strength, + filter_weight); + v_mod = GetModIndex<PixelType>(v_mod, uv_num_used, rounding, strength, + 
filter_weight); + + // Accumulate the result + u_count[uv_row * u_count_stride + uv_col] += u_mod; + u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel; + v_count[uv_row * v_count_stride + uv_col] += v_mod; + v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel; + } + } +} + +class YUVTemporalFilterTest + : public ::testing::TestWithParam<TemporalFilterWithBd> { + public: + virtual void SetUp() { + filter_func_ = GetParam().temporal_filter; + bd_ = GetParam().bd; + use_highbd_ = (bd_ != 8); + + rnd_.Reset(ACMRandom::DeterministicSeed()); + saturate_test_ = 0; + num_repeats_ = 10; + + ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12); + } + + protected: + template <typename PixelType> + void CompareTestWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); + template <typename PixelType> + void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y, + int filter_strength, int use_32x32, + const int *filter_weight); + template <typename PixelType> + void ApplyTestFilter(const PixelType *y_src, int y_src_stride, + const PixelType *y_pre, int y_pre_stride, + const PixelType *u_src, const PixelType *v_src, + int uv_src_stride, const PixelType *u_pre, + const PixelType *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, + int use_32x32, uint32_t *y_accum, uint16_t *y_count, + uint32_t *u_accumu, uint16_t *u_count, uint32_t *v_accum, + uint16_t *v_count); + + YUVTemporalFilterFunc filter_func_; + ACMRandom rnd_; + int saturate_test_; + int num_repeats_; + int use_highbd_; + int bd_; +}; + +template <> +void YUVTemporalFilterTest::ApplyTestFilter<uint8_t>( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned 
int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + ASM_REGISTER_STATE_CHECK( + filter_func_(y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, + uv_src_stride, u_pre, v_pre, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, use_32x32, + y_accum, y_count, u_accum, u_count, v_accum, v_count)); +} + +template <> +void YUVTemporalFilterTest::ApplyTestFilter<uint16_t>( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + ASM_REGISTER_STATE_CHECK(filter_func_( + CONVERT_TO_BYTEPTR(y_src), y_src_stride, CONVERT_TO_BYTEPTR(y_pre), + y_pre_stride, CONVERT_TO_BYTEPTR(u_src), CONVERT_TO_BYTEPTR(v_src), + uv_src_stride, CONVERT_TO_BYTEPTR(u_pre), CONVERT_TO_BYTEPTR(v_pre), + uv_pre_stride, block_width, block_height, ss_x, ss_y, strength, blk_fw, + use_32x32, y_accum, y_count, u_accum, u_count, v_accum, v_count)); +} + +template <typename PixelType> +void YUVTemporalFilterTest::CompareTestWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + const int uv_width = width >> ss_x, uv_height = height >> ss_y; + const int y_stride = width, uv_stride = uv_width; + + DECLARE_ALIGNED(16, PixelType, y_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, PixelType, y_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, y_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, 
y_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, y_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, y_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + + DECLARE_ALIGNED(16, PixelType, u_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, PixelType, u_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + + DECLARE_ALIGNED(16, PixelType, v_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, PixelType, v_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 }; + + for (int repeats = 0; repeats < num_repeats_; repeats++) { + if (saturate_test_) { + const int max_val = (1 << bd_) - 1; + SetArray(y_src, width, height, y_stride, max_val); + SetArray(y_pre, width, height, y_stride, 0); + SetArray(u_src, uv_width, uv_height, uv_stride, max_val); + SetArray(u_pre, uv_width, uv_height, uv_stride, 0); + SetArray(v_src, uv_width, uv_height, uv_stride, max_val); + SetArray(v_pre, uv_width, uv_height, uv_stride, 0); + } else { + const int max_val = 7 << (bd_ - 8); + SetArray(y_src, width, height, y_stride, &rnd_, 0, max_val); + SetArray(y_pre, width, height, y_stride, &rnd_, 0, max_val); + SetArray(u_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val); + SetArray(u_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val); + SetArray(v_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val); + SetArray(v_pre, uv_width, uv_height, 
uv_stride, &rnd_, 0, max_val); + } + + ApplyReferenceFilter<PixelType>( + y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, y_accum_ref, y_count_ref, + u_accum_ref, u_count_ref, v_accum_ref, v_count_ref); + + ApplyTestFilter(y_src, y_stride, y_pre, y_stride, u_src, v_src, uv_stride, + u_pre, v_pre, uv_stride, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, y_accum_tst, + y_count_tst, u_accum_tst, u_count_tst, v_accum_tst, + v_count_tst); + + EXPECT_TRUE(CheckArrayEqual(y_accum_tst, y_accum_ref, width, height, + y_stride, y_stride)); + EXPECT_TRUE(CheckArrayEqual(y_count_tst, y_count_ref, width, height, + y_stride, y_stride)); + EXPECT_TRUE(CheckArrayEqual(u_accum_tst, u_accum_ref, uv_width, uv_height, + uv_stride, uv_stride)); + EXPECT_TRUE(CheckArrayEqual(u_count_tst, u_count_ref, uv_width, uv_height, + uv_stride, uv_stride)); + EXPECT_TRUE(CheckArrayEqual(v_accum_tst, v_accum_ref, uv_width, uv_height, + uv_stride, uv_stride)); + EXPECT_TRUE(CheckArrayEqual(v_count_tst, v_count_ref, uv_width, uv_height, + uv_stride, uv_stride)); + + if (HasFailure()) { + if (use_32x32) { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y, + filter_strength, *filter_weight); + } else { + printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x, + ss_y, filter_strength, filter_weight[0], filter_weight[1], + filter_weight[2], filter_weight[3]); + } + + PrintArrayDiff(y_accum_ref, y_accum_tst, width, height, y_stride, + y_stride); + PrintArrayDiff(y_count_ref, y_count_tst, width, height, y_stride, + y_stride); + PrintArrayDiff(u_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride, + uv_stride); + PrintArrayDiff(u_count_ref, v_count_tst, uv_width, uv_height, uv_stride, + uv_stride); + PrintArrayDiff(u_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride, + uv_stride); + PrintArrayDiff(u_count_ref, v_count_tst, uv_width, uv_height, uv_stride, + uv_stride); + + 
return; + } + } +} + +template <typename PixelType> +void YUVTemporalFilterTest::RunTestFilterWithParam(int width, int height, + int ss_x, int ss_y, + int filter_strength, + int use_32x32, + const int *filter_weight) { + PixelType y_src[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + PixelType y_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint16_t y_count[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint32_t y_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + + PixelType u_src[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + PixelType u_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint16_t u_count[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint32_t u_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + + PixelType v_src[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + PixelType v_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint16_t v_count[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + uint32_t v_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 }; + + SetArray(y_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8)); + SetArray(y_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8)); + SetArray(u_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8)); + SetArray(u_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8)); + SetArray(v_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8)); + SetArray(v_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8)); + + for (int repeats = 0; repeats < num_repeats_; repeats++) { + ApplyTestFilter(y_src, MAX_WIDTH, y_pre, MAX_WIDTH, u_src, v_src, MAX_WIDTH, + u_pre, v_pre, MAX_WIDTH, width, height, ss_x, ss_y, + filter_strength, filter_weight, use_32x32, y_accum, y_count, + u_accum, u_count, v_accum, v_count); + } +} + +TEST_P(YUVTemporalFilterTest, Use32x32) { + const int width = 32, height = 32; + const int use_32x32 = 1; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + for (int filter_weight = 0; filter_weight <= 2; filter_weight++) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 
8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); + } + ASSERT_FALSE(HasFailure()); + } + } + } + } +} + +TEST_P(YUVTemporalFilterTest, Use16x16) { + const int width = 32, height = 32; + const int use_32x32 = 0; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) { + // Set up the filter + int filter_weight[4]; + int filter_idx_cp = filter_idx; + for (int idx = 0; idx < 4; idx++) { + filter_weight[idx] = filter_idx_cp % 3; + filter_idx_cp /= 3; + } + + // Test each parameter + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } + + ASSERT_FALSE(HasFailure()); + } + } + } + } +} + +TEST_P(YUVTemporalFilterTest, SaturationTest) { + const int width = 32, height = 32; + const int use_32x32 = 1; + const int filter_weight = 1; + saturate_test_ = 1; + + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + if (use_highbd_) { + const int adjusted_strength = filter_strength + 2 * (bd_ - 8); + CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y, + adjusted_strength, use_32x32, + &filter_weight); + } else { + CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + &filter_weight); + } + + ASSERT_FALSE(HasFailure()); + } + } + } +} + +TEST_P(YUVTemporalFilterTest, DISABLED_Speed) { + const int width = 32, height 
= 32; + num_repeats_ = 1000; + + for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) { + const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3; + for (int ss_x = 0; ss_x <= 1; ss_x++) { + for (int ss_y = 0; ss_y <= 1; ss_y++) { + for (int filter_idx = 0; filter_idx < num_filter_weights; + filter_idx++) { + // Set up the filter + int filter_weight[4]; + int filter_idx_cp = filter_idx; + for (int idx = 0; idx < 4; idx++) { + filter_weight[idx] = filter_idx_cp % 3; + filter_idx_cp /= 3; + } + + // Test each parameter + for (int filter_strength = 0; filter_strength <= 6; + filter_strength += 2) { + aom_usec_timer timer; + aom_usec_timer_start(&timer); + + if (use_highbd_) { + RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } else { + RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y, + filter_strength, use_32x32, + filter_weight); + } + + aom_usec_timer_mark(&timer); + const int elapsed_time = + static_cast<int>(aom_usec_timer_elapsed(&timer)); + + printf( + "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: " + "%d, Strength: %d, Time: %5d\n", + bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength, + elapsed_time); + } + } + } + } + } +} + +INSTANTIATE_TEST_CASE_P( + C, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&av1_apply_temporal_filter_c, 8), + TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_c, 10), + TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_c, 12))); + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_CASE_P( + SSE4_1, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&av1_apply_temporal_filter_sse4_1, 8), + TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_sse4_1, 10), + TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_sse4_1, 12))); +#endif // HAVE_SSE4_1 + +} // namespace diff --git a/libaom/third_party/libwebm/AUTHORS.TXT b/libaom/third_party/libwebm/AUTHORS.TXT index 8ab6f79..9686ac1 100644 --- 
a/libaom/third_party/libwebm/AUTHORS.TXT +++ b/libaom/third_party/libwebm/AUTHORS.TXT @@ -1,4 +1,4 @@ -# Names should be added to this file like so:
-# Name or Organization <email address>
-
-Google Inc.
+# Names should be added to this file like so: +# Name or Organization <email address> + +Google Inc. diff --git a/libaom/third_party/libwebm/README.libaom b/libaom/third_party/libwebm/README.libaom index bd288d2..17b2f47 100644 --- a/libaom/third_party/libwebm/README.libaom +++ b/libaom/third_party/libwebm/README.libaom @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: af81f26025b7435fa9a14ad07c58b44cf9280430 +Version: 9f23fbc50e7a76c815b1d3f0309abe1066301331 License: BSD License File: LICENSE.txt @@ -7,8 +7,6 @@ Description: libwebm is used to handle WebM container I/O. Local Changes: -Add av1 codec as an eligible codec for webm: - https://aomedia-review.googlesource.com/c/aom/+/15103 Only keep: - Android.mk - AUTHORS.TXT diff --git a/libaom/third_party/libwebm/common/file_util.cc b/libaom/third_party/libwebm/common/file_util.cc index 618ffc0..e6109d5 100644 --- a/libaom/third_party/libwebm/common/file_util.cc +++ b/libaom/third_party/libwebm/common/file_util.cc @@ -46,7 +46,7 @@ std::string GetTempFileName() { errno_t err = tmpnam_s(tmp_file_name); #else char* fname_pointer = tmpnam(tmp_file_name); - errno_t err = (fname_pointer == &tmp_file_name[0]) ? 0 : -1; + int err = (fname_pointer == &tmp_file_name[0]) ? 
0 : -1; #endif if (err == 0) { return std::string(tmp_file_name); diff --git a/libaom/third_party/libwebm/common/webmids.h b/libaom/third_party/libwebm/common/webmids.h index 89d722a..fc0c208 100644 --- a/libaom/third_party/libwebm/common/webmids.h +++ b/libaom/third_party/libwebm/common/webmids.h @@ -93,6 +93,7 @@ enum MkvId { kMkvDisplayHeight = 0x54BA, kMkvDisplayUnit = 0x54B2, kMkvAspectRatioType = 0x54B3, + kMkvColourSpace = 0x2EB524, kMkvFrameRate = 0x2383E3, // end video // colour diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc index bae2c99..5120312 100644 --- a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -773,6 +773,14 @@ bool Track::Write(IMkvWriter* writer) const { if (!type_ || !codec_id_) return false; + // AV1 tracks require a CodecPrivate. See + // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md + // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to + // point to a stable version once it is finalized, or our own WebM mappings + // page on webmproject.org should we decide to release them. + if (!strcmp(codec_id_, Tracks::kAv1CodecId) && !codec_private_) + return false; + // |size| may be bigger than what is written out in this function because // derived classes may write out more data in the Track element. 
const uint64_t payload_size = PayloadSize(); @@ -1027,19 +1035,16 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const { !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) { return false; } - if (r_ && - !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX, - libwebm::kMkvPrimaryRChromaticityY)) { + if (r_ && !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX, + libwebm::kMkvPrimaryRChromaticityY)) { return false; } - if (g_ && - !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX, - libwebm::kMkvPrimaryGChromaticityY)) { + if (g_ && !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX, + libwebm::kMkvPrimaryGChromaticityY)) { return false; } - if (b_ && - !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX, - libwebm::kMkvPrimaryBChromaticityY)) { + if (b_ && !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX, + libwebm::kMkvPrimaryBChromaticityY)) { return false; } if (white_point_ && @@ -1421,6 +1426,7 @@ VideoTrack::VideoTrack(unsigned int* seed) stereo_mode_(0), alpha_mode_(0), width_(0), + colour_space_(NULL), colour_(NULL), projection_(NULL) {} @@ -1518,6 +1524,10 @@ bool VideoTrack::Write(IMkvWriter* writer) const { static_cast<uint64>(alpha_mode_))) return false; } + if (colour_space_) { + if (!WriteEbmlElement(writer, libwebm::kMkvColourSpace, colour_space_)) + return false; + } if (frame_rate_ > 0.0) { if (!WriteEbmlElement(writer, libwebm::kMkvFrameRate, static_cast<float>(frame_rate_))) { @@ -1542,6 +1552,22 @@ bool VideoTrack::Write(IMkvWriter* writer) const { return true; } +void VideoTrack::set_colour_space(const char* colour_space) { + if (colour_space) { + delete[] colour_space_; + + const size_t length = strlen(colour_space) + 1; + colour_space_ = new (std::nothrow) char[length]; // NOLINT + if (colour_space_) { +#ifdef _MSC_VER + strcpy_s(colour_space_, length, colour_space); +#else + strcpy(colour_space_, colour_space); +#endif + } + } +} + bool VideoTrack::SetColour(const Colour& colour) { 
std::unique_ptr<Colour> colour_ptr(new Colour()); if (!colour_ptr.get()) @@ -1625,6 +1651,8 @@ uint64_t VideoTrack::VideoPayloadSize() const { if (frame_rate_ > 0.0) size += EbmlElementSize(libwebm::kMkvFrameRate, static_cast<float>(frame_rate_)); + if (colour_space_) + size += EbmlElementSize(libwebm::kMkvColourSpace, colour_space_); if (colour_) size += colour_->ColourSize(); if (projection_) @@ -1702,10 +1730,9 @@ bool AudioTrack::Write(IMkvWriter* writer) const { const char Tracks::kOpusCodecId[] = "A_OPUS"; const char Tracks::kVorbisCodecId[] = "A_VORBIS"; +const char Tracks::kAv1CodecId[] = "V_AV1"; const char Tracks::kVp8CodecId[] = "V_VP8"; const char Tracks::kVp9CodecId[] = "V_VP9"; -const char Tracks::kVp10CodecId[] = "V_VP10"; -const char Tracks::kAV1CodecId[] = "V_AV1"; const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS"; const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS"; const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA"; @@ -4161,15 +4188,15 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) { } bool Segment::DocTypeIsWebm() const { - const int kNumCodecIds = 10; + const int kNumCodecIds = 9; // TODO(vigneshv): Tweak .clang-format. 
const char* kWebmCodecIds[kNumCodecIds] = { Tracks::kOpusCodecId, Tracks::kVorbisCodecId, - Tracks::kVp8CodecId, Tracks::kVp9CodecId, - Tracks::kVp10CodecId, Tracks::kAV1CodecId, - Tracks::kWebVttCaptionsId, Tracks::kWebVttDescriptionsId, - Tracks::kWebVttMetadataId, Tracks::kWebVttSubtitlesId}; + Tracks::kAv1CodecId, Tracks::kVp8CodecId, + Tracks::kVp9CodecId, Tracks::kWebVttCaptionsId, + Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId, + Tracks::kWebVttSubtitlesId}; const int num_tracks = static_cast<int>(tracks_.track_entries_size()); for (int track_index = 0; track_index < num_tracks; ++track_index) { diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h index 9e817bc..f2db377 100644 --- a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h +++ b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -795,6 +795,8 @@ class VideoTrack : public Track { uint64_t alpha_mode() { return alpha_mode_; } void set_width(uint64_t width) { width_ = width; } uint64_t width() const { return width_; } + void set_colour_space(const char* colour_space); + const char* colour_space() const { return colour_space_; } Colour* colour() { return colour_; } @@ -824,6 +826,7 @@ class VideoTrack : public Track { uint64_t stereo_mode_; uint64_t alpha_mode_; uint64_t width_; + char* colour_space_; Colour* colour_; Projection* projection_; @@ -871,10 +874,9 @@ class Tracks { static const char kOpusCodecId[]; static const char kVorbisCodecId[]; + static const char kAv1CodecId[]; static const char kVp8CodecId[]; static const char kVp9CodecId[]; - static const char kVp10CodecId[]; - static const char kAV1CodecId[]; static const char kWebVttCaptionsId[]; static const char kWebVttDescriptionsId[]; static const char kWebVttMetadataId[]; diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 355d4e2..3bff7cd 100644 --- a/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ 
b/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -136,9 +136,8 @@ uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode, return false; } - if (!frame->is_key() && - !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock, - reference_block_timestamp)) { + if (!frame->is_key() && !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock, + reference_block_timestamp)) { return false; } diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc index 84655d8..d668384 100644 --- a/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc +++ b/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc @@ -78,6 +78,8 @@ int32 MkvWriter::Position(int64 position) { #ifdef _MSC_VER return _fseeki64(file_, position, SEEK_SET); +#elif defined(_WIN32) + return fseeko64(file_, static_cast<off_t>(position), SEEK_SET); #else return fseeko(file_, static_cast<off_t>(position), SEEK_SET); #endif diff --git a/libaom/third_party/libwebm/mkvparser/mkvparser.cc b/libaom/third_party/libwebm/mkvparser/mkvparser.cc index e7b76f7..9c78ead 100644 --- a/libaom/third_party/libwebm/mkvparser/mkvparser.cc +++ b/libaom/third_party/libwebm/mkvparser/mkvparser.cc @@ -36,8 +36,6 @@ inline bool isnan(double val) { return std::isnan(val); } inline bool isinf(double val) { return std::isinf(val); } #endif // MSC_COMPAT -IMkvReader::~IMkvReader() {} - template <typename Type> Type* SafeArrayAlloc(unsigned long long num_elements, unsigned long long element_size) { @@ -5274,6 +5272,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, VideoTrack::VideoTrack(Segment* pSegment, long long element_start, long long element_size) : Track(pSegment, element_start, element_size), + m_colour_space(NULL), m_colour(NULL), m_projection(NULL) {} @@ -5299,6 +5298,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, long long stereo_mode = 0; double rate = 0.0; + char* colour_space = NULL; IMkvReader* const pReader = 
pSegment->m_pReader; @@ -5312,7 +5312,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, const long long stop = pos + s.size; Colour* colour = NULL; - Projection* projection = NULL; + std::unique_ptr<Projection> projection_ptr; while (pos < stop) { long long id, size; @@ -5364,8 +5364,16 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, if (!Colour::Parse(pReader, pos, size, &colour)) return E_FILE_FORMAT_INVALID; } else if (id == libwebm::kMkvProjection) { - if (!Projection::Parse(pReader, pos, size, &projection)) + Projection* projection = NULL; + if (!Projection::Parse(pReader, pos, size, &projection)) { return E_FILE_FORMAT_INVALID; + } else { + projection_ptr.reset(projection); + } + } else if (id == libwebm::kMkvColourSpace) { + const long status = UnserializeString(pReader, pos, size, colour_space); + if (status < 0) + return status; } pos += size; // consume payload @@ -5397,7 +5405,8 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, pTrack->m_stereo_mode = stereo_mode; pTrack->m_rate = rate; pTrack->m_colour = colour; - pTrack->m_projection = projection; + pTrack->m_colour_space = colour_space; + pTrack->m_projection = projection_ptr.release(); pResult = pTrack; return 0; // success diff --git a/libaom/third_party/libwebm/mkvparser/mkvparser.h b/libaom/third_party/libwebm/mkvparser/mkvparser.h index 26c2b7e..848d01f 100644 --- a/libaom/third_party/libwebm/mkvparser/mkvparser.h +++ b/libaom/third_party/libwebm/mkvparser/mkvparser.h @@ -22,7 +22,7 @@ class IMkvReader { virtual int Length(long long* total, long long* available) = 0; protected: - virtual ~IMkvReader(); + virtual ~IMkvReader() {} }; template <typename Type> @@ -527,6 +527,8 @@ class VideoTrack : public Track { Projection* GetProjection() const; + const char* GetColourSpace() const { return m_colour_space; } + private: long long m_width; long long m_height; @@ -534,7 +536,7 @@ class VideoTrack : public Track { long long m_display_height; long long 
m_display_unit; long long m_stereo_mode; - + char* m_colour_space; double m_rate; Colour* m_colour; diff --git a/libaom/third_party/libwebm/mkvparser/mkvreader.cc b/libaom/third_party/libwebm/mkvparser/mkvreader.cc index 23d68f5..9d19c1b 100644 --- a/libaom/third_party/libwebm/mkvparser/mkvreader.cc +++ b/libaom/third_party/libwebm/mkvparser/mkvreader.cc @@ -118,6 +118,8 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) { if (status) return -1; // error +#elif defined(_WIN32) + fseeko64(m_file, static_cast<off_t>(offset), SEEK_SET); #else fseeko(m_file, static_cast<off_t>(offset), SEEK_SET); #endif diff --git a/libaom/tools/txfm_analyzer/txfm_graph.h b/libaom/tools/txfm_analyzer/txfm_graph.h index 2e3c955..8dc3614 100644 --- a/libaom/tools/txfm_analyzer/txfm_graph.h +++ b/libaom/tools/txfm_analyzer/txfm_graph.h @@ -23,7 +23,6 @@ struct Node { int visited; }; -#define PI (3.141592653589793238462643383279502884) #define STAGENUM (10) #define NODENUM (32) #define COS_MOD (128) |