author     android-build-team Robot <android-build-team-robot@google.com>  2019-05-11 23:20:42 +0000
committer  android-build-team Robot <android-build-team-robot@google.com>  2019-05-11 23:20:42 +0000
commit     ae9e769d90104bb1a680601e15c9befdd26e21cf (patch)
tree       3f75dbf1ca3dc3455647615cde513037d4d06e0e
parent     6908555f3ced722d3f7badd753b3813e13526c8b (diff)
parent     22776ab2e71269213c6206f19e4b5d04a3384164 (diff)
download   platform_external_libaom-android-10.0.0_r2.tar.gz
           platform_external_libaom-android-10.0.0_r2.tar.bz2
           platform_external_libaom-android-10.0.0_r2.zip
Change-Id: Ibdbdbc94e55e035ddd9afffe44c26fc7fde20a98
-rw-r--r--  Android.bp | 13
-rw-r--r--  config/arm/config/aom_config.asm | 12
-rw-r--r--  config/arm/config/aom_config.h | 12
-rw-r--r--  config/arm/config/aom_dsp_rtcd.h | 4
-rw-r--r--  config/arm/config/aom_scale_rtcd.h | 3
-rw-r--r--  config/arm/config/av1_rtcd.h | 82
-rw-r--r--  config/arm64/config/aom_config.asm | 12
-rw-r--r--  config/arm64/config/aom_config.h | 12
-rw-r--r--  config/arm64/config/aom_dsp_rtcd.h | 4
-rw-r--r--  config/arm64/config/aom_scale_rtcd.h | 3
-rw-r--r--  config/arm64/config/av1_rtcd.h | 82
-rw-r--r--  config/x86/config/aom_config.asm | 11
-rw-r--r--  config/x86/config/aom_config.h | 12
-rw-r--r--  config/x86/config/aom_dsp_rtcd.h | 3
-rw-r--r--  config/x86/config/aom_scale_rtcd.h | 3
-rw-r--r--  config/x86/config/av1_rtcd.h | 83
-rw-r--r--  config/x86_64/config/aom_config.asm | 11
-rw-r--r--  config/x86_64/config/aom_config.h | 12
-rw-r--r--  config/x86_64/config/aom_dsp_rtcd.h | 3
-rw-r--r--  config/x86_64/config/aom_scale_rtcd.h | 3
-rw-r--r--  config/x86_64/config/av1_rtcd.h | 83
-rw-r--r--  libaom/CMakeLists.txt | 5
-rw-r--r--  libaom/PATENTS | 14
-rw-r--r--  libaom/aom/aom_encoder.h | 8
-rw-r--r--  libaom/aom/aom_frame_buffer.h | 6
-rw-r--r--  libaom/aom/aomcx.h | 461
-rw-r--r--  libaom/aom_dsp/add_noise.c | 2
-rw-r--r--  libaom/aom_dsp/aom_dsp.cmake | 4
-rwxr-xr-x  libaom/aom_dsp/aom_dsp_rtcd_defs.pl | 186
-rw-r--r--  libaom/aom_dsp/avg.c | 79
-rw-r--r--  libaom/aom_dsp/bitreader_buffer.c | 4
-rw-r--r--  libaom/aom_dsp/grain_synthesis.c | 3
-rw-r--r--  libaom/aom_dsp/grain_synthesis.h | 70
-rw-r--r--  libaom/aom_dsp/noise_model.h | 8
-rw-r--r--  libaom/aom_dsp/prob.h | 2
-rw-r--r--  libaom/aom_dsp/quantize.c | 254
-rw-r--r--  libaom/aom_dsp/quantize.h | 60
-rw-r--r--  libaom/aom_dsp/sad.c | 17
-rw-r--r--  libaom/aom_dsp/variance.c | 336
-rw-r--r--  libaom/aom_dsp/variance.h | 15
-rw-r--r--  libaom/aom_dsp/x86/adaptive_quantize_sse2.c | 421
-rw-r--r--  libaom/aom_dsp/x86/avg_intrin_sse2.c | 123
-rw-r--r--  libaom/aom_dsp/x86/convolve_avx2.h | 259
-rw-r--r--  libaom/aom_dsp/x86/convolve_sse2.h | 4
-rw-r--r--  libaom/aom_dsp/x86/convolve_sse4_1.h | 4
-rw-r--r--  libaom/aom_dsp/x86/fft_avx2.c | 1
-rw-r--r--  libaom/aom_dsp/x86/fft_sse2.c | 1
-rw-r--r--  libaom/aom_dsp/x86/highbd_loopfilter_sse2.c | 4
-rw-r--r--  libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c | 58
-rw-r--r--  libaom/aom_dsp/x86/highbd_variance_sse2.c | 192
-rw-r--r--  libaom/aom_dsp/x86/intrapred_asm_sse2.asm | 17
-rw-r--r--  libaom/aom_dsp/x86/intrapred_avx2.c | 2656
-rw-r--r--  libaom/aom_dsp/x86/jnt_sad_ssse3.c | 64
-rw-r--r--  libaom/aom_dsp/x86/jnt_variance_ssse3.c | 102
-rw-r--r--  libaom/aom_dsp/x86/loopfilter_sse2.c | 378
-rw-r--r--  libaom/aom_dsp/x86/lpf_common_sse2.h | 280
-rw-r--r--  libaom/aom_dsp/x86/quantize_sse2.c | 22
-rw-r--r--  libaom/aom_dsp/x86/quantize_ssse3.c | 192
-rw-r--r--  libaom/aom_dsp/x86/quantize_x86.h | 68
-rw-r--r--  libaom/aom_dsp/x86/sse_avx2.c | 48
-rw-r--r--  libaom/aom_dsp/x86/txfm_common_avx2.h | 90
-rw-r--r--  libaom/aom_dsp/x86/variance_sse2.c | 2
-rw-r--r--  libaom/aom_ports/mem.h | 30
-rw-r--r--  libaom/aom_ports/x86.h | 60
-rw-r--r--  libaom/aom_scale/aom_scale.cmake | 4
-rw-r--r--  libaom/aom_scale/aom_scale_rtcd.pl | 2
-rw-r--r--  libaom/aom_scale/generic/yv12config.c | 143
-rw-r--r--  libaom/aom_scale/generic/yv12extend.c | 25
-rw-r--r--  libaom/aom_scale/yv12config.h | 21
-rw-r--r--  libaom/apps/aomdec.c | 11
-rw-r--r--  libaom/apps/aomenc.c | 347
-rw-r--r--  libaom/av1/av1.cmake | 41
-rw-r--r--  libaom/av1/av1_cx_iface.c | 624
-rw-r--r--  libaom/av1/av1_dx_iface.c | 124
-rw-r--r--  libaom/av1/av1_iface_common.h | 7
-rw-r--r--  libaom/av1/common/alloccommon.c | 2
-rw-r--r--  libaom/av1/common/arm/av1_txfm_neon.c | 2
-rw-r--r--  libaom/av1/common/arm/jnt_convolve_neon.c | 169
-rw-r--r--  libaom/av1/common/arm/warp_plane_neon.c | 4
-rw-r--r--  libaom/av1/common/av1_inv_txfm2d.c | 4
-rw-r--r--  libaom/av1/common/av1_loopfilter.c | 8
-rw-r--r-- [-rwxr-xr-x]  libaom/av1/common/av1_rtcd_defs.pl | 92
-rw-r--r--  libaom/av1/common/av1_txfm.c | 1
-rw-r--r--  libaom/av1/common/av1_txfm.h | 6
-rw-r--r--  libaom/av1/common/blockd.h | 128
-rw-r--r--  libaom/av1/common/cdef.c | 1
-rw-r--r--  libaom/av1/common/cdef_block.c | 4
-rw-r--r--  libaom/av1/common/cdef_block.h | 1
-rw-r--r--  libaom/av1/common/cfl.c | 11
-rw-r--r--  libaom/av1/common/convolve.c | 109
-rw-r--r--  libaom/av1/common/convolve.h | 2
-rw-r--r--  libaom/av1/common/debugmodes.c | 6
-rw-r--r--  libaom/av1/common/entropy.c | 2
-rw-r--r--  libaom/av1/common/entropy.h | 4
-rw-r--r--  libaom/av1/common/entropymode.c | 35
-rw-r--r--  libaom/av1/common/entropymode.h | 3
-rw-r--r--  libaom/av1/common/entropymv.h | 12
-rw-r--r--  libaom/av1/common/enums.h | 164
-rw-r--r--  libaom/av1/common/filter.h | 4
-rw-r--r--  libaom/av1/common/idct.c | 2
-rw-r--r--  libaom/av1/common/mv.h | 10
-rw-r--r--  libaom/av1/common/mvref_common.c | 94
-rw-r--r--  libaom/av1/common/mvref_common.h | 18
-rw-r--r--  libaom/av1/common/onyxc_int.h | 293
-rw-r--r--  libaom/av1/common/pred_common.h | 24
-rw-r--r--  libaom/av1/common/reconinter.c | 134
-rw-r--r--  libaom/av1/common/reconinter.h | 29
-rw-r--r--  libaom/av1/common/reconintra.c | 4
-rw-r--r--  libaom/av1/common/restoration.c | 2
-rw-r--r--  libaom/av1/common/restoration.h | 2
-rw-r--r--  libaom/av1/common/scale.c | 16
-rw-r--r--  libaom/av1/common/scan.h | 4
-rw-r--r--  libaom/av1/common/seg_common.h | 4
-rw-r--r--  libaom/av1/common/tile_common.c | 70
-rw-r--r--  libaom/av1/common/tile_common.h | 11
-rw-r--r--  libaom/av1/common/txb_common.c | 17
-rw-r--r--  libaom/av1/common/txb_common.h | 17
-rw-r--r--  libaom/av1/common/warped_motion.c | 8
-rw-r--r--  libaom/av1/common/x86/av1_convolve_scale_sse4.c | 8
-rw-r--r--  libaom/av1/common/x86/av1_inv_txfm_ssse3.c | 14
-rw-r--r--  libaom/av1/common/x86/av1_inv_txfm_ssse3.h | 4
-rw-r--r--  libaom/av1/common/x86/av1_txfm_sse4.c | 2
-rw-r--r--  libaom/av1/common/x86/convolve_2d_avx2.c | 140
-rw-r--r--  libaom/av1/common/x86/convolve_2d_sse2.c | 31
-rw-r--r--  libaom/av1/common/x86/convolve_avx2.c | 516
-rw-r--r--  libaom/av1/common/x86/highbd_convolve_2d_avx2.c | 4
-rw-r--r--  libaom/av1/common/x86/highbd_convolve_2d_sse4.c | 37
-rw-r--r--  libaom/av1/common/x86/highbd_inv_txfm_avx2.c | 236
-rw-r--r--  libaom/av1/common/x86/highbd_inv_txfm_sse4.c | 809
-rw-r--r--  libaom/av1/common/x86/highbd_jnt_convolve_avx2.c | 83
-rw-r--r--  libaom/av1/common/x86/highbd_jnt_convolve_sse4.c | 38
-rw-r--r--  libaom/av1/common/x86/highbd_warp_plane_sse4.c | 4
-rw-r--r--  libaom/av1/common/x86/jnt_convolve_avx2.c | 778
-rw-r--r--  libaom/av1/common/x86/jnt_convolve_sse2.c | 56
-rw-r--r--  libaom/av1/common/x86/jnt_convolve_ssse3.c | 15
-rw-r--r--  libaom/av1/common/x86/warp_plane_sse4.c | 4
-rw-r--r--  libaom/av1/common/x86/wiener_convolve_avx2.c | 398
-rw-r--r--  libaom/av1/decoder/decodeframe.c | 471
-rw-r--r--  libaom/av1/decoder/decodemv.c | 34
-rw-r--r--  libaom/av1/decoder/decoder.c | 91
-rw-r--r--  libaom/av1/decoder/decoder.h | 29
-rw-r--r--  libaom/av1/decoder/decodetxb.c | 42
-rw-r--r--  libaom/av1/decoder/inspection.c | 11
-rw-r--r--  libaom/av1/decoder/inspection.h | 5
-rw-r--r--  libaom/av1/decoder/obu.c | 37
-rw-r--r--  libaom/av1/encoder/aq_cyclicrefresh.c | 187
-rw-r--r--  libaom/av1/encoder/aq_cyclicrefresh.h | 7
-rw-r--r--  libaom/av1/encoder/aq_variance.c | 4
-rw-r--r--  libaom/av1/encoder/av1_multi_thread.c | 9
-rw-r--r--  libaom/av1/encoder/av1_quantize.c | 231
-rw-r--r--  libaom/av1/encoder/av1_quantize.h | 4
-rw-r--r--  libaom/av1/encoder/bitstream.c | 588
-rw-r--r--  libaom/av1/encoder/bitstream.h | 12
-rw-r--r--  libaom/av1/encoder/block.h | 60
-rw-r--r--  libaom/av1/encoder/context_tree.h | 37
-rw-r--r--  libaom/av1/encoder/cost.h | 4
-rw-r--r--  libaom/av1/encoder/encode_strategy.c | 1173
-rw-r--r--  libaom/av1/encoder/encode_strategy.h | 46
-rw-r--r--  libaom/av1/encoder/encodeframe.c | 2683
-rw-r--r--  libaom/av1/encoder/encodemb.c | 66
-rw-r--r--  libaom/av1/encoder/encodemb.h | 4
-rw-r--r--  libaom/av1/encoder/encoder.c | 4117
-rw-r--r--  libaom/av1/encoder/encoder.h | 621
-rw-r--r--  libaom/av1/encoder/encodetxb.c | 385
-rw-r--r--  libaom/av1/encoder/encodetxb.h | 3
-rw-r--r--  libaom/av1/encoder/ethread.c | 109
-rw-r--r--  libaom/av1/encoder/firstpass.c | 2666
-rw-r--r--  libaom/av1/encoder/firstpass.h | 137
-rw-r--r--  libaom/av1/encoder/global_motion.c | 261
-rw-r--r--  libaom/av1/encoder/global_motion.h | 6
-rw-r--r--  libaom/av1/encoder/gop_structure.c | 192
-rw-r--r--  libaom/av1/encoder/gop_structure.h | 36
-rw-r--r--  libaom/av1/encoder/hash_motion.c | 6
-rw-r--r--  libaom/av1/encoder/hash_motion.h | 3
-rw-r--r--  libaom/av1/encoder/level.c | 647
-rw-r--r--  libaom/av1/encoder/level.h | 81
-rw-r--r--  libaom/av1/encoder/lookahead.c | 20
-rw-r--r--  libaom/av1/encoder/lookahead.h | 3
-rw-r--r--  libaom/av1/encoder/mbgraph.c | 6
-rw-r--r--  libaom/av1/encoder/mcomp.c | 383
-rw-r--r--  libaom/av1/encoder/mcomp.h | 25
-rw-r--r--  libaom/av1/encoder/mips/msa/temporal_filter_msa.c | 1
-rw-r--r--  libaom/av1/encoder/ml.c | 4
-rw-r--r--  libaom/av1/encoder/partition_model_weights.h | 2273
-rw-r--r--  libaom/av1/encoder/partition_strategy.c | 727
-rw-r--r--  libaom/av1/encoder/partition_strategy.h | 154
-rw-r--r--  libaom/av1/encoder/pass2_strategy.c | 1787
-rw-r--r--  libaom/av1/encoder/pass2_strategy.h | 34
-rw-r--r--  libaom/av1/encoder/picklpf.c | 36
-rw-r--r--  libaom/av1/encoder/pickrst.c | 44
-rw-r--r--  libaom/av1/encoder/ratectrl.c | 608
-rw-r--r--  libaom/av1/encoder/ratectrl.h | 89
-rw-r--r--  libaom/av1/encoder/rd.c | 741
-rw-r--r--  libaom/av1/encoder/rd.h | 275
-rw-r--r--  libaom/av1/encoder/rdopt.c | 4842
-rw-r--r--  libaom/av1/encoder/rdopt.h | 19
-rw-r--r--  libaom/av1/encoder/reconinter_enc.c | 121
-rw-r--r--  libaom/av1/encoder/reconinter_enc.h | 21
-rw-r--r--  libaom/av1/encoder/speed_features.c | 520
-rw-r--r--  libaom/av1/encoder/speed_features.h | 242
-rw-r--r--  libaom/av1/encoder/temporal_filter.c | 594
-rw-r--r--  libaom/av1/encoder/temporal_filter.h | 12
-rw-r--r--  libaom/av1/encoder/tokenize.h | 4
-rw-r--r--  libaom/av1/encoder/tpl_model.c | 592
-rw-r--r--  libaom/av1/encoder/tpl_model.h | 26
-rw-r--r--  libaom/av1/encoder/var_based_part.c | 778
-rw-r--r--  libaom/av1/encoder/var_based_part.h | 37
-rw-r--r--  libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c | 798
-rw-r--r--  libaom/av1/encoder/x86/corner_match_avx2.c | 79
-rw-r--r--  libaom/av1/encoder/x86/encodetxb_avx2.c | 8
-rw-r--r--  libaom/av1/encoder/x86/encodetxb_sse4.c | 8
-rw-r--r--  libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c | 62
-rw-r--r--  libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c | 3170
-rw-r--r--  libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c | 954
-rw-r--r--  libaom/av1/encoder/x86/pickrst_avx2.c | 12
-rw-r--r--  libaom/av1/encoder/x86/pickrst_sse4.c | 12
-rw-r--r--  libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm | 217
-rw-r--r--  libaom/av1/encoder/x86/temporal_filter_constants.h | 401
-rw-r--r--  libaom/av1/encoder/x86/temporal_filter_sse4.c | 1006
-rw-r--r--  libaom/build/cmake/aom_config_defaults.cmake | 25
-rw-r--r--  libaom/build/cmake/aom_experiment_deps.cmake | 4
-rw-r--r--  libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake | 3
-rw-r--r--  libaom/build/cmake/toolchains/armv7-linux-gcc.cmake | 9
-rw-r--r--  libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake | 3
-rw-r--r--  libaom/build/cmake/toolchains/x86-mingw-gcc.cmake | 3
-rw-r--r--  libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake | 3
-rw-r--r--  libaom/common/av1_config.c | 4
-rw-r--r--  libaom/common/rawenc.c | 88
-rw-r--r--  libaom/common/tools_common.c | 5
-rw-r--r--  libaom/common/tools_common.h | 9
-rw-r--r--  libaom/common/video_reader.c | 4
-rw-r--r--  libaom/common/video_reader.h | 3
-rw-r--r--  libaom/common/video_writer.c | 4
-rw-r--r--  libaom/common/video_writer.h | 4
-rw-r--r--  libaom/common/webmenc.h | 4
-rw-r--r--  libaom/examples/analyzer.cc | 2
-rw-r--r--  libaom/examples/av1_dec_fuzzer.cc | 72
-rwxr-xr-x  libaom/examples/build_av1_dec_fuzzer.sh | 78
-rw-r--r--  libaom/examples/inspect.c | 134
-rw-r--r--  libaom/examples/lightfield_bitstream_parsing.c | 2
-rw-r--r--  libaom/examples/lightfield_decoder.c | 8
-rw-r--r--  libaom/examples/lightfield_encoder.c | 10
-rw-r--r--  libaom/examples/lightfield_tile_list_decoder.c | 2
-rw-r--r--  libaom/test/av1_convolve_2d_test.cc | 99
-rw-r--r--  libaom/test/av1_convolve_2d_test_util.cc | 55
-rw-r--r--  libaom/test/av1_convolve_scale_test.cc | 12
-rw-r--r--  libaom/test/av1_fwd_txfm2d_test.cc | 73
-rw-r--r--  libaom/test/av1_highbd_iht_test.cc | 3
-rw-r--r--  libaom/test/av1_round_shift_array_test.cc | 2
-rw-r--r--  libaom/test/av1_txfm_test.h | 4
-rw-r--r--  libaom/test/comp_avg_pred_test.cc | 52
-rw-r--r--  libaom/test/comp_avg_pred_test.h | 244
-rw-r--r--  libaom/test/corner_match_test.cc | 68
-rw-r--r--  libaom/test/dr_prediction_test.cc | 101
-rw-r--r--  libaom/test/edge_detect_test.cc | 11
-rw-r--r--  libaom/test/encode_api_test.cc | 2
-rw-r--r--  libaom/test/end_to_end_test.cc | 7
-rw-r--r--  libaom/test/error_block_test.cc | 77
-rw-r--r--  libaom/test/external_frame_buffer_test.cc | 22
-rw-r--r--  libaom/test/fwd_kf_test.cc | 110
-rw-r--r--  libaom/test/gf_max_pyr_height_test.cc | 115
-rw-r--r--  libaom/test/hiprec_convolve_test_util.cc | 4
-rw-r--r--  libaom/test/horz_superres_test.cc | 178
-rw-r--r--  libaom/test/level_test.cc | 108
-rw-r--r--  libaom/test/quantize_func_test.cc | 71
-rw-r--r--  libaom/test/resize_test.cc | 3
-rw-r--r--  libaom/test/rt_end_to_end_test.cc | 141
-rw-r--r--  libaom/test/sad_test.cc | 265
-rw-r--r--  libaom/test/sum_squares_test.cc | 52
-rw-r--r--  libaom/test/test-data.sha1 | 6
-rw-r--r--  libaom/test/test.cmake | 35
-rw-r--r--  libaom/test/test_data_util.cmake | 6
-rw-r--r--  libaom/test/test_vectors.cc | 356
-rw-r--r--  libaom/test/variance_test.cc | 187
-rw-r--r--  libaom/test/warp_filter_test_util.cc | 20
-rw-r--r--  libaom/test/yuv_temporal_filter_test.cc | 726
-rw-r--r--  libaom/third_party/libwebm/AUTHORS.TXT | 8
-rw-r--r--  libaom/third_party/libwebm/README.libaom | 4
-rw-r--r--  libaom/third_party/libwebm/common/file_util.cc | 2
-rw-r--r--  libaom/third_party/libwebm/common/webmids.h | 1
-rw-r--r--  libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc | 59
-rw-r--r--  libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h | 6
-rw-r--r--  libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc | 5
-rw-r--r--  libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc | 2
-rw-r--r--  libaom/third_party/libwebm/mkvparser/mkvparser.cc | 19
-rw-r--r--  libaom/third_party/libwebm/mkvparser/mkvparser.h | 6
-rw-r--r--  libaom/third_party/libwebm/mkvparser/mkvreader.cc | 2
-rw-r--r--  libaom/tools/txfm_analyzer/txfm_graph.h | 1
288 files changed, 37114 insertions, 16681 deletions
diff --git a/Android.bp b/Android.bp
index f375775..c722d3a 100644
--- a/Android.bp
+++ b/Android.bp
@@ -122,7 +122,6 @@ aom_av1_decoder_sources = [
aom_av1_encoder_asm_sse2 = [
"libaom/av1/encoder/x86/dct_sse2.asm",
"libaom/av1/encoder/x86/error_sse2.asm",
- "libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm",
]
aom_av1_encoder_asm_ssse3_x86_64 = [
@@ -132,8 +131,11 @@ aom_av1_encoder_asm_ssse3_x86_64 = [
aom_av1_encoder_intrin_avx2 = [
"libaom/av1/encoder/x86/av1_quantize_avx2.c",
"libaom/av1/encoder/x86/av1_highbd_quantize_avx2.c",
+ "libaom/av1/encoder/x86/corner_match_avx2.c",
"libaom/av1/encoder/x86/error_intrin_avx2.c",
+ "libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c",
"libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c",
+ "libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c",
"libaom/av1/encoder/x86/wedge_utils_avx2.c",
"libaom/av1/encoder/x86/encodetxb_avx2.c",
"libaom/av1/encoder/x86/rdopt_avx2.c",
@@ -170,6 +172,8 @@ aom_av1_encoder_intrin_sse4_1 = [
"libaom/av1/encoder/x86/encodetxb_sse4.c",
"libaom/av1/encoder/x86/highbd_fwd_txfm_sse4.c",
"libaom/av1/encoder/x86/rdopt_sse4.c",
+ "libaom/av1/encoder/x86/temporal_filter_sse4.c",
+ "libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c",
"libaom/av1/encoder/x86/pickrst_sse4.c",
]
@@ -194,20 +198,25 @@ aom_av1_encoder_sources = [
"libaom/av1/encoder/encodeframe.c",
"libaom/av1/encoder/encodemb.c",
"libaom/av1/encoder/encodemv.c",
+ "libaom/av1/encoder/encode_strategy.c",
"libaom/av1/encoder/encoder.c",
"libaom/av1/encoder/encodetxb.c",
"libaom/av1/encoder/ethread.c",
"libaom/av1/encoder/extend.c",
"libaom/av1/encoder/firstpass.c",
"libaom/av1/encoder/global_motion.c",
+ "libaom/av1/encoder/gop_structure.c",
"libaom/av1/encoder/hash.c",
"libaom/av1/encoder/hash_motion.c",
"libaom/av1/encoder/hybrid_fwd_txfm.c",
+ "libaom/av1/encoder/level.c",
"libaom/av1/encoder/lookahead.c",
"libaom/av1/encoder/mbgraph.c",
"libaom/av1/encoder/mcomp.c",
"libaom/av1/encoder/ml.c",
"libaom/av1/encoder/palette.c",
+ "libaom/av1/encoder/partition_strategy.c",
+ "libaom/av1/encoder/pass2_strategy.c",
"libaom/av1/encoder/pickcdef.c",
"libaom/av1/encoder/picklpf.c",
"libaom/av1/encoder/pickrst.c",
@@ -220,7 +229,9 @@ aom_av1_encoder_sources = [
"libaom/av1/encoder/speed_features.c",
"libaom/av1/encoder/temporal_filter.c",
"libaom/av1/encoder/tokenize.c",
+ "libaom/av1/encoder/tpl_model.c",
"libaom/av1/encoder/wedge_utils.c",
+ "libaom/av1/encoder/var_based_part.c",
"libaom/third_party/fastfeat/fast.c",
"libaom/third_party/fastfeat/fast_9.c",
"libaom/third_party/fastfeat/nonmax.c",
diff --git a/config/arm/config/aom_config.asm b/config/arm/config/aom_config.asm
index b8fcd42..50338c1 100644
--- a/config/arm/config/aom_config.asm
+++ b/config/arm/config/aom_config.asm
@@ -13,7 +13,8 @@ ARCH_MIPS equ 0
ARCH_PPC equ 0
ARCH_X86 equ 0
ARCH_X86_64 equ 0
-CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_2PASS_PARTITION_SEARCH_LVL_END equ 3
+CONFIG_2PASS_PARTITION_SEARCH_LVL_START equ 1
CONFIG_ACCOUNTING equ 0
CONFIG_ANALYZER equ 0
CONFIG_AV1_DECODER equ 1
@@ -21,7 +22,8 @@ CONFIG_AV1_ENCODER equ 0
CONFIG_BIG_ENDIAN equ 0
CONFIG_BITSTREAM_DEBUG equ 0
CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
+CONFIG_COLLECT_COMPONENT_TIMING equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
CONFIG_COLLECT_RD_STATS equ 0
CONFIG_DEBUG equ 0
CONFIG_DENOISE equ 1
@@ -29,11 +31,8 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
CONFIG_DIST_8X8 equ 0
CONFIG_ENTROPY_STATS equ 0
CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
-CONFIG_FP_MB_STATS equ 0
CONFIG_GCC equ 1
CONFIG_GCOV equ 0
-CONFIG_GLOBAL_MOTION_SEARCH equ 1
CONFIG_GPROF equ 0
CONFIG_INSPECTION equ 0
CONFIG_INTERNAL_STATS equ 0
@@ -44,16 +43,15 @@ CONFIG_MAX_DECODE_PROFILE equ 0
CONFIG_MISMATCH_DEBUG equ 0
CONFIG_MULTITHREAD equ 1
CONFIG_NORMAL_TILE_MODE equ 1
-CONFIG_ONE_PASS_SVM equ 0
CONFIG_OS_SUPPORT equ 1
CONFIG_PIC equ 0
CONFIG_RD_DEBUG equ 0
-CONFIG_REDUCED_ENCODER_BORDER equ 0
CONFIG_RUNTIME_CPU_DETECT equ 0
CONFIG_SHARED equ 0
CONFIG_SHARP_SETTINGS equ 0
CONFIG_SIZE_LIMIT equ 1
CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_SPEED_STATS equ 0
CONFIG_STATIC equ 1
CONFIG_WEBM_IO equ 1
DECODE_HEIGHT_LIMIT equ 16384
diff --git a/config/arm/config/aom_config.h b/config/arm/config/aom_config.h
index 5418985..a3b86df 100644
--- a/config/arm/config/aom_config.h
+++ b/config/arm/config/aom_config.h
@@ -15,7 +15,8 @@
#define ARCH_PPC 0
#define ARCH_X86 0
#define ARCH_X86_64 0
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
#define CONFIG_ACCOUNTING 0
#define CONFIG_ANALYZER 0
#define CONFIG_AV1_DECODER 1
@@ -23,7 +24,8 @@
#define CONFIG_BIG_ENDIAN 0
#define CONFIG_BITSTREAM_DEBUG 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
#define CONFIG_COLLECT_RD_STATS 0
#define CONFIG_DEBUG 0
#define CONFIG_DENOISE 1
@@ -31,11 +33,8 @@
#define CONFIG_DIST_8X8 0
#define CONFIG_ENTROPY_STATS 0
#define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
-#define CONFIG_FP_MB_STATS 0
#define CONFIG_GCC 1
#define CONFIG_GCOV 0
-#define CONFIG_GLOBAL_MOTION_SEARCH 1
#define CONFIG_GPROF 0
#define CONFIG_INSPECTION 0
#define CONFIG_INTERNAL_STATS 0
@@ -46,16 +45,15 @@
#define CONFIG_MISMATCH_DEBUG 0
#define CONFIG_MULTITHREAD 1
#define CONFIG_NORMAL_TILE_MODE 1
-#define CONFIG_ONE_PASS_SVM 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_PIC 0
#define CONFIG_RD_DEBUG 0
-#define CONFIG_REDUCED_ENCODER_BORDER 0
#define CONFIG_RUNTIME_CPU_DETECT 0
#define CONFIG_SHARED 0
#define CONFIG_SHARP_SETTINGS 0
#define CONFIG_SIZE_LIMIT 1
#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_SPEED_STATS 0
#define CONFIG_STATIC 1
#define CONFIG_WEBM_IO 1
#define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/arm/config/aom_dsp_rtcd.h b/config/arm/config/aom_dsp_rtcd.h
index e3150f7..0b1a28a 100644
--- a/config/arm/config/aom_dsp_rtcd.h
+++ b/config/arm/config/aom_dsp_rtcd.h
@@ -1400,10 +1400,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define aom_v_predictor_8x8 aom_v_predictor_8x8_neon
-void av1_round_shift_array_c(int32_t *arr, int size, int bit);
-void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
-#define av1_round_shift_array av1_round_shift_array_neon
-
void aom_dsp_rtcd(void);
#include "config/aom_config.h"
diff --git a/config/arm/config/aom_scale_rtcd.h b/config/arm/config/aom_scale_rtcd.h
index 7260bd3..067ddb4 100644
--- a/config/arm/config/aom_scale_rtcd.h
+++ b/config/arm/config/aom_scale_rtcd.h
@@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes);
+#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
+
void aom_scale_rtcd(void);
#include "config/aom_config.h"
diff --git a/config/arm/config/av1_rtcd.h b/config/arm/config/av1_rtcd.h
index c58e511..6f42666 100644
--- a/config/arm/config/av1_rtcd.h
+++ b/config/arm/config/av1_rtcd.h
@@ -89,6 +89,22 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
#define av1_convolve_y_sr av1_convolve_y_sr_neon
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_neon
+
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_neon
+
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_neon
+
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_neon
+
void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
@@ -140,6 +156,18 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *d
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
+
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
+
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
+
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
+
void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
@@ -152,27 +180,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
-void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c
-
-void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c
-
void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
-void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c
-
-void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c
-
-void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c
-
-void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c
-
void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
@@ -182,12 +192,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int
void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
-void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c
-
-void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c
-
void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
@@ -200,18 +204,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c
-
-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c
-
-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c
-
-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c
-
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_highbd_warp_affine av1_highbd_warp_affine_c
@@ -279,21 +271,9 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con
void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_inv_txfm_add av1_inv_txfm_add_neon
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d av1_jnt_convolve_2d_neon
-
-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_neon
-
-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_x av1_jnt_convolve_x_neon
-
-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_y av1_jnt_convolve_y_neon
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
+#define av1_round_shift_array av1_round_shift_array_neon
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
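
Note: these generated RTCD headers follow one pattern throughout this diff: each generic symbol is #defined either to an arch-specific implementation (the _neon mappings above; this config sets CONFIG_RUNTIME_CPU_DETECT to 0, so dispatch is fixed at compile time) or to its _c fallback. A minimal call-site sketch in C, using the av1_round_shift_array prototype shown above; the buffer contents and shift amount are illustrative, not taken from this diff:

#include <stdint.h>
#include "config/av1_rtcd.h" /* the generated header patched above */

static int32_t coeffs[8] = { 100, -100, 50, -50, 24, -24, 12, -12 };

static void round_shift_example(void) {
  /* Under this arm config the macro expands to
   * av1_round_shift_array_neon(coeffs, 8, 1); a build with runtime CPU
   * detection would instead resolve the symbol through a function
   * pointer initialized by av1_rtcd(). */
  av1_round_shift_array(coeffs, 8, 1);
}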
diff --git a/config/arm64/config/aom_config.asm b/config/arm64/config/aom_config.asm
index b8fcd42..50338c1 100644
--- a/config/arm64/config/aom_config.asm
+++ b/config/arm64/config/aom_config.asm
@@ -13,7 +13,8 @@ ARCH_MIPS equ 0
ARCH_PPC equ 0
ARCH_X86 equ 0
ARCH_X86_64 equ 0
-CONFIG_2PASS_PARTITION_SEARCH_LVL equ 1
+CONFIG_2PASS_PARTITION_SEARCH_LVL_END equ 3
+CONFIG_2PASS_PARTITION_SEARCH_LVL_START equ 1
CONFIG_ACCOUNTING equ 0
CONFIG_ANALYZER equ 0
CONFIG_AV1_DECODER equ 1
@@ -21,7 +22,8 @@ CONFIG_AV1_ENCODER equ 0
CONFIG_BIG_ENDIAN equ 0
CONFIG_BITSTREAM_DEBUG equ 0
CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
-CONFIG_COLLECT_INTER_MODE_RD_STATS equ 1
+CONFIG_COLLECT_COMPONENT_TIMING equ 0
+CONFIG_COLLECT_PARTITION_STATS equ 0
CONFIG_COLLECT_RD_STATS equ 0
CONFIG_DEBUG equ 0
CONFIG_DENOISE equ 1
@@ -29,11 +31,8 @@ CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 equ 1
CONFIG_DIST_8X8 equ 0
CONFIG_ENTROPY_STATS equ 0
CONFIG_FILEOPTIONS equ 1
-CONFIG_FIX_GF_LENGTH equ 1
-CONFIG_FP_MB_STATS equ 0
CONFIG_GCC equ 1
CONFIG_GCOV equ 0
-CONFIG_GLOBAL_MOTION_SEARCH equ 1
CONFIG_GPROF equ 0
CONFIG_INSPECTION equ 0
CONFIG_INTERNAL_STATS equ 0
@@ -44,16 +43,15 @@ CONFIG_MAX_DECODE_PROFILE equ 0
CONFIG_MISMATCH_DEBUG equ 0
CONFIG_MULTITHREAD equ 1
CONFIG_NORMAL_TILE_MODE equ 1
-CONFIG_ONE_PASS_SVM equ 0
CONFIG_OS_SUPPORT equ 1
CONFIG_PIC equ 0
CONFIG_RD_DEBUG equ 0
-CONFIG_REDUCED_ENCODER_BORDER equ 0
CONFIG_RUNTIME_CPU_DETECT equ 0
CONFIG_SHARED equ 0
CONFIG_SHARP_SETTINGS equ 0
CONFIG_SIZE_LIMIT equ 1
CONFIG_SPATIAL_RESAMPLING equ 1
+CONFIG_SPEED_STATS equ 0
CONFIG_STATIC equ 1
CONFIG_WEBM_IO equ 1
DECODE_HEIGHT_LIMIT equ 16384
diff --git a/config/arm64/config/aom_config.h b/config/arm64/config/aom_config.h
index 5418985..a3b86df 100644
--- a/config/arm64/config/aom_config.h
+++ b/config/arm64/config/aom_config.h
@@ -15,7 +15,8 @@
#define ARCH_PPC 0
#define ARCH_X86 0
#define ARCH_X86_64 0
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
#define CONFIG_ACCOUNTING 0
#define CONFIG_ANALYZER 0
#define CONFIG_AV1_DECODER 1
@@ -23,7 +24,8 @@
#define CONFIG_BIG_ENDIAN 0
#define CONFIG_BITSTREAM_DEBUG 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
#define CONFIG_COLLECT_RD_STATS 0
#define CONFIG_DEBUG 0
#define CONFIG_DENOISE 1
@@ -31,11 +33,8 @@
#define CONFIG_DIST_8X8 0
#define CONFIG_ENTROPY_STATS 0
#define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
-#define CONFIG_FP_MB_STATS 0
#define CONFIG_GCC 1
#define CONFIG_GCOV 0
-#define CONFIG_GLOBAL_MOTION_SEARCH 1
#define CONFIG_GPROF 0
#define CONFIG_INSPECTION 0
#define CONFIG_INTERNAL_STATS 0
@@ -46,16 +45,15 @@
#define CONFIG_MISMATCH_DEBUG 0
#define CONFIG_MULTITHREAD 1
#define CONFIG_NORMAL_TILE_MODE 1
-#define CONFIG_ONE_PASS_SVM 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_PIC 0
#define CONFIG_RD_DEBUG 0
-#define CONFIG_REDUCED_ENCODER_BORDER 0
#define CONFIG_RUNTIME_CPU_DETECT 0
#define CONFIG_SHARED 0
#define CONFIG_SHARP_SETTINGS 0
#define CONFIG_SIZE_LIMIT 1
#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_SPEED_STATS 0
#define CONFIG_STATIC 1
#define CONFIG_WEBM_IO 1
#define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/arm64/config/aom_dsp_rtcd.h b/config/arm64/config/aom_dsp_rtcd.h
index e3150f7..0b1a28a 100644
--- a/config/arm64/config/aom_dsp_rtcd.h
+++ b/config/arm64/config/aom_dsp_rtcd.h
@@ -1400,10 +1400,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define aom_v_predictor_8x8 aom_v_predictor_8x8_neon
-void av1_round_shift_array_c(int32_t *arr, int size, int bit);
-void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
-#define av1_round_shift_array av1_round_shift_array_neon
-
void aom_dsp_rtcd(void);
#include "config/aom_config.h"
diff --git a/config/arm64/config/aom_scale_rtcd.h b/config/arm64/config/aom_scale_rtcd.h
index 7260bd3..067ddb4 100644
--- a/config/arm64/config/aom_scale_rtcd.h
+++ b/config/arm64/config/aom_scale_rtcd.h
@@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes);
+#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
+
void aom_scale_rtcd(void);
#include "config/aom_config.h"
diff --git a/config/arm64/config/av1_rtcd.h b/config/arm64/config/av1_rtcd.h
index c58e511..6f42666 100644
--- a/config/arm64/config/av1_rtcd.h
+++ b/config/arm64/config/av1_rtcd.h
@@ -89,6 +89,22 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
#define av1_convolve_y_sr av1_convolve_y_sr_neon
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_neon
+
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_neon
+
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_neon
+
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_neon
+
void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
@@ -140,6 +156,18 @@ void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride, uint16_t *d
void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_c
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
+
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
+
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
+
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
+
void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
@@ -152,27 +180,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
-void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c
-
-void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c
-
void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
-void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c
-
-void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c
-
-void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c
-
-void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c
-
void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
@@ -182,12 +192,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int
void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
-void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c
-
-void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c
-
void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
@@ -200,18 +204,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c
-
-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c
-
-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c
-
-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c
-
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_highbd_warp_affine av1_highbd_warp_affine_c
@@ -279,21 +271,9 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con
void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_inv_txfm_add av1_inv_txfm_add_neon
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d av1_jnt_convolve_2d_neon
-
-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_neon
-
-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_x av1_jnt_convolve_x_neon
-
-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_y av1_jnt_convolve_y_neon
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+void av1_round_shift_array_neon(int32_t *arr, int size, int bit);
+#define av1_round_shift_array av1_round_shift_array_neon
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
diff --git a/config/x86/config/aom_config.asm b/config/x86/config/aom_config.asm
index 4360c87..222e3bf 100644
--- a/config/x86/config/aom_config.asm
+++ b/config/x86/config/aom_config.asm
@@ -3,7 +3,7 @@
%define ARCH_PPC 0
%define ARCH_X86 1
%define ARCH_X86_64 0
-%define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+%define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
%define CONFIG_ACCOUNTING 0
%define CONFIG_ANALYZER 0
%define CONFIG_AV1_DECODER 1
@@ -11,7 +11,8 @@
%define CONFIG_BIG_ENDIAN 0
%define CONFIG_BITSTREAM_DEBUG 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-%define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+%define CONFIG_COLLECT_COMPONENT_TIMING 0
+%define CONFIG_COLLECT_PARTITION_STATS 0
%define CONFIG_COLLECT_RD_STATS 0
%define CONFIG_DEBUG 0
%define CONFIG_DENOISE 1
@@ -19,11 +20,8 @@
%define CONFIG_DIST_8X8 0
%define CONFIG_ENTROPY_STATS 0
%define CONFIG_FILEOPTIONS 1
-%define CONFIG_FIX_GF_LENGTH 1
-%define CONFIG_FP_MB_STATS 0
%define CONFIG_GCC 1
%define CONFIG_GCOV 0
-%define CONFIG_GLOBAL_MOTION_SEARCH 1
%define CONFIG_GPROF 0
%define CONFIG_INSPECTION 0
%define CONFIG_INTERNAL_STATS 0
@@ -34,16 +32,15 @@
%define CONFIG_MISMATCH_DEBUG 0
%define CONFIG_MULTITHREAD 1
%define CONFIG_NORMAL_TILE_MODE 1
-%define CONFIG_ONE_PASS_SVM 0
%define CONFIG_OS_SUPPORT 1
%define CONFIG_PIC 1
%define CONFIG_RD_DEBUG 0
-%define CONFIG_REDUCED_ENCODER_BORDER 0
%define CONFIG_RUNTIME_CPU_DETECT 0
%define CONFIG_SHARED 0
%define CONFIG_SHARP_SETTINGS 0
%define CONFIG_SIZE_LIMIT 1
%define CONFIG_SPATIAL_RESAMPLING 1
+%define CONFIG_SPEED_STATS 0
%define CONFIG_STATIC 1
%define CONFIG_WEBM_IO 1
%define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/x86/config/aom_config.h b/config/x86/config/aom_config.h
index e162899..db2edbd 100644
--- a/config/x86/config/aom_config.h
+++ b/config/x86/config/aom_config.h
@@ -15,7 +15,8 @@
#define ARCH_PPC 0
#define ARCH_X86 1
#define ARCH_X86_64 0
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
#define CONFIG_ACCOUNTING 0
#define CONFIG_ANALYZER 0
#define CONFIG_AV1_DECODER 1
@@ -23,7 +24,8 @@
#define CONFIG_BIG_ENDIAN 0
#define CONFIG_BITSTREAM_DEBUG 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
#define CONFIG_COLLECT_RD_STATS 0
#define CONFIG_DEBUG 0
#define CONFIG_DENOISE 1
@@ -31,11 +33,8 @@
#define CONFIG_DIST_8X8 0
#define CONFIG_ENTROPY_STATS 0
#define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
-#define CONFIG_FP_MB_STATS 0
#define CONFIG_GCC 1
#define CONFIG_GCOV 0
-#define CONFIG_GLOBAL_MOTION_SEARCH 1
#define CONFIG_GPROF 0
#define CONFIG_INSPECTION 0
#define CONFIG_INTERNAL_STATS 0
@@ -46,16 +45,15 @@
#define CONFIG_MISMATCH_DEBUG 0
#define CONFIG_MULTITHREAD 1
#define CONFIG_NORMAL_TILE_MODE 1
-#define CONFIG_ONE_PASS_SVM 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_PIC 1
#define CONFIG_RD_DEBUG 0
-#define CONFIG_REDUCED_ENCODER_BORDER 0
#define CONFIG_RUNTIME_CPU_DETECT 0
#define CONFIG_SHARED 0
#define CONFIG_SHARP_SETTINGS 0
#define CONFIG_SIZE_LIMIT 1
#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_SPEED_STATS 0
#define CONFIG_STATIC 1
#define CONFIG_WEBM_IO 1
#define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/x86/config/aom_dsp_rtcd.h b/config/x86/config/aom_dsp_rtcd.h
index 8f11e0b..f84f313 100644
--- a/config/x86/config/aom_dsp_rtcd.h
+++ b/config/x86/config/aom_dsp_rtcd.h
@@ -1650,9 +1650,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2
-void av1_round_shift_array_c(int32_t *arr, int size, int bit);
-#define av1_round_shift_array av1_round_shift_array_c
-
void aom_dsp_rtcd(void);
#ifdef RTCD_C
diff --git a/config/x86/config/aom_scale_rtcd.h b/config/x86/config/aom_scale_rtcd.h
index b6e8149..65c184b 100644
--- a/config/x86/config/aom_scale_rtcd.h
+++ b/config/x86/config/aom_scale_rtcd.h
@@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes);
+#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
+
void aom_scale_rtcd(void);
#ifdef RTCD_C
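The newly exposed aom_yv12_realloc_with_new_border hook resizes a frame buffer's border in place, which fits the removal of CONFIG_REDUCED_ENCODER_BORDER elsewhere in this change. A hedged usage sketch; the caller, border value, alignment, and plane count are illustrative assumptions:

struct yv12_buffer_config *buf = current_ref_buffer(); /* hypothetical helper */
if (aom_yv12_realloc_with_new_border(buf, /*new_border=*/288,
                                     /*byte_alignment=*/32,
                                     /*num_planes=*/3) != 0) {
  /* reallocation failed; leave the existing buffer untouched */
}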
diff --git a/config/x86/config/av1_rtcd.h b/config/x86/config/av1_rtcd.h
index c5d7794..f788933 100644
--- a/config/x86/config/av1_rtcd.h
+++ b/config/x86/config/av1_rtcd.h
@@ -88,6 +88,23 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
#define av1_convolve_y_sr av1_convolve_y_sr_sse2
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_ssse3
+
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_sse2
+
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_sse2
+
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_sse2
+
void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
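All of the additions in these headers follow the same compile-time dispatch pattern: with CONFIG_RUNTIME_CPU_DETECT set to 0 in these configs, each hook is bound to its best available specialization by a plain #define rather than a runtime function pointer. A minimal illustration with invented names:

void some_hook_c(int *p);     /* portable C fallback */
void some_hook_ssse3(int *p); /* best SIMD variant for this target */
#define some_hook some_hook_ssse3 /* callers simply write some_hook(...) */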
@@ -143,6 +160,18 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *d
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_ssse3
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
+
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
+
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
+
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
+
void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
@@ -155,27 +184,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
-void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c
-
-void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c
-
void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
-void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c
-
-void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c
-
-void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c
-
-void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c
-
void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
@@ -185,12 +196,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int
void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
-void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c
-
-void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c
-
void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
@@ -203,18 +208,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c
-
-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c
-
-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c
-
-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c
-
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_highbd_warp_affine av1_highbd_warp_affine_c
@@ -283,22 +276,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con
void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_inv_txfm_add av1_inv_txfm_add_ssse3
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d av1_jnt_convolve_2d_ssse3
-
-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_sse2
-
-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_x av1_jnt_convolve_x_sse2
-
-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_y av1_jnt_convolve_y_sse2
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+#define av1_round_shift_array av1_round_shift_array_c
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
diff --git a/config/x86_64/config/aom_config.asm b/config/x86_64/config/aom_config.asm
index 986dc75..43e7f74 100644
--- a/config/x86_64/config/aom_config.asm
+++ b/config/x86_64/config/aom_config.asm
@@ -3,7 +3,7 @@
%define ARCH_PPC 0
%define ARCH_X86 0
%define ARCH_X86_64 1
-%define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+%define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
%define CONFIG_ACCOUNTING 0
%define CONFIG_ANALYZER 0
%define CONFIG_AV1_DECODER 1
@@ -11,7 +11,8 @@
%define CONFIG_BIG_ENDIAN 0
%define CONFIG_BITSTREAM_DEBUG 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-%define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+%define CONFIG_COLLECT_COMPONENT_TIMING 0
+%define CONFIG_COLLECT_PARTITION_STATS 0
%define CONFIG_COLLECT_RD_STATS 0
%define CONFIG_DEBUG 0
%define CONFIG_DENOISE 1
@@ -19,11 +20,8 @@
%define CONFIG_DIST_8X8 0
%define CONFIG_ENTROPY_STATS 0
%define CONFIG_FILEOPTIONS 1
-%define CONFIG_FIX_GF_LENGTH 1
-%define CONFIG_FP_MB_STATS 0
%define CONFIG_GCC 1
%define CONFIG_GCOV 0
-%define CONFIG_GLOBAL_MOTION_SEARCH 1
%define CONFIG_GPROF 0
%define CONFIG_INSPECTION 0
%define CONFIG_INTERNAL_STATS 0
@@ -34,16 +32,15 @@
%define CONFIG_MISMATCH_DEBUG 0
%define CONFIG_MULTITHREAD 1
%define CONFIG_NORMAL_TILE_MODE 1
-%define CONFIG_ONE_PASS_SVM 0
%define CONFIG_OS_SUPPORT 1
%define CONFIG_PIC 0
%define CONFIG_RD_DEBUG 0
-%define CONFIG_REDUCED_ENCODER_BORDER 0
%define CONFIG_RUNTIME_CPU_DETECT 0
%define CONFIG_SHARED 0
%define CONFIG_SHARP_SETTINGS 0
%define CONFIG_SIZE_LIMIT 1
%define CONFIG_SPATIAL_RESAMPLING 1
+%define CONFIG_SPEED_STATS 0
%define CONFIG_STATIC 1
%define CONFIG_WEBM_IO 1
%define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/x86_64/config/aom_config.h b/config/x86_64/config/aom_config.h
index 0f32913..610e8ca 100644
--- a/config/x86_64/config/aom_config.h
+++ b/config/x86_64/config/aom_config.h
@@ -15,7 +15,8 @@
#define ARCH_PPC 0
#define ARCH_X86 0
#define ARCH_X86_64 1
-#define CONFIG_2PASS_PARTITION_SEARCH_LVL 1
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3
+#define CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1
#define CONFIG_ACCOUNTING 0
#define CONFIG_ANALYZER 0
#define CONFIG_AV1_DECODER 1
@@ -23,7 +24,8 @@
#define CONFIG_BIG_ENDIAN 0
#define CONFIG_BITSTREAM_DEBUG 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_COLLECT_INTER_MODE_RD_STATS 1
+#define CONFIG_COLLECT_COMPONENT_TIMING 0
+#define CONFIG_COLLECT_PARTITION_STATS 0
#define CONFIG_COLLECT_RD_STATS 0
#define CONFIG_DEBUG 0
#define CONFIG_DENOISE 1
@@ -31,11 +33,8 @@
#define CONFIG_DIST_8X8 0
#define CONFIG_ENTROPY_STATS 0
#define CONFIG_FILEOPTIONS 1
-#define CONFIG_FIX_GF_LENGTH 1
-#define CONFIG_FP_MB_STATS 0
#define CONFIG_GCC 1
#define CONFIG_GCOV 0
-#define CONFIG_GLOBAL_MOTION_SEARCH 1
#define CONFIG_GPROF 0
#define CONFIG_INSPECTION 0
#define CONFIG_INTERNAL_STATS 0
@@ -46,16 +45,15 @@
#define CONFIG_MISMATCH_DEBUG 0
#define CONFIG_MULTITHREAD 1
#define CONFIG_NORMAL_TILE_MODE 1
-#define CONFIG_ONE_PASS_SVM 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_PIC 0
#define CONFIG_RD_DEBUG 0
-#define CONFIG_REDUCED_ENCODER_BORDER 0
#define CONFIG_RUNTIME_CPU_DETECT 0
#define CONFIG_SHARED 0
#define CONFIG_SHARP_SETTINGS 0
#define CONFIG_SIZE_LIMIT 1
#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_SPEED_STATS 0
#define CONFIG_STATIC 1
#define CONFIG_WEBM_IO 1
#define DECODE_HEIGHT_LIMIT 16384
diff --git a/config/x86_64/config/aom_dsp_rtcd.h b/config/x86_64/config/aom_dsp_rtcd.h
index 8f11e0b..f84f313 100644
--- a/config/x86_64/config/aom_dsp_rtcd.h
+++ b/config/x86_64/config/aom_dsp_rtcd.h
@@ -1650,9 +1650,6 @@ void aom_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void aom_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define aom_v_predictor_8x8 aom_v_predictor_8x8_sse2
-void av1_round_shift_array_c(int32_t *arr, int size, int bit);
-#define av1_round_shift_array av1_round_shift_array_c
-
void aom_dsp_rtcd(void);
#ifdef RTCD_C
diff --git a/config/x86_64/config/aom_scale_rtcd.h b/config/x86_64/config/aom_scale_rtcd.h
index b6e8149..65c184b 100644
--- a/config/x86_64/config/aom_scale_rtcd.h
+++ b/config/x86_64/config/aom_scale_rtcd.h
@@ -77,6 +77,9 @@ void aom_yv12_partial_copy_v_c(const struct yv12_buffer_config *src_bc, int hsta
void aom_yv12_partial_copy_y_c(const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2);
#define aom_yv12_partial_copy_y aom_yv12_partial_copy_y_c
+int aom_yv12_realloc_with_new_border_c(struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes);
+#define aom_yv12_realloc_with_new_border aom_yv12_realloc_with_new_border_c
+
void aom_scale_rtcd(void);
#ifdef RTCD_C
diff --git a/config/x86_64/config/av1_rtcd.h b/config/x86_64/config/av1_rtcd.h
index 043595d..84673ba 100644
--- a/config/x86_64/config/av1_rtcd.h
+++ b/config/x86_64/config/av1_rtcd.h
@@ -88,6 +88,23 @@ void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst, int d
void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
#define av1_convolve_y_sr av1_convolve_y_sr_sse2
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d av1_dist_wtd_convolve_2d_ssse3
+
+void av1_dist_wtd_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_2d_copy av1_dist_wtd_convolve_2d_copy_sse2
+
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_x av1_dist_wtd_convolve_x_sse2
+
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
+#define av1_dist_wtd_convolve_y av1_dist_wtd_convolve_y_sse2
+
void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy);
#define av1_dr_prediction_z1 av1_dr_prediction_z1_c
@@ -146,6 +163,18 @@ void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride, uint16_t *d
void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
#define av1_highbd_convolve_y_sr av1_highbd_convolve_y_sr_ssse3
+void av1_highbd_dist_wtd_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d av1_highbd_dist_wtd_convolve_2d_c
+
+void av1_highbd_dist_wtd_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_2d_copy av1_highbd_dist_wtd_convolve_2d_copy_c
+
+void av1_highbd_dist_wtd_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_x av1_highbd_dist_wtd_convolve_x_c
+
+void av1_highbd_dist_wtd_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+#define av1_highbd_dist_wtd_convolve_y av1_highbd_dist_wtd_convolve_y_c
+
void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd);
#define av1_highbd_dr_prediction_z1 av1_highbd_dr_prediction_z1_c
@@ -158,27 +187,9 @@ void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw, int
void av1_highbd_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add av1_highbd_inv_txfm_add_c
-void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x16 av1_highbd_inv_txfm_add_16x16_c
-
-void av1_highbd_inv_txfm_add_16x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x32 av1_highbd_inv_txfm_add_16x32_c
-
void av1_highbd_inv_txfm_add_16x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_16x4 av1_highbd_inv_txfm_add_16x4_c
-void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_16x8 av1_highbd_inv_txfm_add_16x8_c
-
-void av1_highbd_inv_txfm_add_32x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x16 av1_highbd_inv_txfm_add_32x16_c
-
-void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x32 av1_highbd_inv_txfm_add_32x32_c
-
-void av1_highbd_inv_txfm_add_32x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_32x8 av1_highbd_inv_txfm_add_32x8_c
-
void av1_highbd_inv_txfm_add_4x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_4x16 av1_highbd_inv_txfm_add_4x16_c
@@ -188,12 +199,6 @@ void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int
void av1_highbd_inv_txfm_add_4x8_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_4x8 av1_highbd_inv_txfm_add_4x8_c
-void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x16 av1_highbd_inv_txfm_add_8x16_c
-
-void av1_highbd_inv_txfm_add_8x32_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
-#define av1_highbd_inv_txfm_add_8x32 av1_highbd_inv_txfm_add_8x32_c
-
void av1_highbd_inv_txfm_add_8x4_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_highbd_inv_txfm_add_8x4 av1_highbd_inv_txfm_add_8x4_c
@@ -206,18 +211,6 @@ void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
void av1_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
#define av1_highbd_iwht4x4_1_add av1_highbd_iwht4x4_1_add_c
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d av1_highbd_jnt_convolve_2d_c
-
-void av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_2d_copy av1_highbd_jnt_convolve_2d_copy_c
-
-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_x av1_highbd_jnt_convolve_x_c
-
-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd);
-#define av1_highbd_jnt_convolve_y av1_highbd_jnt_convolve_y_c
-
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta);
#define av1_highbd_warp_affine av1_highbd_warp_affine_c
@@ -286,22 +279,8 @@ void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride, con
void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param);
#define av1_inv_txfm_add av1_inv_txfm_add_ssse3
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d av1_jnt_convolve_2d_ssse3
-
-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_2d_copy av1_jnt_convolve_2d_copy_sse2
-
-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_x av1_jnt_convolve_x_sse2
-
-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params);
-#define av1_jnt_convolve_y av1_jnt_convolve_y_sse2
+void av1_round_shift_array_c(int32_t *arr, int size, int bit);
+#define av1_round_shift_array av1_round_shift_array_c
int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
diff --git a/libaom/CMakeLists.txt b/libaom/CMakeLists.txt
index f409892..2c35a0f 100644
--- a/libaom/CMakeLists.txt
+++ b/libaom/CMakeLists.txt
@@ -293,8 +293,11 @@ if(CONFIG_AV1_DECODER AND ENABLE_EXAMPLES)
if(EMSCRIPTEN)
add_preproc_definition(_POSIX_SOURCE)
- append_link_flag_to_target("inspect" "-s TOTAL_MEMORY=402653184")
+ append_link_flag_to_target("inspect" "--emrun")
+ append_link_flag_to_target("inspect" "-s USE_PTHREADS=0")
+ append_link_flag_to_target("inspect" "-s WASM=1")
append_link_flag_to_target("inspect" "-s MODULARIZE=1")
+ append_link_flag_to_target("inspect" "-s ALLOW_MEMORY_GROWTH=1")
append_link_flag_to_target(
"inspect" "-s \'EXTRA_EXPORTED_RUNTIME_METHODS=[\"UTF8ToString\"]\'")
append_link_flag_to_target("inspect"
diff --git a/libaom/PATENTS b/libaom/PATENTS
index be491f5..493f616 100644
--- a/libaom/PATENTS
+++ b/libaom/PATENTS
@@ -57,10 +57,10 @@ Alliance for Open Media Patent License 1.0
2. Definitions.
-2.1. Affiliate. “Affiliate” means an entity that directly or indirectly
+2.1. Affiliate. "Affiliate" means an entity that directly or indirectly
Controls, is Controlled by, or is under common Control of that party.
-2.2. Control. “Control” means direct or indirect control of more than 50% of
+2.2. Control. "Control" means direct or indirect control of more than 50% of
the voting power to elect directors of that corporation, or for any other
entity, the power to direct management of such entity.
@@ -70,7 +70,7 @@ Alliance for Open Media Patent License 1.0
2.4. Encoder. "Encoder" means any encoder that produces a bitstream that can
be decoded by a Decoder only to the extent it produces such a bitstream.
-2.5. Final Deliverable. “Final Deliverable” means the final version of a
+2.5. Final Deliverable. "Final Deliverable" means the final version of a
deliverable approved by the Alliance for Open Media as a Final
Deliverable.
@@ -79,9 +79,9 @@ Alliance for Open Media Patent License 1.0
Implementation also includes components of an Implementation only to the
extent they are used as part of an Implementation.
-2.7. License. “License” means this license.
+2.7. License. "License" means this license.
-2.8. Licensee. “Licensee” means any person or entity who exercises patent
+2.8. Licensee. "Licensee" means any person or entity who exercises patent
rights granted under this License.
2.9. Licensor. "Licensor" means (i) any Licensee that makes, sells, offers
@@ -98,11 +98,11 @@ Alliance for Open Media Patent License 1.0
as if the Specification was a W3C Recommendation; or (ii) are infringed
by the Reference Implementation.
-2.11. Reference Implementation. “Reference Implementation” means an Encoder
+2.11. Reference Implementation. "Reference Implementation" means an Encoder
and/or Decoder released by the Alliance for Open Media as a Final
Deliverable.
-2.12. Specification. “Specification” means the specification designated by
+2.12. Specification. "Specification" means the specification designated by
the Alliance for Open Media as a Final Deliverable for which this
License was issued.
diff --git a/libaom/aom/aom_encoder.h b/libaom/aom/aom_encoder.h
index 777236f..f8a7cec 100644
--- a/libaom/aom/aom_encoder.h
+++ b/libaom/aom/aom_encoder.h
@@ -406,8 +406,7 @@ typedef struct aom_codec_enc_cfg {
* upscaling after the encode/decode process. Taking control of upscaling and
* using restoration filters should allow it to outperform normal resizing.
*
- * Mode 0 is SUPERRES_NONE, mode 1 is SUPERRES_FIXED, mode 2 is
- * SUPERRES_RANDOM and mode 3 is SUPERRES_QTHRESH.
+ * Valid values are 0 to 4 as defined in enum SUPERRES_MODE.
*/
unsigned int rc_superres_mode;
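A short sketch of setting this field, reusing the numeric mapping preserved from the removed comment (mode 3 was documented as SUPERRES_QTHRESH); cfg is assumed to be populated via aom_codec_enc_config_default, and the threshold value is illustrative:

aom_codec_enc_cfg_t cfg;      /* assume initialized beforehand */
cfg.rc_superres_mode = 3;     /* SUPERRES_QTHRESH per the old comment */
cfg.rc_superres_qthresh = 45; /* companion threshold field; value illustrative */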
@@ -862,6 +861,11 @@ aom_codec_err_t aom_codec_enc_config_set(aom_codec_ctx_t *ctx,
*/
aom_fixed_buf_t *aom_codec_get_global_headers(aom_codec_ctx_t *ctx);
+/*!\brief usage parameter analogous to AV1 GOOD QUALITY mode. */
+#define AOM_USAGE_GOOD_QUALITY (0)
+/*!\brief usage parameter analogous to AV1 REALTIME mode. */
+#define AOM_USAGE_REALTIME (1)
+
/*!\brief Encode a frame
*
* Encodes a video frame at the given "presentation time." The presentation
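The two new usage constants plug into the existing config-default entry point; a minimal sketch assuming the standard encoder setup sequence:

aom_codec_enc_cfg_t cfg;
if (aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg,
                                 AOM_USAGE_REALTIME) != AOM_CODEC_OK) {
  /* handle the error */
}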
diff --git a/libaom/aom/aom_frame_buffer.h b/libaom/aom/aom_frame_buffer.h
index fba4322..a715645 100644
--- a/libaom/aom/aom_frame_buffer.h
+++ b/libaom/aom/aom_frame_buffer.h
@@ -53,9 +53,9 @@ typedef struct aom_codec_frame_buffer {
* data. The callback is triggered when the decoder needs a frame buffer to
* decode a compressed image into. This function may be called more than once
* for every call to aom_codec_decode. The application may set fb->priv to
- * some data which will be passed back in the ximage and the release function
- * call. |fb| is guaranteed to not be NULL. On success the callback must
- * return 0. Any failure the callback must return a value less than 0.
+ * some data which will be passed back in the aom_image_t and the release
+ * function call. |fb| is guaranteed to not be NULL. On success the callback
+ * must return 0. On any failure the callback must return a value less than 0.
*
* \param[in] priv Callback's private data
* \param[in] new_size Size in bytes needed by the buffer
diff --git a/libaom/aom/aomcx.h b/libaom/aom/aomcx.h
index 9aa77bb..da7498f 100644
--- a/libaom/aom/aomcx.h
+++ b/libaom/aom/aomcx.h
@@ -512,16 +512,25 @@ enum aome_enc_control_id {
*/
AV1E_SET_RENDER_SIZE,
- /*!\brief Codec control function to set target level.
- *
- * 255: off (default); 0: only keep level stats; 10: target for level 1.0;
- * 11: target for level 1.1; ... 62: target for level 6.2
- */
- AV1E_SET_TARGET_LEVEL,
-
- /*!\brief Codec control function to get bitstream level.
- */
- AV1E_GET_LEVEL,
+ /*!\brief Control to set the target sequence level index for a certain
+ * operating point (OP).
+ * Possible values take the form "ABxy" (leading zeros may be omitted when
+ * the value has fewer than 4 digits).
+ * AB: OP index.
+ * xy: target level index for the OP. Valid values are 0~23 (corresponding
+ * to levels 2.0 ~ 7.3) or 31 (maximum level parameter, no level-based
+ * constraints).
+ * E.g. "0" means target level index 0 for the 0th OP;
+ * "111" means target level index 11 for the 1st OP;
+ * "1021" means target level index 21 for the 10th OP.
+ * If the target level is not specified for an OP, the maximum level
+ * parameter of 31 is used as the default.
+ */
+ AV1E_SET_TARGET_SEQ_LEVEL_IDX,
+
+ /*!\brief Codec control function to get sequence level index.
+ */
+ AV1E_GET_SEQ_LEVEL_IDX,
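Since the "ABxy" encoding is plain decimal composition, a tiny helper makes the mapping explicit; this is illustrative arithmetic, not an API in the tree:

/* op 10, level 21 -> 1021; op 1, level 11 -> 111; op 0, level 0 -> 0 */
static int target_seq_level_value(int op_index, int level_index) {
  return op_index * 100 + level_index;
}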
/*!\brief Codec control function to set intended superblock size.
*
@@ -561,12 +570,23 @@ enum aome_enc_control_id {
*/
AV1E_SET_ENABLE_RESTORATION,
+ /*!\brief Codec control function to turn on / off OBMC prediction mode.
+ *
+ * 0 = do not allow OBMC mode
+ * 1 = allow OBMC mode
+ *
+ * By default, the encoder allows OBMC prediction mode.
+ *
+ */
+ AV1E_SET_ENABLE_OBMC,
+
/*!\brief Codec control function to encode without trellis quantization.
*
* 0 = apply trellis quantization
* 1 = do not apply trellis quantization
+ * 2 = disable trellis quantization partially
*
- * By default, the encoder applies trellis optimization on quantized
+ * By default, the encoder applies optimization on quantized
* coefficients.
*
*/
@@ -700,13 +720,59 @@ enum aome_enc_control_id {
*/
AV1E_SET_ANS_WINDOW_SIZE_LOG2,
- /*!\brief Codec control function to turn on / off dual filter
- * enabling/disabling.
+ /*!\brief Codec control function to enable/disable rectangular partitions.
+ *
+ * This will enable or disable usage of rectangular partitions. The default
+ * value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_RECT_PARTITIONS,
+
+ /*!\brief Codec control function to enable/disable AB partitions.
+ *
+ * This will enable or disable usage of AB partitions. The default
+ * value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_AB_PARTITIONS,
+
+ /*!\brief Codec control function to enable/disable 1:4 and 4:1 partitions.
*
- * This will enable or disable dual filter. The default value is 1
+ * This will enable or disable usage of 1:4 and 4:1 partitions. The default
+ * value is 1.
*
*/
- AV1E_SET_ENABLE_DF,
+ AV1E_SET_ENABLE_1TO4_PARTITIONS,
+
+ /*!\brief Codec control function to set min partition size.
+ *
+ * This will set the minimum partition size. The default value is 4 (4x4).
+ * Valid values are [4, 8, 16, 32, 64, 128].
+ * min_partition_size is applied to both the width and height of a partition,
+ * i.e., neither the width nor the height of a partition can be smaller than
+ * min_partition_size, except for partitions at the picture boundary.
+ *
+ */
+ AV1E_SET_MIN_PARTITION_SIZE,
+
+ /*!\brief Codec control function to set max partition size.
+ *
+ * This will set the maximum partition size. The default value is 128
+ * (128x128). Valid values are [4, 8, 16, 32, 64, 128].
+ * max_partition_size is applied to both the width and height of a partition,
+ * i.e., neither the width nor the height of a partition can be larger than
+ * max_partition_size.
+ */
+ AV1E_SET_MAX_PARTITION_SIZE,
+
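A hedged example of driving the two partition-size controls together; ctx is an initialized aom_codec_ctx_t and the values come from the valid set listed above:

aom_codec_control(&ctx, AV1E_SET_MIN_PARTITION_SIZE, 8);
aom_codec_control(&ctx, AV1E_SET_MAX_PARTITION_SIZE, 64);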
+ /*!\brief Codec control function to turn on / off intra edge filter
+ * at sequence level.
+ *
+ * This will enable or disable usage of intra-edge filtering. The default
+ * value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_INTRA_EDGE_FILTER,
/*!\brief Codec control function to turn on / off frame order hint for a
* few tools:
@@ -720,14 +786,42 @@ enum aome_enc_control_id {
*/
AV1E_SET_ENABLE_ORDER_HINT,
- /*!\brief Codec control function to turn on / off joint compound mode
+ /*!\brief Codec control function to turn on / off 64-length transforms.
+ *
+ * This will enable or disable usage of length 64 transforms in any
+ * direction. The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_TX64,
+
+ /*!\brief Codec control function to turn on / off flip and identity
+ * transforms.
+ *
+ * This will enable or disable usage of flip and identity transform
+ * types in any direction. The default value is 1. The affected types are:
+ * FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, ADST_FLIPADST,
+ * FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, H_ADST, V_FLIPADST,
+ * H_FLIPADST.
+ */
+ AV1E_SET_ENABLE_FLIP_IDTX,
+
+ /*!\brief Codec control function to set transform block size search method.
+ *
+ * This will set the transform block size search method.
+ * 0: use Full RD search, 1: use Fast RD search, 2: always use largest
+ * allowed transform block size based on partition size.
+ */
+ AV1E_SET_TX_SIZE_SEARCH_METHOD,
+
+ /*!\brief Codec control function to turn on / off dist-wtd compound mode
* at sequence level.
*
- * This will enable or disable joint compound mode. The default value is 1.
- * If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced to 0.
+ * This will enable or disable distance-weighted compound mode. The default
+ * value is 1. If AV1E_SET_ENABLE_ORDER_HINT is 0, then this flag is forced
+ * to 0.
*
*/
- AV1E_SET_ENABLE_JNT_COMP,
+ AV1E_SET_ENABLE_DIST_WTD_COMP,
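For intuition, distance-weighted compound replaces the simple average of two inter predictors with a weighted blend. A sketch of the core operation; the quantized weight pair and rounding are assumptions for illustration, not the spec constants (AV1 derives the pair from relative frame distances):

/* Requires <stdint.h>. Assumes w0 + w1 == 16. */
static uint8_t dist_wtd_blend(int p0, int p1, int w0, int w1) {
  return (uint8_t)((w0 * p0 + w1 * p1 + 8) >> 4);
}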
/*!\brief Codec control function to turn on / off ref frame mvs (mfmv) usage
* at sequence level.
@@ -747,6 +841,86 @@ enum aome_enc_control_id {
*/
AV1E_SET_ALLOW_REF_FRAME_MVS,
+ /*!\brief Codec control function to turn on / off dual filter usage
+ * for a sequence.
+ *
+ * This will enable or disable use of dual interpolation filter.
+ * The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_DUAL_FILTER,
+
+ /*!\brief Codec control function to turn on / off masked compound usage
+ * for a sequence.
+ *
+ * This will enable or disable usage of wedge and diff-wtd compound
+ * modes. The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_MASKED_COMP,
+
+ /*!\brief Codec control function to turn on / off one sided compound usage
+ * for a sequence.
+ *
+ * This will enable or disable usage of one sided compound
+ * modes. The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_ONESIDED_COMP,
+
+ /*!\brief Codec control function to turn on / off interintra compound
+ * for a sequence.
+ *
+ * This will enable or disable usage of inter-intra compound modes.
+ * The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_INTERINTRA_COMP,
+
+ /*!\brief Codec control function to turn on / off smooth inter-intra
+ * mode for a sequence.
+ *
+ * This will enable or disable usage of smooth inter-intra mode.
+ * The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_SMOOTH_INTERINTRA,
+
+ /*!\brief Codec control function to turn on / off difference weighted
+ * compound.
+ *
+ * This will enable or disable usage of difference weighted compound.
+ * The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_DIFF_WTD_COMP,
+
+ /*!\brief Codec control function to turn on / off interinter wedge
+ * compound.
+ *
+ * This will enable or disable usage of interinter wedge compound.
+ * The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_INTERINTER_WEDGE,
+
+ /*!\brief Codec control function to turn on / off interintra wedge
+ * compound.
+ *
+ * This will enable or disable usage of interintra wedge compound.
+ * The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_INTERINTRA_WEDGE,
+
+ /*!\brief Codec control function to turn on / off global motion usage
+ * for a sequence.
+ *
+ * This will enable or disable usage of global motion. The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_GLOBAL_MOTION,
+
/*!\brief Codec control function to turn on / off warped motion usage
* at sequence level.
*
@@ -764,6 +938,39 @@ enum aome_enc_control_id {
*/
AV1E_SET_ALLOW_WARPED_MOTION,
+ /*!\brief Codec control function to turn on / off filter intra usage at
+ * sequence level.
+ *
+ * This will enable or disable usage of filter intra. The default value is 1.
+ * If AV1E_SET_ENABLE_FILTER_INTRA is 0, then this flag is forced to 0.
+ *
+ */
+ AV1E_SET_ENABLE_FILTER_INTRA,
+
+ /*!\brief Codec control function to turn on / off smooth intra modes usage.
+ *
+ * This will enable or disable usage of smooth, smooth_h and smooth_v intra
+ * modes. The default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_SMOOTH_INTRA,
+
+ /*!\brief Codec control function to turn on / off Paeth intra mode usage.
+ *
+ * This will enable or disable usage of Paeth intra mode. The default value
+ * is 1.
+ *
+ */
+ AV1E_SET_ENABLE_PAETH_INTRA,
+
+ /*!\brief Codec control function to turn on / off CFL uv intra mode usage.
+ *
+ * This will enable or disable usage of chroma-from-luma intra mode. The
+ * default value is 1.
+ *
+ */
+ AV1E_SET_ENABLE_CFL_INTRA,
+
/*!\brief Codec control function to turn on / off frame superresolution.
*
* This will enable or disable frame superresolution. The default value is 1
@@ -771,6 +978,15 @@ enum aome_enc_control_id {
*/
AV1E_SET_ENABLE_SUPERRES,
+ /*!\brief Codec control function to turn on/off palette mode */
+ AV1E_SET_ENABLE_PALETTE,
+
+ /*!\brief Codec control function to turn on/off intra block copy mode */
+ AV1E_SET_ENABLE_INTRABC,
+
+ /*!\brief Codec control function to turn on/off intra angle delta */
+ AV1E_SET_ENABLE_ANGLE_DELTA,
+
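Several of the new sequence-level switches are typically flipped together when trading coding tools for speed; a brief hedged example with ctx an initialized encoder context:

aom_codec_control(&ctx, AV1E_SET_ENABLE_PALETTE, 0);
aom_codec_control(&ctx, AV1E_SET_ENABLE_INTRABC, 0);
aom_codec_control(&ctx, AV1E_SET_ENABLE_ANGLE_DELTA, 0);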
/*!\brief Codec control function to set the delta q mode
*
* AV1 has a segment based feature that allows encoder to adaptively change
@@ -828,6 +1044,54 @@ enum aome_enc_control_id {
/*!\brief Sets the chroma subsampling y value */
AV1E_SET_CHROMA_SUBSAMPLING_Y,
+
+ /*!\brief Control to use a reduced tx type set */
+ AV1E_SET_REDUCED_TX_TYPE_SET,
+
+ /*!\brief Control to use dct only for intra modes */
+ AV1E_SET_INTRA_DCT_ONLY,
+
+ /*!\brief Control to use dct only for inter modes */
+ AV1E_SET_INTER_DCT_ONLY,
+
+ /*!\brief Control to use default tx type only for intra modes */
+ AV1E_SET_INTRA_DEFAULT_TX_ONLY,
+
+ /*!\brief Control to use adaptive quantize_b */
+ AV1E_SET_QUANT_B_ADAPT,
+
+ /*!\brief Control to select maximum height for the GF group pyramid structure
+ * (valid values: 0 - 4) */
+ AV1E_SET_GF_MAX_PYRAMID_HEIGHT,
+
+ /*!\brief Control to select maximum reference frames allowed per frame
+ * (valid values: 3 - 7) */
+ AV1E_SET_MAX_REFERENCE_FRAMES,
+
+ /*!\brief Control to use reduced set of single and compound references. */
+ AV1E_SET_REDUCED_REFERENCE_SET,
+
+ /*!\brief Control to set frequency of the cost updates for coefficients
+ * Possible values are:
+ * 0: Update at SB level (default)
+ * 1: Update at SB row level in tile
+ * 2: Update at tile level
+ */
+ AV1E_SET_COEFF_COST_UPD_FREQ,
+
+ /*!\brief Control to set frequency of the cost updates for mode
+ * Possible values are:
+ * 0: Update at SB level (default)
+ * 1: Update at SB row level in tile
+ * 2: Update at tile level
+ */
+ AV1E_SET_MODE_COST_UPD_FREQ,
+
+ /*!\brief Control to set bit mask that specifies which tier each of the 32
+ * possible operating points conforms to.
+ * Bit value 0: Main Tier; 1: High Tier.
+ */
+ AV1E_SET_TIER_MASK,
};
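A sketch of composing the tier mask from the last entry, assuming bit i governs operating point i as the comment states:

/* Operating points 0 and 2 in High Tier, all others in Main Tier. */
unsigned int tier_mask = (1u << 0) | (1u << 2);
aom_codec_control(&ctx, AV1E_SET_TIER_MASK, tier_mask);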
/*!\brief aom 1-D scaling mode
@@ -934,13 +1198,11 @@ AOM_CTRL_USE_TYPE(AOME_SET_ACTIVEMAP, aom_active_map_t *)
AOM_CTRL_USE_TYPE(AOME_SET_SCALEMODE, aom_scaling_mode_t *)
#define AOM_CTRL_AOME_SET_SCALEMODE
-AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, int)
+AOM_CTRL_USE_TYPE(AOME_SET_SPATIAL_LAYER_ID, unsigned int)
#define AOM_CTRL_AOME_SET_SPATIAL_LAYER_ID
AOM_CTRL_USE_TYPE(AOME_SET_CPUUSED, int)
#define AOM_CTRL_AOME_SET_CPUUSED
-AOM_CTRL_USE_TYPE(AOME_SET_DEVSF, int)
-#define AOM_CTRL_AOME_SET_DEVSF
AOM_CTRL_USE_TYPE(AOME_SET_ENABLEAUTOALTREF, unsigned int)
#define AOM_CTRL_AOME_SET_ENABLEAUTOALTREF
@@ -961,12 +1223,12 @@ AOM_CTRL_USE_TYPE(AOME_SET_TUNING, int) /* aom_tune_metric */
AOM_CTRL_USE_TYPE(AOME_SET_CQ_LEVEL, unsigned int)
#define AOM_CTRL_AOME_SET_CQ_LEVEL
-AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ROW_MT, unsigned int)
#define AOM_CTRL_AV1E_SET_ROW_MT
-AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, int)
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_COLUMNS, unsigned int)
#define AOM_CTRL_AV1E_SET_TILE_COLUMNS
-AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, int)
+AOM_CTRL_USE_TYPE(AV1E_SET_TILE_ROWS, unsigned int)
#define AOM_CTRL_AV1E_SET_TILE_ROWS
AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TPL_MODEL, unsigned int)
@@ -997,6 +1259,9 @@ AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CDEF, unsigned int)
AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RESTORATION, unsigned int)
#define AOM_CTRL_AV1E_SET_ENABLE_RESTORATION
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_OBMC, unsigned int)
+#define AOM_CTRL_AV1E_SET_ENABLE_OBMC
+
AOM_CTRL_USE_TYPE(AV1E_SET_DISABLE_TRELLIS_QUANT, unsigned int)
#define AOM_CTRL_AV1E_SET_DISABLE_TRELLIS_QUANT
@@ -1029,37 +1294,109 @@ AOM_CTRL_USE_TYPE(AV1E_SET_MTU, unsigned int)
AOM_CTRL_USE_TYPE(AV1E_SET_TIMING_INFO_TYPE, int) /* aom_timing_info_type_t */
#define AOM_CTRL_AV1E_SET_TIMING_INFO_TYPE
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DF, unsigned int)
-#define AOM_CTRL_AV1E_SET_ENABLE_DF
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_RECT_PARTITIONS, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_RECT_PARTITIONS
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ORDER_HINT, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_AB_PARTITIONS, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_AB_PARTITIONS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_1TO4_PARTITIONS, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_1TO4_PARTITIONS
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MIN_PARTITION_SIZE, int)
+#define AOM_CTRL_AV1E_SET_MIN_PARTITION_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_PARTITION_SIZE, int)
+#define AOM_CTRL_AV1E_SET_MAX_PARTITION_SIZE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTRA_EDGE_FILTER
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ORDER_HINT, int)
#define AOM_CTRL_AV1E_SET_ENABLE_ORDER_HINT
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_JNT_COMP, unsigned int)
-#define AOM_CTRL_AV1E_SET_ENABLE_JNT_COMP
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_TX64, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_TX64
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_REF_FRAME_MVS, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_TX_SIZE_SEARCH_METHOD, int)
+#define AOM_CTRL_AV1E_SET_TX_SIZE_SEARCH_METHOD
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FLIP_IDTX, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_FLIP_IDTX
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIST_WTD_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIST_WTD_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_REF_FRAME_MVS, int)
#define AOM_CTRL_AV1E_SET_ENABLE_REF_FRAME_MVS
-AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_REF_FRAME_MVS, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_REF_FRAME_MVS, int)
#define AOM_CTRL_AV1E_SET_ALLOW_REF_FRAME_MVS
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_WARPED_MOTION, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DUAL_FILTER, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DUAL_FILTER
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_MASKED_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_MASKED_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ONESIDED_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_ONESIDED_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTERINTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_DIFF_WTD_COMP, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_DIFF_WTD_COMP
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTER_WEDGE, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTER_WEDGE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTERINTRA_WEDGE, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTERINTRA_WEDGE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_GLOBAL_MOTION, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_GLOBAL_MOTION
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_WARPED_MOTION, int)
#define AOM_CTRL_AV1E_SET_ENABLE_WARPED_MOTION
-AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_WARPED_MOTION, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ALLOW_WARPED_MOTION, int)
#define AOM_CTRL_AV1E_SET_ALLOW_WARPED_MOTION
-AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_FILTER_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_FILTER_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SMOOTH_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_SMOOTH_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PAETH_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_PAETH_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_CFL_INTRA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_CFL_INTRA
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_SUPERRES, int)
#define AOM_CTRL_AV1E_SET_ENABLE_SUPERRES
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_PALETTE, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_PALETTE
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_INTRABC, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_INTRABC
+
+AOM_CTRL_USE_TYPE(AV1E_SET_ENABLE_ANGLE_DELTA, int)
+#define AOM_CTRL_AV1E_SET_ENABLE_ANGLE_DELTA
+
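The int/unsigned int churn above matters because AOM_CTRL_USE_TYPE bakes the argument type into type-checked wrappers over aom_codec_control. A rough sketch of what one instantiation amounts to conceptually; the real macro in the public headers differs in detail:

static aom_codec_err_t control_set_enable_rect_partitions(aom_codec_ctx_t *ctx,
                                                          int value) {
  return aom_codec_control(ctx, AV1E_SET_ENABLE_RECT_PARTITIONS, value);
}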
AOM_CTRL_USE_TYPE(AV1E_SET_FRAME_PARALLEL_DECODING, unsigned int)
#define AOM_CTRL_AV1E_SET_FRAME_PARALLEL_DECODING
-AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_ERROR_RESILIENT_MODE, int)
#define AOM_CTRL_AV1E_SET_ERROR_RESILIENT_MODE
-AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_S_FRAME_MODE, int)
#define AOM_CTRL_AV1E_SET_S_FRAME_MODE
AOM_CTRL_USE_TYPE(AV1E_SET_AQ_MODE, unsigned int)
@@ -1107,14 +1444,8 @@ AOM_CTRL_USE_TYPE(AV1E_SET_RENDER_SIZE, int *)
AOM_CTRL_USE_TYPE(AV1E_SET_SUPERBLOCK_SIZE, unsigned int)
#define AOM_CTRL_AV1E_SET_SUPERBLOCK_SIZE
-AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_LEVEL, unsigned int)
-#define AOM_CTRL_AV1E_SET_TARGET_LEVEL
-
-AOM_CTRL_USE_TYPE(AV1E_GET_LEVEL, int *)
-#define AOM_CTRL_AV1E_GET_LEVEL
-
-AOM_CTRL_USE_TYPE(AV1E_SET_ANS_WINDOW_SIZE_LOG2, unsigned int)
-#define AOM_CTRL_AV1E_SET_ANS_WINDOW_SIZE_LOG2
+AOM_CTRL_USE_TYPE(AV1E_GET_SEQ_LEVEL_IDX, int *)
+#define AOM_CTRL_AV1E_GET_SEQ_LEVEL_IDX
AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int)
#define AOM_CTRL_AV1E_SET_SINGLE_TILE_DECODING
@@ -1122,13 +1453,13 @@ AOM_CTRL_USE_TYPE(AV1E_SET_SINGLE_TILE_DECODING, unsigned int)
AOM_CTRL_USE_TYPE(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int)
#define AOM_CTRL_AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST
-AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, unsigned int)
+AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TEST_VECTOR, int)
#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TEST_VECTOR
AOM_CTRL_USE_TYPE(AV1E_SET_FILM_GRAIN_TABLE, const char *)
#define AOM_CTRL_AV1E_SET_FILM_GRAIN_TABLE
-AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, int)
+AOM_CTRL_USE_TYPE(AV1E_SET_CDF_UPDATE_MODE, unsigned int)
#define AOM_CTRL_AV1E_SET_CDF_UPDATE_MODE
#ifdef CONFIG_DENOISE
@@ -1145,6 +1476,42 @@ AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_X, unsigned int)
AOM_CTRL_USE_TYPE(AV1E_SET_CHROMA_SUBSAMPLING_Y, unsigned int)
#define AOM_CTRL_AV1E_SET_CHROMA_SUBSAMPLING_Y
+AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_TX_TYPE_SET, int)
+#define AOM_CTRL_AV1E_SET_REDUCED_TX_TYPE_SET
+
+AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DCT_ONLY, int)
+#define AOM_CTRL_AV1E_SET_INTRA_DCT_ONLY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_INTER_DCT_ONLY, int)
+#define AOM_CTRL_AV1E_SET_INTER_DCT_ONLY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_INTRA_DEFAULT_TX_ONLY, int)
+#define AOM_CTRL_AV1E_SET_INTRA_DEFAULT_TX_ONLY
+
+AOM_CTRL_USE_TYPE(AV1E_SET_QUANT_B_ADAPT, int)
+#define AOM_CTRL_AV1E_SET_QUANT_B_ADAPT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, unsigned int)
+#define AOM_CTRL_AV1E_SET_GF_MAX_PYRAMID_HEIGHT
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MAX_REFERENCE_FRAMES, int)
+#define AOM_CTRL_AV1E_SET_MAX_REFERENCE_FRAMES
+
+AOM_CTRL_USE_TYPE(AV1E_SET_REDUCED_REFERENCE_SET, int)
+#define AOM_CTRL_AV1E_SET_REDUCED_REFERENCE_SET
+
+AOM_CTRL_USE_TYPE(AV1E_SET_COEFF_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_COEFF_COST_UPD_FREQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_MODE_COST_UPD_FREQ, unsigned int)
+#define AOM_CTRL_AV1E_SET_MODE_COST_UPD_FREQ
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TARGET_SEQ_LEVEL_IDX, int)
+#define AOM_CTRL_AV1E_SET_TARGET_SEQ_LEVEL_IDX
+
+AOM_CTRL_USE_TYPE(AV1E_SET_TIER_MASK, unsigned int)
+#define AOM_CTRL_AV1E_SET_TIER_MASK
+
/*!\endcond */
/*! @} - end defgroup aom_encoder */
#ifdef __cplusplus
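Note on the control-type changes above: AOM_CTRL_USE_TYPE drives the compile-time type checking that aom_codec_control() applies to its variadic argument, so moving these AV1E_SET_* controls between unsigned int and int changes the type callers are expected to pass. A minimal caller-side sketch, assuming an encoder context already initialized with aom_codec_enc_init() and eliding error handling:

    #include "aom/aom_codec.h"
    #include "aom/aomcx.h"

    static void configure_tools(aom_codec_ctx_t *encoder) {
      /* After this change the enable/disable controls take a plain int. */
      aom_codec_control(encoder, AV1E_SET_ENABLE_PALETTE, 1);
      aom_codec_control(encoder, AV1E_SET_ENABLE_INTRABC, 0);
      aom_codec_control(encoder, AV1E_SET_ENABLE_GLOBAL_MOTION, 1);
    }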
diff --git a/libaom/aom_dsp/add_noise.c b/libaom/aom_dsp/add_noise.c
index bfb3e7e..43587ca 100644
--- a/libaom/aom_dsp/add_noise.c
+++ b/libaom/aom_dsp/add_noise.c
@@ -40,7 +40,7 @@ void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
}
static double gaussian(double sigma, double mu, double x) {
- return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
+ return 1 / (sigma * sqrt(2.0 * PI)) *
(exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
}
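The hunk above replaces a truncated 3.14159265 literal with the shared PI constant, so gaussian() is the standard normal density N(mu, sigma^2) evaluated at x. A self-contained restatement for reference, using POSIX math.h's M_PI as a stand-in for libaom's PI macro:

    #include <math.h>

    /* Mirrors the gaussian() helper above: 1/(sigma*sqrt(2*pi)) *
     * exp(-(x-mu)^2 / (2*sigma^2)). */
    static double gaussian_pdf(double sigma, double mu, double x) {
      return 1.0 / (sigma * sqrt(2.0 * M_PI)) *
             exp(-(x - mu) * (x - mu) / (2.0 * sigma * sigma));
    }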
diff --git a/libaom/aom_dsp/aom_dsp.cmake b/libaom/aom_dsp/aom_dsp.cmake
index a8490c4..abf6a60 100644
--- a/libaom/aom_dsp/aom_dsp.cmake
+++ b/libaom/aom_dsp/aom_dsp.cmake
@@ -194,6 +194,7 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/quantize_x86.h"
"${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
@@ -226,6 +227,7 @@ if(CONFIG_AV1_ENCODER)
"${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
"${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
"${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
@@ -361,6 +363,8 @@ function(setup_aom_dsp_targets)
endif()
endif()
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
+
# Pass the new lib targets up to the parent scope instance of
# $AOM_LIB_TARGETS.
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
diff --git a/libaom/aom_dsp/aom_dsp_rtcd_defs.pl b/libaom/aom_dsp/aom_dsp_rtcd_defs.pl
index 59d0620..f56a117 100755
--- a/libaom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/libaom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -466,10 +466,6 @@ specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
-# Helper functions.
-add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
-specialize "av1_round_shift_array", qw/sse4_1 neon/;
-
#
# Encoder functions.
#
@@ -522,10 +518,17 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
+ add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_adaptive sse2/;
+
add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
+ add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_32x32_adaptive sse2/;
+
add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_64x64 ssse3/;
} # CONFIG_AV1_ENCODER
if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
@@ -536,7 +539,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_highbd_quantize_b_32x32 sse2/;
add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-
+ specialize qw/aom_highbd_quantize_b_64x64 sse2/;
} # CONFIG_AV1_ENCODER
#
@@ -596,7 +599,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
($w, $h) = @$_;
add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- add_proto qw/unsigned int/, "aom_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
+ add_proto qw/unsigned int/, "aom_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
}
specialize qw/aom_sad128x128 avx2 sse2/;
@@ -647,29 +650,29 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_sad16x64_avg sse2/;
specialize qw/aom_sad64x16_avg sse2/;
- specialize qw/aom_jnt_sad128x128_avg ssse3/;
- specialize qw/aom_jnt_sad128x64_avg ssse3/;
- specialize qw/aom_jnt_sad64x128_avg ssse3/;
- specialize qw/aom_jnt_sad64x64_avg ssse3/;
- specialize qw/aom_jnt_sad64x32_avg ssse3/;
- specialize qw/aom_jnt_sad32x64_avg ssse3/;
- specialize qw/aom_jnt_sad32x32_avg ssse3/;
- specialize qw/aom_jnt_sad32x16_avg ssse3/;
- specialize qw/aom_jnt_sad16x32_avg ssse3/;
- specialize qw/aom_jnt_sad16x16_avg ssse3/;
- specialize qw/aom_jnt_sad16x8_avg ssse3/;
- specialize qw/aom_jnt_sad8x16_avg ssse3/;
- specialize qw/aom_jnt_sad8x8_avg ssse3/;
- specialize qw/aom_jnt_sad8x4_avg ssse3/;
- specialize qw/aom_jnt_sad4x8_avg ssse3/;
- specialize qw/aom_jnt_sad4x4_avg ssse3/;
-
- specialize qw/aom_jnt_sad4x16_avg ssse3/;
- specialize qw/aom_jnt_sad16x4_avg ssse3/;
- specialize qw/aom_jnt_sad8x32_avg ssse3/;
- specialize qw/aom_jnt_sad32x8_avg ssse3/;
- specialize qw/aom_jnt_sad16x64_avg ssse3/;
- specialize qw/aom_jnt_sad64x16_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad128x128_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad128x64_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad64x128_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad64x64_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad64x32_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad32x64_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad32x32_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad32x16_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad16x32_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad16x16_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad16x8_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad8x16_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad8x8_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad8x4_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad4x8_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad4x4_avg ssse3/;
+
+ specialize qw/aom_dist_wtd_sad4x16_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad16x4_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad8x32_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad32x8_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad16x64_avg ssse3/;
+ specialize qw/aom_dist_wtd_sad64x16_avg ssse3/;
add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
@@ -694,7 +697,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/;
}
- add_proto qw/unsigned int/, "aom_highbd_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
+ add_proto qw/unsigned int/, "aom_highbd_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
}
specialize qw/aom_highbd_sad128x128 avx2/;
specialize qw/aom_highbd_sad128x64 avx2/;
@@ -839,6 +842,30 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_highbd_sad64x16x4d sse2/;
#
+ # Avg
+ #
+ add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p";
+ specialize qw/aom_avg_8x8 sse2/;
+
+ add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
+ specialize qw/aom_avg_4x4 sse2/;
+
+ add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/aom_minmax_8x8 sse2/;
+
+ add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
+  # TODO(kyslov@) bring back SSE2 by extending it to the 128 block size
+ #specialize qw/aom_int_pro_row sse2/;
+
+ add_proto qw/int16_t aom_int_pro_col/, "const uint8_t *ref, const int width";
+  # TODO(kyslov@) bring back SSE2 by extending it to the 128 block size
+ #specialize qw/aom_int_pro_col sse2/;
+
+ add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
+  # TODO(kyslov@) bring back SSE2 by extending it to the 128 block size
+ #specialize qw/aom_vector_var sse2/;
+
+ #
  # hadamard transform and satd for implementing temporal dependency model
#
add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
@@ -919,11 +946,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
int ref_stride, int subpel_search";
specialize qw/aom_comp_avg_upsampled_pred sse2/;
- add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ add_proto qw/void aom_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
- specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
+ specialize qw/aom_dist_wtd_comp_avg_upsampled_pred ssse3/;
add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
@@ -942,11 +969,11 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
- add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
+ add_proto qw/void aom_highbd_dist_wtd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
- specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/;
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search";
+ specialize qw/aom_highbd_dist_wtd_comp_avg_upsampled_pred sse2/;
#
@@ -972,7 +999,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
+ add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
}
specialize qw/aom_variance128x128 sse2 avx2 /;
specialize qw/aom_variance128x64 sse2 avx2 /;
@@ -1044,30 +1071,30 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance64x64 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance64x32 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance32x64 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance32x32 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance32x16 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance16x32 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance16x16 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance16x8 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance8x16 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance8x8 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance8x4 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance4x8 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance4x4 ssse3/;
-
- specialize qw/aom_jnt_sub_pixel_avg_variance4x16 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance16x4 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance8x32 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance32x8 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance16x64 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance64x16 ssse3/;
-
- specialize qw/aom_jnt_sub_pixel_avg_variance128x128 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance128x64 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance64x128 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4 ssse3/;
+
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 ssse3/;
+
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64 ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128 ssse3/;
foreach $bd (8, 10, 12) {
@@ -1099,7 +1126,7 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
}
- add_proto qw/uint32_t/, "aom_highbd_${bd}_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
}
}
@@ -1188,8 +1215,8 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
#
add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
- add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
- specialize qw/aom_jnt_comp_avg_pred ssse3/;
+ add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
+ specialize qw/aom_dist_wtd_comp_avg_pred ssse3/;
add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/aom_highbd_12_variance128x128 sse2/;
@@ -1355,12 +1382,21 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
- add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
- specialize qw/aom_highbd_jnt_comp_avg_pred sse2/;
+ add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
+ specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2/;
#
# Subpixel Variance
#
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2/;
+
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
@@ -1397,6 +1433,15 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2/;
+
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
@@ -1433,6 +1478,15 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance128x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2/;
+
+ add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x128/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2/;
+
add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
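For readers unfamiliar with the RTCD perl files: they are consumed at build time to emit the per-target aom_dsp_rtcd.h headers listed at the top of this change, declaring one prototype per specialization plus a dispatch name. An illustrative sketch (not verbatim generator output) of what the new aom_quantize_b_adaptive entry above yields for a static x86 config; types such as tran_low_t come from the libaom headers:

    void aom_quantize_b_adaptive_c(
        const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
        const int16_t *round_ptr, const int16_t *quant_ptr,
        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
        const int16_t *scan, const int16_t *iscan);
    void aom_quantize_b_adaptive_sse2(
        const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
        const int16_t *round_ptr, const int16_t *quant_ptr,
        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
        const int16_t *scan, const int16_t *iscan);
    /* Static (no runtime CPU detection) configs resolve the name directly: */
    #define aom_quantize_b_adaptive aom_quantize_b_adaptive_sse2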
diff --git a/libaom/aom_dsp/avg.c b/libaom/aom_dsp/avg.c
index 4d78c9c..43d2760 100644
--- a/libaom/aom_dsp/avg.c
+++ b/libaom/aom_dsp/avg.c
@@ -14,6 +14,40 @@
#include "config/aom_dsp_rtcd.h"
#include "aom_ports/mem.h"
+void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ int i, j;
+ *min = 255;
+ *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j] - d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
+
+unsigned int aom_avg_4x4_c(const uint8_t *s, int p) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 4; ++i, s += p)
+ for (j = 0; j < 4; sum += s[j], ++j) {
+ }
+
+ return (sum + 8) >> 4;
+}
+
+unsigned int aom_avg_8x8_c(const uint8_t *s, int p) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 8; ++i, s += p)
+ for (j = 0; j < 8; sum += s[j], ++j) {
+ }
+
+ return (sum + 32) >> 6;
+}
+
// src_diff: first pass, 9 bit, dynamic range [-255, 255]
// second pass, 12 bit, dynamic range [-2040, 2040]
static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
@@ -146,3 +180,48 @@ int aom_satd_c(const tran_low_t *coeff, int length) {
// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
return satd;
}
+
+// Integer projection onto row vectors.
+// height: value range {16, 32, 64, 128}.
+void aom_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
+ const int ref_stride, const int height) {
+ int idx;
+ const int norm_factor = height >> 1;
+ for (idx = 0; idx < 16; ++idx) {
+ int i;
+ hbuf[idx] = 0;
+ // hbuf[idx]: 14 bit, dynamic range [0, 32640].
+ for (i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
+ // hbuf[idx]: 9 bit, dynamic range [0, 1020].
+ hbuf[idx] /= norm_factor;
+ ++ref;
+ }
+}
+
+// width: value range {16, 32, 64, 128}.
+int16_t aom_int_pro_col_c(const uint8_t *ref, const int width) {
+ int idx;
+ int16_t sum = 0;
+ // sum: 14 bit, dynamic range [0, 32640]
+ for (idx = 0; idx < width; ++idx) sum += ref[idx];
+ return sum;
+}
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl) {
+ int i;
+ int width = 4 << bwl;
+ int sse = 0, mean = 0, var;
+
+ for (i = 0; i < width; ++i) {
+ int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits.
+ mean += diff; // mean: dynamic range 16 bits.
+ sse += diff * diff; // sse: dynamic range 26 bits.
+ }
+
+ // (mean * mean): dynamic range 31 bits.
+ var = sse - ((mean * mean) >> (bwl + 2));
+ return var;
+}
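Two arithmetic notes on the additions above, with a small self-checking sketch. In aom_avg_8x8_c, (sum + 32) >> 6 is round-to-nearest division by 64 (32 being half the divisor); in aom_vector_var_c, the (mean * mean) >> (bwl + 2) term divides by the vector width 4 << bwl, so the return value is width times the population variance of the differences:

    #include <assert.h>

    int main(void) {
      /* aom_avg_8x8_c rounding: 64 pixels of value 3 -> sum 192, avg 3. */
      assert(((192 + 32) >> 6) == 3);
      /* aom_vector_var_c with bwl = 2 -> width 16: var = sse - mean^2/16.
       * For 16 diffs all equal to 4: sse = 256, mean = 64, and
       * 256 - (64*64 >> 4) = 0, as expected for a constant vector. */
      assert((256 - ((64 * 64) >> (2 + 2))) == 0);
      return 0;
    }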
diff --git a/libaom/aom_dsp/bitreader_buffer.c b/libaom/aom_dsp/bitreader_buffer.c
index 984b217..d79feea 100644
--- a/libaom/aom_dsp/bitreader_buffer.c
+++ b/libaom/aom_dsp/bitreader_buffer.c
@@ -60,9 +60,9 @@ int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
int leading_zeros = 0;
- while (!aom_rb_read_bit(rb)) ++leading_zeros;
+ while (leading_zeros < 32 && !aom_rb_read_bit(rb)) ++leading_zeros;
// Maximum 32 bits.
- if (leading_zeros >= 32) return UINT32_MAX;
+ if (leading_zeros == 32) return UINT32_MAX;
const uint32_t base = (1u << leading_zeros) - 1;
const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
return base + value;
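The change above caps the leading-zero count so a malformed stream of zero bits can no longer run past the 32-bit limit. For reference, uvlc here has the usual Exp-Golomb shape: count leading zeros lz, read lz literal bits, and return (1 << lz) - 1 + literal. A standalone hedged model operating on a plain bit array instead of aom_read_bit_buffer:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t uvlc_decode(const int *bits, int n, int *pos) {
      int lz = 0;
      while (lz < 32 && *pos < n && bits[(*pos)++] == 0) ++lz;
      if (lz == 32) return UINT32_MAX; /* the capped, patched behavior */
      uint32_t literal = 0;
      for (int i = 0; i < lz && *pos < n; ++i)
        literal = (literal << 1) | (uint32_t)bits[(*pos)++];
      return ((uint32_t)1 << lz) - 1 + literal;
    }

    int main(void) {
      const int bits[] = { 0, 0, 1, 1, 0 }; /* lz = 2, literal = 0b10 = 2 */
      int pos = 0;
      printf("%u\n", uvlc_decode(bits, 5, &pos)); /* prints 5 = (1<<2)-1+2 */
      return 0;
    }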
diff --git a/libaom/aom_dsp/grain_synthesis.c b/libaom/aom_dsp/grain_synthesis.c
index b96e1c3..4b94dbc 100644
--- a/libaom/aom_dsp/grain_synthesis.c
+++ b/libaom/aom_dsp/grain_synthesis.c
@@ -232,7 +232,6 @@ static int scaling_lut_y[256];
static int scaling_lut_cb[256];
static int scaling_lut_cr[256];
-static int grain_center;
static int grain_min;
static int grain_max;
@@ -1077,7 +1076,7 @@ int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
int overlap = params->overlap_flag;
int bit_depth = params->bit_depth;
- grain_center = 128 << (bit_depth - 8);
+ const int grain_center = 128 << (bit_depth - 8);
grain_min = 0 - grain_center;
grain_max = (256 << (bit_depth - 8)) - 1 - grain_center;
diff --git a/libaom/aom_dsp/grain_synthesis.h b/libaom/aom_dsp/grain_synthesis.h
index 7aee6f6..9155b39 100644
--- a/libaom/aom_dsp/grain_synthesis.h
+++ b/libaom/aom_dsp/grain_synthesis.h
@@ -20,6 +20,8 @@
extern "C" {
#endif
+#include <string.h>
+
#include "aom_dsp/aom_dsp_common.h"
#include "aom/aom_image.h"
@@ -28,6 +30,9 @@ extern "C" {
* This structure contains input parameters for film grain synthesis
*/
typedef struct {
+ // This structure is compared element-by-element in the function
+ // av1_check_grain_params_equiv: this function must be updated if any changes
+ // are made to this structure.
int apply_grain;
int update_parameters;
@@ -79,8 +84,73 @@ typedef struct {
int grain_scale_shift;
uint16_t random_seed;
+ // This structure is compared element-by-element in the function
+ // av1_check_grain_params_equiv: this function must be updated if any changes
+ // are made to this structure.
} aom_film_grain_t;
+/*!\brief Check if two film grain parameters structs are equivalent
+ *
+ * Check if two film grain parameters are equal, except for the
+ * update_parameters and random_seed elements which are ignored.
+ *
+ * \param[in] pa The first set of parameters to compare
+ * \param[in] pb The second set of parameters to compare
+ * \return Returns 1 if the params are equivalent, 0 otherwise
+ */
+static INLINE int av1_check_grain_params_equiv(
+ const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) {
+ if (pa->apply_grain != pb->apply_grain) return 0;
+ // Don't compare update_parameters
+
+ if (pa->num_y_points != pb->num_y_points) return 0;
+ if (memcmp(pa->scaling_points_y, pb->scaling_points_y,
+ pa->num_y_points * 2 * sizeof(*pa->scaling_points_y)) != 0)
+ return 0;
+
+ if (pa->num_cb_points != pb->num_cb_points) return 0;
+ if (memcmp(pa->scaling_points_cb, pb->scaling_points_cb,
+ pa->num_cb_points * 2 * sizeof(*pa->scaling_points_cb)) != 0)
+ return 0;
+
+ if (pa->num_cr_points != pb->num_cr_points) return 0;
+ if (memcmp(pa->scaling_points_cr, pb->scaling_points_cr,
+ pa->num_cr_points * 2 * sizeof(*pa->scaling_points_cr)) != 0)
+ return 0;
+
+ if (pa->scaling_shift != pb->scaling_shift) return 0;
+ if (pa->ar_coeff_lag != pb->ar_coeff_lag) return 0;
+
+ const int num_pos = 2 * pa->ar_coeff_lag * (pa->ar_coeff_lag + 1);
+ if (memcmp(pa->ar_coeffs_y, pb->ar_coeffs_y,
+ num_pos * sizeof(*pa->ar_coeffs_y)) != 0)
+ return 0;
+ if (memcmp(pa->ar_coeffs_cb, pb->ar_coeffs_cb,
+ num_pos * sizeof(*pa->ar_coeffs_cb)) != 0)
+ return 0;
+ if (memcmp(pa->ar_coeffs_cr, pb->ar_coeffs_cr,
+ num_pos * sizeof(*pa->ar_coeffs_cr)) != 0)
+ return 0;
+
+ if (pa->ar_coeff_shift != pb->ar_coeff_shift) return 0;
+
+ if (pa->cb_mult != pb->cb_mult) return 0;
+ if (pa->cb_luma_mult != pb->cb_luma_mult) return 0;
+ if (pa->cb_offset != pb->cb_offset) return 0;
+
+ if (pa->cr_mult != pb->cr_mult) return 0;
+ if (pa->cr_luma_mult != pb->cr_luma_mult) return 0;
+ if (pa->cr_offset != pb->cr_offset) return 0;
+
+ if (pa->overlap_flag != pb->overlap_flag) return 0;
+ if (pa->clip_to_restricted_range != pb->clip_to_restricted_range) return 0;
+ if (pa->bit_depth != pb->bit_depth) return 0;
+ if (pa->chroma_scaling_from_luma != pb->chroma_scaling_from_luma) return 0;
+ if (pa->grain_scale_shift != pb->grain_scale_shift) return 0;
+
+ return 1;
+}
+
/*!\brief Add film grain
*
* Add film grain to an image
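A short hedged usage sketch for the new av1_check_grain_params_equiv() above; grain_params_changed and its arguments are illustrative names rather than libaom API, useful e.g. when deciding whether refreshed grain parameters actually need re-signalling:

    /* Nonzero when the parameter sets differ in any field other than
     * update_parameters and random_seed, which the check ignores. */
    static int grain_params_changed(const aom_film_grain_t *prev,
                                    const aom_film_grain_t *cur) {
      return !av1_check_grain_params_equiv(prev, cur);
    }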
diff --git a/libaom/aom_dsp/noise_model.h b/libaom/aom_dsp/noise_model.h
index 049d5be..5e7de9b 100644
--- a/libaom/aom_dsp/noise_model.h
+++ b/libaom/aom_dsp/noise_model.h
@@ -158,10 +158,10 @@ int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
int stride, uint8_t *flat_blocks);
// The noise shape indicates the allowed coefficients in the AR model.
-typedef enum {
+enum {
AOM_NOISE_SHAPE_DIAMOND = 0,
AOM_NOISE_SHAPE_SQUARE = 1
-} aom_noise_shape;
+} UENUM1BYTE(aom_noise_shape);
// The parameters of the noise model include the shape type, lag, the
// bit depth of the input images provided, and whether the input images
@@ -202,13 +202,13 @@ typedef struct {
} aom_noise_model_t;
/*!\brief Result of a noise model update. */
-typedef enum {
+enum {
AOM_NOISE_STATUS_OK = 0,
AOM_NOISE_STATUS_INVALID_ARGUMENT,
AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE,
AOM_NOISE_STATUS_INTERNAL_ERROR,
-} aom_noise_status_t;
+} UENUM1BYTE(aom_noise_status_t);
/*!\brief Initializes a noise model with the given parameters.
*
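The typedef-to-UENUM1BYTE conversions above rely on a macro defined elsewhere in the tree; the sketch below shows its assumed shape, under which the named type becomes a one-byte unsigned integer while the enum still declares the constants. This gives the values a fixed width instead of a compiler-chosen enum size:

    #include <stdint.h>

    /* Assumed shape of libaom's UENUM1BYTE; the real definition is not in
     * this diff. */
    #define UENUM1BYTE(enumvar) \
      ;                         \
      typedef uint8_t enumvar

    enum { DEMO_A = 0, DEMO_B = 1 } UENUM1BYTE(demo_t);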
diff --git a/libaom/aom_dsp/prob.h b/libaom/aom_dsp/prob.h
index d003a98..20ffdea 100644
--- a/libaom/aom_dsp/prob.h
+++ b/libaom/aom_dsp/prob.h
@@ -641,7 +641,7 @@ static INLINE uint8_t get_prob(unsigned int num, unsigned int den) {
}
}
-static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
+static INLINE void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
int rate;
int i, tmp;
diff --git a/libaom/aom_dsp/quantize.c b/libaom/aom_dsp/quantize.c
index 62dbd86..ced34b4 100644
--- a/libaom/aom_dsp/quantize.c
+++ b/libaom/aom_dsp/quantize.c
@@ -11,6 +11,98 @@
#include "aom_dsp/quantize.h"
#include "aom_mem/aom_mem.h"
+#include "av1/encoder/av1_quantize.h"
+
+void quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ int prescan_add[2];
+ for (i = 0; i < 2; ++i)
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int prescan_add_val = prescan_add[rc != 0];
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val))
+ non_zero_count--;
+ else
+ break;
+ }
+
+  // Quantization pass: all coefficients with index >= non_zero_count are
+  // skippable. Note: non_zero_count can be zero.
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32;
+
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+ int64_t tmp =
+ clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+ tmp *= wt;
+ tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS)); // quantization
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+
+ if (tmp32) {
+ eob = i;
+#if SKIP_EOB_FACTOR_ADJUST
+ if (first == -1) first = i;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ }
+ }
+ }
+#if SKIP_EOB_FACTOR_ADJUST
+ if (eob >= 0 && first == eob) {
+ const int rc = scan[eob];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ eob = -1;
+ }
+ }
+ }
+#endif // SKIP_EOB_FACTOR_ADJUST
+ *eob_ptr = eob + 1;
+}
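On the log_scale parameter threaded through the helper above (0, 1, and 2 in the aom_quantize_b_adaptive_c / _32x32 / _64x64 wrappers later in this file): it rescales zbin and round via ROUND_POWER_OF_TWO. A restatement of that macro's arithmetic, assuming the definition in aom_dsp_common.h:

    /* Round-to-nearest division by 2^n; ((1 << n) >> 1) is the half step
     * and evaluates to 0 when n == 0, leaving the value unchanged. */
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (((1 << (n)) >> 1))) >> (n))

    /* e.g. a zbin of 21 at log_scale 1 (32x32) becomes (21 + 1) >> 1 = 11. */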
void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -74,6 +166,94 @@ void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
*eob_ptr = eob + 1;
}
+void highbd_quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ int i, eob = -1;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int dequant;
+ int idx_arr[4096];
+ (void)iscan;
+ int idx = 0;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ int prescan_add[2];
+ for (i = 0; i < 2; ++i)
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ const int prescan_add_val = prescan_add[rc != 0];
+    if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) ||
+        coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val))
+ idx_arr[idx++] = i;
+ }
+
+  // Quantization pass: only process the coefficients selected in the
+  // pre-scan pass. Note: idx can be zero.
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ const int64_t tmpw = tmp1 * wt;
+ const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) {
+ eob = idx_arr[i];
+#if SKIP_EOB_FACTOR_ADJUST
+ if (first == -1) first = eob;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ }
+ }
+#if SKIP_EOB_FACTOR_ADJUST
+ if (eob >= 0 && first == eob) {
+ const int rc = scan[eob];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ eob = -1;
+ }
+ }
+ }
+#endif // SKIP_EOB_FACTOR_ADJUST
+ *eob_ptr = eob + 1;
+}
+
void highbd_quantize_b_helper_c(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
const int16_t *round_ptr, const int16_t *quant_ptr,
@@ -133,6 +313,80 @@ void highbd_quantize_b_helper_c(
/* These functions should only be called when quantisation matrices
are not used. */
+void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 0);
+}
+
+void aom_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 1);
+}
+
+void aom_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 2);
+}
+
+void aom_highbd_quantize_b_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, NULL, NULL, 0);
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, NULL, NULL, 1);
+}
+
+void aom_highbd_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, NULL, NULL, 2);
+}
+
void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
diff --git a/libaom/aom_dsp/quantize.h b/libaom/aom_dsp/quantize.h
index c55ab23..43c30ee 100644
--- a/libaom/aom_dsp/quantize.h
+++ b/libaom/aom_dsp/quantize.h
@@ -20,6 +20,66 @@
extern "C" {
#endif
+void quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void highbd_quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_highbd_quantize_b_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_highbd_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_highbd_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
diff --git a/libaom/aom_dsp/sad.c b/libaom/aom_dsp/sad.c
index 252e0e1..9169e78 100644
--- a/libaom/aom_dsp/sad.c
+++ b/libaom/aom_dsp/sad.c
@@ -54,12 +54,12 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
return sad(src, src_stride, comp_pred, m, m, n); \
} \
- unsigned int aom_jnt_sad##m##x##n##_avg_c( \
+ unsigned int aom_dist_wtd_sad##m##x##n##_avg_c( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
uint8_t comp_pred[m * n]; \
- aom_jnt_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride, \
- jcp_param); \
+ aom_dist_wtd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, \
+ ref_stride, jcp_param); \
return sad(src, src_stride, comp_pred, m, m, n); \
}
@@ -208,12 +208,13 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
ref, ref_stride); \
return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
} \
- unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c( \
+ unsigned int aom_highbd_dist_wtd_sad##m##x##n##_avg_c( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
uint16_t comp_pred[m * n]; \
- aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, \
- m, n, ref, ref_stride, jcp_param); \
+ aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), \
+ second_pred, m, n, ref, ref_stride, \
+ jcp_param); \
return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
}
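The jnt_* to dist_wtd_* renames in this file and below cover distance-weighted compound prediction, where the compound predictor is a weighted rather than equal average of its two sources. A hedged per-pixel model of what aom_dist_wtd_comp_avg_pred_c computes; the weight pair comes from DIST_WTD_COMP_PARAMS, and the values here assume (from the libaom tree) that fwd_offset + bck_offset == 1 << DIST_PRECISION_BITS with DIST_PRECISION_BITS == 4:

    #include <stdint.h>

    static uint8_t dist_wtd_avg(uint8_t ref, uint8_t pred, int fwd, int bck) {
      const int tmp = ref * fwd + pred * bck;
      return (uint8_t)((tmp + (1 << 3)) >> 4); /* round-to-nearest / 16 */
    }

    /* Example: ref = 100, pred = 60 with weights 9/7 ->
     * (900 + 420 + 8) >> 4 = 83, versus 80 for the equal-weight average. */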
diff --git a/libaom/aom_dsp/variance.c b/libaom/aom_dsp/variance.c
index 0f4990e..18a33c5 100644
--- a/libaom/aom_dsp/variance.c
+++ b/libaom/aom_dsp/variance.c
@@ -164,40 +164,40 @@ void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
}
-#define SUBPIX_AVG_VAR(W, H) \
- uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *a, int a_stride, int xoffset, int yoffset, \
- const uint8_t *b, int b_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
- \
- aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
- bilinear_filters_2t[xoffset]); \
- aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
- bilinear_filters_2t[yoffset]); \
- \
- aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
- \
- return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
- } \
- uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *a, int a_stride, int xoffset, int yoffset, \
- const uint8_t *b, int b_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
- \
- aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
- bilinear_filters_2t[xoffset]); \
- aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
- bilinear_filters_2t[yoffset]); \
- \
- aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
- \
- return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
+#define SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+ \
+ return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+ } \
+ uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
+ \
+ return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
}
/* Identical to the variance call except it takes an additional parameter, sum,
@@ -291,7 +291,7 @@ void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
const int ref_num = 0;
const int is_intrabc = is_intrabc_block(mi);
const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
const int is_scaled = av1_is_scaled(sf);
if (is_scaled) {
@@ -424,9 +424,10 @@ void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
}
}
-void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, const uint8_t *ref, int ref_stride,
- const JNT_COMP_PARAMS *jcp_param) {
+void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
@@ -443,11 +444,11 @@ void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
}
}
-void aom_jnt_comp_avg_upsampled_pred_c(
+void aom_dist_wtd_comp_avg_upsampled_pred_c(
MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
@@ -688,125 +689,128 @@ void aom_highbd_var_filter_block2d_bil_second_pass(
dst, dst_stride, sse); \
}
-#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
- uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
- \
- return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
- \
- return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
- \
- return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+ W, H, CONVERT_TO_BYTEPTR(temp2), W, \
+ jcp_param); \
+ \
+ return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+ W, H, CONVERT_TO_BYTEPTR(temp2), W, \
+ jcp_param); \
+ \
+ return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+ W, H, CONVERT_TO_BYTEPTR(temp2), W, \
+ jcp_param); \
+ \
+ return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
}
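
For reference, each HIGHBD_SUBPIX_AVG_VAR instantiation runs the same three-stage pipeline: a horizontal then a vertical 2-tap bilinear pass, an averaging step against second_pred, and a full-pel variance. A minimal scalar sketch of one bilinear pass follows, assuming the usual convention that the two taps sum to 1 << FILTER_BITS with FILTER_BITS = 7; the helper name is illustrative, not a libaom symbol.

    #include <stdint.h>

    #define FILTER_BITS 7 /* assumed tap scale; each bilinear pair sums to 128 */

    /* Each output is a rounded 2-tap blend of a pixel and its neighbor
       pixel_step entries away: step 1 in the horizontal pass, step W in
       the vertical pass, matching the macro arguments above. */
    static void bilinear_pass_ref(const uint16_t *src, uint16_t *out,
                                  int src_stride, int pixel_step, int rows,
                                  int cols, const uint8_t *filter) {
      for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
          out[c] = (uint16_t)((src[c] * filter[0] +
                               src[c + pixel_step] * filter[1] +
                               (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
        }
        src += src_stride;
        out += cols;
      }
    }
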
/* All three forms of the variance are available in the same sizes. */
@@ -880,7 +884,7 @@ void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
const int ref_num = 0;
const int is_intrabc = is_intrabc_block(mi);
const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
const int is_scaled = av1_is_scaled(sf);
if (is_scaled) {
@@ -1018,10 +1022,10 @@ void aom_highbd_comp_avg_upsampled_pred_c(
}
}
-void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
- int width, int height, const uint8_t *ref8,
- int ref_stride,
- const JNT_COMP_PARAMS *jcp_param) {
+void aom_highbd_dist_wtd_comp_avg_pred_c(
+ uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+ const uint8_t *ref8, int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
const int bck_offset = jcp_param->bck_offset;
@@ -1041,11 +1045,11 @@ void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
}
}
-void aom_highbd_jnt_comp_avg_upsampled_pred_c(
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
int subpel_search) {
int i, j;
const int fwd_offset = jcp_param->fwd_offset;
diff --git a/libaom/aom_dsp/variance.h b/libaom/aom_dsp/variance.h
index 362da29..4550c17 100644
--- a/libaom/aom_dsp/variance.h
+++ b/libaom/aom_dsp/variance.h
@@ -50,15 +50,14 @@ typedef unsigned int (*aom_subp_avg_variance_fn_t)(
const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
int b_stride, unsigned int *sse, const uint8_t *second_pred);
-typedef unsigned int (*aom_jnt_sad_avg_fn_t)(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- const uint8_t *second_pred,
- const JNT_COMP_PARAMS *jcp_param);
+typedef unsigned int (*aom_dist_wtd_sad_avg_fn_t)(
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
-typedef unsigned int (*aom_jnt_subp_avg_variance_fn_t)(
+typedef unsigned int (*aom_dist_wtd_subp_avg_variance_fn_t)(
const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
int b_stride, unsigned int *sse, const uint8_t *second_pred,
- const JNT_COMP_PARAMS *jcp_param);
+ const DIST_WTD_COMP_PARAMS *jcp_param);
typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
@@ -101,8 +100,8 @@ typedef struct aom_variance_vtable {
aom_obmc_sad_fn_t osdf;
aom_obmc_variance_fn_t ovf;
aom_obmc_subpixvariance_fn_t osvf;
- aom_jnt_sad_avg_fn_t jsdaf;
- aom_jnt_subp_avg_variance_fn_t jsvaf;
+ aom_dist_wtd_sad_avg_fn_t jsdaf;
+ aom_dist_wtd_subp_avg_variance_fn_t jsvaf;
} aom_variance_fn_ptr_t;
void aom_highbd_var_filter_block2d_bil_first_pass(
diff --git a/libaom/aom_dsp/x86/adaptive_quantize_sse2.c b/libaom/aom_dsp/x86/adaptive_quantize_sse2.c
new file mode 100644
index 0000000..3822c27
--- /dev/null
+++ b/libaom/aom_dsp/x86/adaptive_quantize_sse2.c
@@ -0,0 +1,421 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+ int non_zero_count = (int)n_coeffs;
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob = zero, eob0, prescan0, prescan1, all_zero;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 0),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 0) };
+
+ int prescan_add[2];
+ for (int i = 0; i < 2; ++i)
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+  // The buffer is of size 256 because this function is called with
+  // n_coeffs of at most 256.
+ int16_t prescan[256];
+ memset(prescan, -1, n_coeffs * sizeof(int16_t));
+
+  // TODO(Aniket): Experiment with rewriting the following loop using intrinsics.
+ for (int i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = 1 << AOM_QM_BITS;
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int prescan_add_val = prescan_add[rc != 0];
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ prescan[rc] = 0;
+ non_zero_count--;
+ } else {
+ break;
+ }
+ }
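+  // With illustrative values dequant_ptr[1] = 32 and EOB_FACTOR = 325,
+  // prescan_add[1] = ROUND_POWER_OF_TWO(32 * 325, 7) = (10400 + 64) >> 7 = 81,
+  // so a trailing AC coefficient survives the prescan only if abs_coeff
+  // reaches zbins[1] * (1 << AOM_QM_BITS) + 81.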
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+  // Set up global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ prescan0 = _mm_loadu_si128((const __m128i *)prescan);
+ prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8));
+
+ cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    // Mask out coefficients that fall below the zbin threshold
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr);
+ store_coefficients(coeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+  // TODO(Aniket): Reduce the coefficient quantization work based on the
+  // eob logic.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index));
+ prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8));
+
+ cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+ cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + index);
+ store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
+ eob = _mm_max_epi16(eob, eob0);
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment with rewriting the following loop using
+  // intrinsics, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_quantize_b_32x32_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ int non_zero_count = (int)n_coeffs;
+ const int log_scale = 1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob = zero, eob0, prescan0, prescan1, all_zero;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+
+ int prescan_add[2];
+ for (int i = 0; i < 2; ++i)
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+  // The buffer is of size 1024 because this function is called with
+  // n_coeffs of at most 1024.
+ int16_t prescan[1024];
+ memset(prescan, -1, n_coeffs * sizeof(int16_t));
+
+  // TODO(Aniket): Experiment with rewriting the following loop using intrinsics.
+ for (int i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = 1 << AOM_QM_BITS;
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int prescan_add_val = prescan_add[rc != 0];
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ prescan[rc] = 0;
+ non_zero_count--;
+ } else {
+ break;
+ }
+ }
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+  // Set up global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, log_scale_vec);
+ round = _mm_add_epi16(round, log_scale_vec);
+ zbin = _mm_srli_epi16(zbin, log_scale);
+ round = _mm_srli_epi16(round, log_scale);
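+  // Subtracting one below lets the strict _mm_cmpgt_epi16 comparisons act
+  // as a ">= zbin" test.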
+ zbin = _mm_sub_epi16(zbin, one);
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ prescan0 = _mm_loadu_si128((const __m128i *)prescan);
+ prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8));
+
+ cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    // Mask out coefficients that fall below the zbin threshold
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+ &log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8, &log_scale);
+
+ eob =
+ scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+  // TODO(Aniket): Reduce the coefficient quantization work based on the
+  // eob logic.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index));
+ prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8));
+
+ cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+ cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index, &log_scale);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + index + 8, &log_scale);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
+ eob = _mm_max_epi16(eob, eob0);
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment with rewriting the following loop using
+  // intrinsics, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
diff --git a/libaom/aom_dsp/x86/avg_intrin_sse2.c b/libaom/aom_dsp/x86/avg_intrin_sse2.c
index 969e4e1..0c20261 100644
--- a/libaom/aom_dsp/x86/avg_intrin_sse2.c
+++ b/libaom/aom_dsp/x86/avg_intrin_sse2.c
@@ -16,6 +16,129 @@
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
#include "aom_ports/mem.h"
+void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+ u0 = _mm_setzero_si128();
+ // Row 0
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff0 = _mm_max_epi16(diff, negdiff);
+ // Row 1
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+ minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+ // Row 2
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 3
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 4
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 5
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 6
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 7
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+ *max = _mm_extract_epi16(maxabsdiff, 0);
+
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+ *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
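
For comparison, a scalar reference of the block above: it reports the minimum and maximum absolute difference between two 8x8 blocks. The function name is illustrative, not a libaom symbol.

    #include <stdint.h>
    #include <stdlib.h>

    static void minmax_8x8_ref(const uint8_t *s, int p, const uint8_t *d,
                               int dp, int *min, int *max) {
      *min = 255;
      *max = 0;
      for (int i = 0; i < 8; ++i) {
        for (int j = 0; j < 8; ++j) {
          const int diff = abs(s[i * p + j] - d[i * dp + j]);
          if (diff < *min) *min = diff;
          if (diff > *max) *max = diff;
        }
      }
    }
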
+unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+ avg = _mm_extract_epi16(s0, 0);
+ return (avg + 32) >> 6;
+}
+
+unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+ s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ s0 = _mm_adds_epu16(s0, s1);
+
+ s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+ s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+ avg = _mm_extract_epi16(s0, 0);
+ return (avg + 8) >> 4;
+}
+
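
Both averaging kernels end with a rounded divide: (avg + 32) >> 6 is round(sum / 64) for the 8x8 case and (avg + 8) >> 4 is round(sum / 16) for 4x4. A scalar sketch of the 4x4 case, with an illustrative name:

    #include <stdint.h>

    static unsigned int avg_4x4_ref(const uint8_t *s, int p) {
      unsigned int sum = 0;
      for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j) sum += s[i * p + j];
      // e.g. sum = 100: (100 + 8) >> 4 = 6, the nearest integer to 6.25
      return (sum + 8) >> 4;
    }
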
static void hadamard_col8_sse2(__m128i *in, int iter) {
__m128i a0 = in[0];
__m128i a1 = in[1];
diff --git a/libaom/aom_dsp/x86/convolve_avx2.h b/libaom/aom_dsp/x86/convolve_avx2.h
index 3cc0e23..4a1068e 100644
--- a/libaom/aom_dsp/x86/convolve_avx2.h
+++ b/libaom/aom_dsp/x86/convolve_avx2.h
@@ -34,31 +34,214 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
};
-DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = {
- 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255,
- 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
+#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \
+ for (i = 0; i < (im_h - 2); i += 2) { \
+ __m256i data = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ data = _mm256_inserti128_si256( \
+ data, \
+ _mm_loadu_si128( \
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
+ 1); \
+ \
+ __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \
+ res = \
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ \
+ __m256i data_1 = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ \
+ __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \
+ \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+#define CONVOLVE_SR_VERTICAL_FILTER_8TAP \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ \
+ __m256i s[8]; \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5); \
+ \
+ s[4] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[5] = _mm256_unpackhi_epi16(src_2, src_3); \
+ s[6] = _mm256_unpackhi_epi16(src_4, src_5); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
+ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
+ \
+ s[3] = _mm256_unpacklo_epi16(s6, s7); \
+ s[7] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ __m256i res_a = convolve(s, coeffs_v); \
+ __m256i res_b = convolve(s + 4, coeffs_v); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ \
+ s[4] = s[5]; \
+ s[5] = s[6]; \
+ s[6] = s[7]; \
+ }
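
The vertical macro above keeps an 8-row window in s[0..7] and rotates it down by two rows per iteration. Stripped of the SIMD bookkeeping and the rounding/shift stages, the per-pixel work is an 8-tap dot product down each column; a hypothetical scalar shape:

    #include <stdint.h>

    static void vert_8tap_ref(const int16_t *im, int im_stride, int32_t *out,
                              int out_stride, int w, int h,
                              const int16_t *taps) {
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          int32_t sum = 0;
          for (int k = 0; k < 8; ++k)
            sum += taps[k] * im[(y + k) * im_stride + x];
          out[y * out_stride + x] = sum; /* rounding and down-shift omitted */
        }
      }
    }
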
-DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
+#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \
+ for (i = 0; i < im_h; i += 2) { \
+ __m256i data = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \
+ if (i + 1 < im_h) \
+ data = _mm256_inserti128_si256( \
+ data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
+ src_h += (src_stride << 1); \
+ __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \
+ \
+ res = \
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ }
+#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \
+ __m256i s[8]; \
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ \
+ s[0] = _mm256_unpacklo_epi16(s0, s1); \
+ s[1] = _mm256_unpacklo_epi16(s2, s3); \
+ s[2] = _mm256_unpacklo_epi16(s4, s5); \
+ \
+ s[4] = _mm256_unpackhi_epi16(s0, s1); \
+ s[5] = _mm256_unpackhi_epi16(s2, s3); \
+ s[6] = _mm256_unpackhi_epi16(s4, s5); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
+ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
+ \
+ s[3] = _mm256_unpacklo_epi16(s6, s7); \
+ s[7] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ const __m256i res_a = convolve(s, coeffs_y); \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ \
+ if (w - j > 4) { \
+ const __m256i res_b = convolve(s + 4, coeffs_y); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+ \
+ if (do_average) { \
+ const __m256i data_ref_0 = load_line2_avx2( \
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \
+ const __m256i comp_avg_res = \
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \
+ \
+ const __m256i round_result = convolve_rounding( \
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \
+ \
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \
+ \
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \
+ _mm_storel_epi64( \
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
+ } else { \
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \
+ \
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \
+ res_1); \
+ } \
+ } else { \
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+ \
+ if (do_average) { \
+ const __m256i data_ref_0 = load_line2_avx2( \
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]); \
+ \
+ const __m256i comp_avg_res = \
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); \
+ \
+ const __m256i round_result = convolve_rounding( \
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \
+ \
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \
+ \
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \
+ _mm_cvtsi128_si32(res_1); \
+ \
+ } else { \
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \
+ \
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \
+ res_1); \
+ } \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ \
+ s[4] = s[5]; \
+ s[5] = s[6]; \
+ s[6] = s[7]; \
+ }
static INLINE void prepare_coeffs_lowbd(
const InterpFilterParams *const filter_params, const int subpel_q4,
__m256i *const coeffs /* [4] */) {
@@ -120,6 +303,17 @@ static INLINE __m256i convolve_lowbd(const __m256i *const s,
return res;
}
+static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+ const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+
+ // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ const __m256i res = _mm256_add_epi16(res_45, res_23);
+
+ return res;
+}
+
static INLINE __m256i convolve(const __m256i *const s,
const __m256i *const coeffs) {
const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
@@ -155,6 +349,17 @@ static INLINE __m256i convolve_lowbd_x(const __m256i data,
return convolve_lowbd(s, coeffs);
}
+static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data,
+ const __m256i *const coeffs,
+ const __m256i *const filt) {
+ __m256i s[2];
+
+ s[0] = _mm256_shuffle_epi8(data, filt[0]);
+ s[1] = _mm256_shuffle_epi8(data, filt[1]);
+
+ return convolve_lowbd_4tap(s, coeffs);
+}
+
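
As the res_23/res_45 names suggest, the 4-tap path applies taps 2..5 of the 8-tap coefficient array, with the filt shuffle masks gathering the corresponding source pixels. Per output pixel this reduces to roughly the following scalar sketch (rounding stages omitted; exact source offsets depend on the shuffle masks):

    #include <stdint.h>

    static int conv4_ref(const uint8_t *s, const int8_t *f) {
      /* f[2]..f[5]: the middle taps of the 8-tap filter */
      return s[2] * f[2] + s[3] * f[3] + s[4] * f[4] + s[5] * f[5];
    }
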
static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
const __m256i *const res,
const int do_average) {
@@ -172,9 +377,9 @@ static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
const __m256i *const res_unsigned,
const __m256i *const wt,
- const int use_jnt_comp_avg) {
+ const int use_dist_wtd_comp_avg) {
__m256i res;
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_comp_avg) {
const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
@@ -206,9 +411,9 @@ static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
const __m256i *const res_unsigned,
const __m256i *const wt0,
const __m256i *const wt1,
- const int use_jnt_comp_avg) {
+ const int use_dist_wtd_comp_avg) {
__m256i res;
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_comp_avg) {
const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
diff --git a/libaom/aom_dsp/x86/convolve_sse2.h b/libaom/aom_dsp/x86/convolve_sse2.h
index 445d04b..385c7c7 100644
--- a/libaom/aom_dsp/x86/convolve_sse2.h
+++ b/libaom/aom_dsp/x86/convolve_sse2.h
@@ -78,9 +78,9 @@ static INLINE __m128i convolve_hi_y(const __m128i *const s,
static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
const __m128i *const res_unsigned,
const __m128i *const wt,
- const int use_jnt_comp_avg) {
+ const int use_dist_wtd_avg) {
__m128i res;
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_avg) {
const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
diff --git a/libaom/aom_dsp/x86/convolve_sse4_1.h b/libaom/aom_dsp/x86/convolve_sse4_1.h
index 6b8388d..b1a3bb4 100644
--- a/libaom/aom_dsp/x86/convolve_sse4_1.h
+++ b/libaom/aom_dsp/x86/convolve_sse4_1.h
@@ -35,9 +35,9 @@ static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
const __m128i *const res_unsigned,
const __m128i *const wt0,
const __m128i *const wt1,
- const int use_jnt_comp_avg) {
+ const int use_dist_wtd_avg) {
__m128i res;
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_avg) {
const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0);
const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1);
diff --git a/libaom/aom_dsp/x86/fft_avx2.c b/libaom/aom_dsp/x86/fft_avx2.c
index 54da022..4cccc5f 100644
--- a/libaom/aom_dsp/x86/fft_avx2.c
+++ b/libaom/aom_dsp/x86/fft_avx2.c
@@ -11,6 +11,7 @@
#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/fft_common.h"
diff --git a/libaom/aom_dsp/x86/fft_sse2.c b/libaom/aom_dsp/x86/fft_sse2.c
index 12bdc3e..6f20a3c 100644
--- a/libaom/aom_dsp/x86/fft_sse2.c
+++ b/libaom/aom_dsp/x86/fft_sse2.c
@@ -11,6 +11,7 @@ s * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
#include <xmmintrin.h>
+#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/fft_common.h"
diff --git a/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c b/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c
index 097e077..70b91c6 100644
--- a/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ b/libaom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -727,8 +727,8 @@ void aom_highbd_lpf_horizontal_14_dual_sse2(
_limit1, _thresh1, bd);
for (i = 0; i < 6; i++) {
- _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
- _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
+ _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
+ _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]);
}
}
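
The change from _mm_store_si128 to _mm_storeu_si128 here drops the 16-byte alignment requirement: s offset by a pitch multiple is not guaranteed to be aligned, and an aligned store to an unaligned address faults.
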
diff --git a/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index 58e5f98..2f4ffd3 100644
--- a/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/libaom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -146,3 +146,61 @@ void aom_highbd_quantize_b_32x32_sse2(
}
*eob_ptr = eob + 1;
}
+
+void aom_highbd_quantize_b_64x64_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2);
+ (void)scan;
+ zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
+
+  // Quantization pass: only process the coefficients selected in the
+  // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+ }
+ *eob_ptr = eob + 1;
+}
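
The 64x64 path scales for log_scale = 2: zbin and round are pre-shifted with ROUND_POWER_OF_TWO(x, 2) and the dequantized value is divided by 4. A worked pass through the quantization arithmetic with illustrative values round = 8, quant = 21845 (about 1/3 in Q16), quant_shift = 16384 (1.0 in Q14) and abs_coeff = 100: tmp1 = 108, tmp2 = ((108 * 21845) >> 16) + 108 = 35 + 108 = 143, and abs_qcoeff = (143 * 16384) >> 14 = 143.
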
diff --git a/libaom/aom_dsp/x86/highbd_variance_sse2.c b/libaom/aom_dsp/x86/highbd_variance_sse2.c
index 226576b..fc5678d 100644
--- a/libaom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/libaom/aom_dsp/x86/highbd_variance_sse2.c
@@ -287,30 +287,38 @@ DECLS(sse2);
uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
- uint32_t sse; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
- NULL); \
- if (w > wf) { \
- unsigned int sse2; \
+ int se = 0; \
+ unsigned int sse = 0; \
+ unsigned int sse2; \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src += wd_64 * 64; \
+ dst += wd_64 * 64; \
int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
- &sse2, NULL, NULL); \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \
+ NULL, NULL); \
se += se2; \
sse += sse2; \
- if (w > wf * 2) { \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
+ if (w > wf) { \
se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
&sse2, NULL, NULL); \
se += se2; \
sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
} \
} \
*sse_ptr = sse; \
@@ -322,33 +330,42 @@ DECLS(sse2);
const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
int64_t var; \
uint32_t sse; \
+ uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
- NULL); \
- if (w > wf) { \
- uint32_t sse2; \
+ int se = 0; \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src += wd_64 * 64; \
+ dst += wd_64 * 64; \
int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
- &sse2, NULL, NULL); \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+ NULL); \
se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
+ long_sse += sse; \
+ if (w > wf) { \
+ uint32_t sse2; \
se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
+ src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
&sse2, NULL, NULL); \
se += se2; \
- sse += sse2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
+ h, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
} \
} \
se = ROUND_POWER_OF_TWO(se, 2); \
- sse = ROUND_POWER_OF_TWO(sse, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \
*sse_ptr = sse; \
var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
return (var >= 0) ? (uint32_t)var : 0; \
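
The switch to a 64-bit accumulator is what makes the new 128-wide sizes below safe: at 10 bits the per-pixel squared error can reach 1023^2 = 1,046,529, so a 128x128 block can accumulate up to 1,046,529 * 16,384, roughly 1.71e10, well past UINT32_MAX (about 4.29e9). The 8-bit variant keeps a 32-bit sum because 255^2 * 16,384 is only about 1.07e9.
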
@@ -364,35 +381,38 @@ DECLS(sse2);
uint64_t long_sse = 0; \
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int row_rep = (w > 64) ? 2 : 1; \
for (start_row = 0; start_row < h; start_row += 16) { \
uint32_t sse2; \
int height = h - start_row < 16 ? h - start_row : 16; \
- int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, x_offset, y_offset, \
- dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \
- NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
- &sse2, NULL, NULL); \
+ uint16_t *src_tmp = src + (start_row * src_stride); \
+ uint16_t *dst_tmp = dst + (start_row * dst_stride); \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src_tmp += wd_64 * 64; \
+ dst_tmp += wd_64 * 64; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \
+ height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
- if (w > wf * 2) { \
+ if (w > wf) { \
se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
- height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
- height, &sse2, NULL, NULL); \
+ src_tmp + 16, src_stride, x_offset, y_offset, dst_tmp + 16, \
+ dst_stride, height, &sse2, NULL, NULL); \
se += se2; \
long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + 32, src_stride, x_offset, y_offset, dst_tmp + 32, \
+ dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + 48, src_stride, x_offset, y_offset, dst_tmp + 48, \
+ dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
} \
} \
} \
@@ -403,22 +423,25 @@ DECLS(sse2);
return (var >= 0) ? (uint32_t)var : 0; \
}
-#define FNS(opt) \
- FN(64, 64, 16, 6, 6, opt, (int64_t)); \
- FN(64, 32, 16, 6, 5, opt, (int64_t)); \
- FN(32, 64, 16, 5, 6, opt, (int64_t)); \
- FN(32, 32, 16, 5, 5, opt, (int64_t)); \
- FN(32, 16, 16, 5, 4, opt, (int64_t)); \
- FN(16, 32, 16, 4, 5, opt, (int64_t)); \
- FN(16, 16, 16, 4, 4, opt, (int64_t)); \
- FN(16, 8, 16, 4, 3, opt, (int64_t)); \
- FN(8, 16, 8, 3, 4, opt, (int64_t)); \
- FN(8, 8, 8, 3, 3, opt, (int64_t)); \
- FN(8, 4, 8, 3, 2, opt, (int64_t)); \
- FN(16, 4, 16, 4, 2, opt, (int64_t)); \
- FN(8, 32, 8, 3, 5, opt, (int64_t)); \
- FN(32, 8, 16, 5, 3, opt, (int64_t)); \
- FN(16, 64, 16, 4, 6, opt, (int64_t)); \
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t)); \
+ FN(128, 64, 16, 7, 6, opt, (int64_t)); \
+ FN(64, 128, 16, 6, 7, opt, (int64_t)); \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)); \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)); \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)); \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)); \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)); \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)); \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)); \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)); \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)); \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)); \
+ FN(8, 4, 8, 3, 2, opt, (int64_t)); \
+ FN(16, 4, 16, 4, 2, opt, (int64_t)); \
+ FN(8, 32, 8, 3, 5, opt, (int64_t)); \
+ FN(32, 8, 16, 5, 3, opt, (int64_t)); \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)); \
FN(64, 16, 16, 6, 4, opt, (int64_t))
FNS(sse2);
@@ -603,7 +626,7 @@ void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
const int ref_num = 0;
const int is_intrabc = is_intrabc_block(mi);
const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
const int is_scaled = av1_is_scaled(sf);
if (is_scaled) {
@@ -765,11 +788,11 @@ void aom_highbd_comp_avg_upsampled_pred_sse2(
}
}
-static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
- const __m128i *w0,
- const __m128i *w1,
- const __m128i *r,
- void *const result) {
+static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w0,
+ const __m128i *w1,
+ const __m128i *r,
+ void *const result) {
assert(DIST_PRECISION_BITS <= 4);
__m128i mult0 = _mm_mullo_epi16(*p0, *w0);
__m128i mult1 = _mm_mullo_epi16(*p1, *w1);
@@ -780,11 +803,10 @@ static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
xx_storeu_128(result, shift);
}
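In scalar terms the renamed helper computes a distance-weighted average per 16-bit pixel: (p0 * w0 + p1 * w1 + r) >> DIST_PRECISION_BITS. A sketch, assuming r is the usual rounding constant 1 << (DIST_PRECISION_BITS - 1) and taking DIST_PRECISION_BITS as 4 (the assert above only bounds it at <= 4); with 12-bit pixels and weights summing to 1 << 4, each product stays within the 16-bit lanes of _mm_mullo_epi16:

    #include <stdint.h>

    #define DIST_PRECISION_BITS 4 /* assumption; the assert only says <= 4 */

    /* Scalar counterpart of highbd_compute_dist_wtd_comp_avg (sketch). */
    static uint16_t dist_wtd_avg(uint16_t p0, uint16_t p1, uint16_t w0,
                                 uint16_t w1) {
      const uint32_t r = 1u << (DIST_PRECISION_BITS - 1);
      return (uint16_t)(((uint32_t)p0 * w0 + (uint32_t)p1 * w1 + r) >>
                        DIST_PRECISION_BITS);
    }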
-void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
- const uint8_t *pred8, int width,
- int height, const uint8_t *ref8,
- int ref_stride,
- const JNT_COMP_PARAMS *jcp_param) {
+void aom_highbd_dist_wtd_comp_avg_pred_sse2(
+ uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+ const uint8_t *ref8, int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
int i;
const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
@@ -806,7 +828,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
__m128i p0 = xx_loadu_128(ref);
__m128i p1 = xx_loadu_128(pred);
- highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+ highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
comp_pred += 8;
pred += 8;
@@ -823,7 +845,7 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
__m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
__m128i p1 = xx_loadu_128(pred);
- highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+ highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
comp_pred += 8;
pred += 8;
@@ -832,11 +854,11 @@ void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
}
}
-void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
+void aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
int subpel_search) {
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
int n;
@@ -860,7 +882,7 @@ void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
__m128i p0 = xx_loadu_128(comp_pred16);
__m128i p1 = xx_loadu_128(pred);
- highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
+ highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
comp_pred16 += 8;
pred += 8;
diff --git a/libaom/aom_dsp/x86/intrapred_asm_sse2.asm b/libaom/aom_dsp/x86/intrapred_asm_sse2.asm
index 9aece27..0eb6323 100644
--- a/libaom/aom_dsp/x86/intrapred_asm_sse2.asm
+++ b/libaom/aom_dsp/x86/intrapred_asm_sse2.asm
@@ -27,23 +27,6 @@ pw2_32: times 8 dw 16
SECTION .text
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
- pavgb %4, %1, %3
- pxor %3, %1
- pand %3, [GLOBAL(pb_1)]
- psubb %4, %3
- pavgb %4, %2
-%endmacro
-
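The macro being deleted here leaned on an exact identity for rounded byte averages (pavgb computes avg(a, b) = (a + b + 1) >> 1). The identity is easy to confirm exhaustively; a standalone check, not part of the library:

    #include <assert.h>
    #include <stdint.h>

    /* avg() mirrors pavgb's rounding average. The loop verifies
       (x + 2*y + z + 2) >> 2 == avg(avg(x, z) - ((x ^ z) & 1), y)
       for every byte triple. */
    static uint8_t avg(uint8_t a, uint8_t b) {
      return (uint8_t)((a + b + 1) >> 1);
    }

    int main(void) {
      for (int x = 0; x < 256; ++x)
        for (int y = 0; y < 256; ++y)
          for (int z = 0; z < 256; ++z) {
            const uint8_t t =
                (uint8_t)(avg((uint8_t)x, (uint8_t)z) - ((x ^ z) & 1));
            assert(avg(t, (uint8_t)y) == ((x + 2 * y + z + 2) >> 2));
          }
      return 0;
    }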
INIT_XMM sse2
cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
GET_GOT goffsetq
diff --git a/libaom/aom_dsp/x86/intrapred_avx2.c b/libaom/aom_dsp/x86/intrapred_avx2.c
index 5f3e7bb..17f35a0 100644
--- a/libaom/aom_dsp/x86/intrapred_avx2.c
+++ b/libaom/aom_dsp/x86/intrapred_avx2.c
@@ -1481,9 +1481,10 @@ static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
int bh, const uint16_t *above,
const uint16_t *left, int upsample_above,
- int dx, int dy) {
+ int dx, int dy, int bd) {
(void)left;
(void)dy;
+ (void)bd;
switch (bw) {
case 4:
@@ -1511,8 +1512,8 @@ void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
return;
}
-static void transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc,
- uint16_t *dst, ptrdiff_t pitchDst) {
+static void highbd_transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc,
+ uint16_t *dst, ptrdiff_t pitchDst) {
__m128i r0, r1, r2, r3, r4, r5, r6, r7, r0_Lo, r1_Lo, r2_Lo, r3_Lo, r4_Lo,
r5_Lo, r6_Lo;
r0 = _mm_load_si128(
@@ -1579,12 +1580,921 @@ static void transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc,
_mm_storeu_si128((__m128i *)(dst + 7 * pitchDst), r3);
}
-static void transpose(const uint16_t *src, ptrdiff_t pitchSrc, uint16_t *dst,
- ptrdiff_t pitchDst, int width, int height) {
+static uint8_t HighbdLoadMaskx[8][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+ { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+};
+
+static uint8_t HighbdEvenOddMaskx4[8][16] = {
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14,
+ 15 }, // 0=0,1, 1=2,3, 2=4,5, 3=6,7, 4=8,9, 5=10,11, 6=12,13, 7=14,15,
+ // >7=0,1
+ { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
+ { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
+ { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 0, 1, 0, 1 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 0, 1 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 0, 1 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15 }
+};
+
+static uint16_t HighbdEvenOddMaskx8_2[8][16] = {
+ { 0, 2, 4, 6, 8, 10, 12, 14 }, { 2, 2, 4, 6, 8, 10, 12, 14 },
+ { 4, 4, 4, 6, 8, 10, 12, 14 }, { 6, 6, 6, 6, 8, 10, 12, 14 },
+ { 8, 8, 8, 8, 8, 10, 12, 14 }, { 10, 10, 10, 10, 10, 10, 12, 14 },
+ { 12, 12, 12, 12, 12, 12, 12, 14 }, { 14, 14, 14, 14, 14, 14, 14, 14 },
+};
+
+static uint16_t HighbdBaseMask[17][16] = {
+ {
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ },
+ { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
+ 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
+ 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
+};
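These mask tables drive the per-row blends below: HighbdBaseMask[base_min_diff] is all-ones in its first base_min_diff 16-bit lanes, so _mm_blendv_epi8 takes those leading pixels from the left-edge result (resy) and the remainder from the above-edge result (resx). The scalar equivalent, as a sketch with local names:

    #include <stdint.h>

    /* Sketch of the resx/resy blend: the first base_min_diff pixels of a
       row come from the left-edge path, the rest from the above-edge path. */
    static void blend_row(const uint16_t *resx, const uint16_t *resy,
                          uint16_t *out, int n, int base_min_diff) {
      for (int i = 0; i < n; ++i)
        out[i] = (i < base_min_diff) ? resy[i] : resx[i];
    }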
+
+static void highbd_dr_prediction_z2_Nx4_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+  // assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+  // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a32, a16;
+ __m256i diff;
+ __m128i c3f, min_base_y128;
+
+ a16 = _mm256_set1_epi32(16);
+ c3f = _mm_set1_epi32(0x3f);
+ min_base_y128 = _mm_set1_epi32(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ __m128i a0_x128, a1_x128;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 3) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ a0_x128 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi32(
+ _mm_and_si128(
+ _mm_slli_epi32(
+ _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 2);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi32(
+ _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ DECLARE_ALIGNED(32, int, base_y_c[4]);
+ r6 = _mm_set1_epi32(r << 6);
+ dy128 = _mm_set1_epi32(dy);
+ c1234 = _mm_setr_epi32(1, 2, 3, 4);
+ y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
+ base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]]);
+ a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi32(
+ _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resx = _mm_packus_epi32(resx, resx);
+
+ resy = _mm256_extracti128_si256(res, 1);
+ resy = _mm_packus_epi32(resy, resy);
+
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storel_epi64((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
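The res computation above (repeated in the variants that follow) is the 6-bit fixed-point interpolation the comments describe: shift is the fractional position, masked to six bits and halved into [0, 32). One output pixel in scalar form, as a sketch:

    #include <stdint.h>

    /* Scalar form of (a[x]*32 + 16 + (a[x+1] - a[x])*shift) >> 5,
       with shift in [0, 32). Sketch only. */
    static uint16_t interp_pixel(uint16_t a0, uint16_t a1, int shift) {
      return (uint16_t)((a0 * 32 + 16 + (a1 - a0) * shift) >> 5);
    }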
+
+static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+  // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
+ __m256i diff;
+ __m128i a0_x128, a1_x128;
+
+ a16 = _mm256_set1_epi32(16);
+ c3f = _mm256_set1_epi32(0x3f);
+ min_base_y256 = _mm256_set1_epi32(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ resx = _mm_setzero_si128();
+ } else {
+ if (upsample_above) {
+ a0_x128 = _mm_setr_epi16(
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]);
+ a1_x128 = _mm_setr_epi16(
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][1]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_slli_epi32(
+ _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1);
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
+ (3 << 6) - y * dx, (4 << 6) - y * dx,
+ (5 << 6) - y * dx, (6 << 6) - y * dx,
+ (7 << 6) - y * dx),
+ c3f),
+ 1);
+ }
+
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx = _mm256_castsi256_si128(_mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ }
+ // y calc
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int, base_y_c[8]);
+ __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
+ r6 = _mm256_set1_epi32(r << 6);
+ dy256 = _mm256_set1_epi32(dy);
+ c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]));
+ a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+ left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
+
+ if (upsample_left) {
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
+ 1);
+ } else {
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
+ }
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy = _mm256_castsi256_si128(_mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ } else {
+ resy = resx;
+ }
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_Nx8_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+  // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i c3f, min_base_y128;
+ __m256i a0_x, a1_x, diff, a32, a16;
+ __m128i a0_x128, a1_x128;
+
+ a16 = _mm256_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ if (upsample_above) {
+ a0_x128 = _mm_setr_epi16(
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]],
+ above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]);
+ a1_x128 = _mm_setr_epi16(
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][1]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]],
+ above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]);
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(
+ _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_castsi128_si256(a0_x128);
+ a1_x = _mm256_castsi128_si256(a1_x128);
+ }
+
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ r6 = _mm_set1_epi16(r << 6);
+ dy128 = _mm_set1_epi16(dy);
+ c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1],
+ left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resy = _mm256_extracti128_si256(res, 1);
+
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_32bit_z2_HxW_avx2(
+ int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+  // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16;
+ __m256i diff, min_base_y256, c3f;
+ __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
+
+ a16 = _mm256_set1_epi32(16);
+ min_base_y256 = _mm256_set1_epi16(min_base_y);
+ c3f = _mm256_set1_epi32(0x3f);
+
+ for (int r = 0; r < H; r++) {
+ __m256i b, res, shift;
+ __m256i resx[2], resy[2];
+ __m256i resxy;
+ for (int j = 0; j < W; j += 16) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if ((base_x + j) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x + j) - 1);
+ }
+ int base_min_diff = (min_base_x - base_x - j);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ resx[0] = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+ a1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_setr_epi32(
+ ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
+ ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
+ ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
+ ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
+ c3f),
+ 1);
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx[0] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+ }
+ int base_shift8 = 0;
+ if ((base_x + j + 8) < (min_base_x - 1)) {
+ base_shift8 = (min_base_x - (base_x + j + 8) - 1);
+ }
+ if (base_shift8 > 7) {
+ resx[1] = _mm256_setzero_si256();
+ } else {
+ a0_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8 + j));
+ a1_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9 + j));
+ a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift8]);
+ a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift8]);
+
+ a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
+ a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
+
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_setr_epi32(
+ ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
+ ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
+ ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
+ ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
+ c3f),
+ 1);
+
+ diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ resx[1] = _mm256_add_epi32(a32, b);
+ resx[1] = _mm256_srli_epi32(resx[1], 5);
+ resx[1] = _mm256_packus_epi32(
+ resx[1],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
+ }
+ resx[0] =
+ _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
+ 1); // 16 16bit values
+
+ // y calc
+ if ((base_x < min_base_x)) {
+ DECLARE_ALIGNED(32, int, base_y_c[16]);
+ __m256i r6, c256, dy256, y_c256, y_c_1_256, base_y_c256, mask256;
+ r6 = _mm256_set1_epi32(r << 6);
+ dy256 = _mm256_set1_epi32(dy);
+ c256 = _mm256_setr_epi32(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
+ 7 + j, 8 + j);
+ y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+ c256 = _mm256_setr_epi32(9 + j, 10 + j, 11 + j, 12 + j, 13 + j, 14 + j,
+ 15 + j, 16 + j);
+ y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]));
+ a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+ left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
+
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy[0] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
+ left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
+ left[base_y_c[14]], left[base_y_c[15]]));
+ a1_y = _mm256_cvtepu16_epi32(
+ _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
+ left[base_y_c[10] + 1], left[base_y_c[11] + 1],
+ left[base_y_c[12] + 1], left[base_y_c[13] + 1],
+ left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy[1] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+ resy[0] =
+ _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
+ 1); // 16 16bit values
+ } else {
+ resy[0] = resx[0];
+ }
+ resxy = _mm256_blendv_epi8(resx[0], resy[0],
+ *(__m256i *)HighbdBaseMask[base_min_diff]);
+ _mm256_storeu_si256((__m256i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_HxW_avx2(
+ int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+  // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a32, a16, c3f;
+ __m256i diff, min_base_y256;
+
+ a16 = _mm256_set1_epi16(16);
+ min_base_y256 = _mm256_set1_epi16(min_base_y);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ for (int r = 0; r < H; r++) {
+ __m256i b, res, shift;
+ __m256i resx, resy;
+ __m256i resxy;
+ __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, shiftx;
+
+ for (int j = 0; j < W; j += 16) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if ((base_x + j) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x + j) - 1);
+ }
+ int base_min_diff = (min_base_x - base_x - j);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+ a1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ a0_x = _mm256_castsi128_si256(a0_x128);
+ a1_x = _mm256_castsi128_si256(a1_x128);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(_mm_setr_epi16(
+ ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
+ ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
+ ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
+ ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
+ _mm256_castsi256_si128(c3f)),
+ 1));
+ }
+
+ base_shift = 0;
+ if ((base_x + j + 8) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x + j + 8) - 1);
+ }
+ if (base_shift <= 7) {
+ a0_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j));
+ a1_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j));
+ a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ shiftx = _mm_srli_epi16(
+ _mm_and_si128(
+ _mm_setr_epi16(
+ ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
+ ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
+ ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
+ ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
+ _mm256_castsi256_si128(c3f)),
+ 1);
+
+ a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
+ shift = _mm256_inserti128_si256(shift, shiftx, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ resx = _mm256_srli_epi16(res, 5); // 16 16-bit values
+
+ // y calc
+ __m256i a0_y, a1_y, shifty;
+ if ((base_x < min_base_x)) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16;
+ r6 = _mm256_set1_epi16(r << 6);
+ dy256 = _mm256_set1_epi16(dy);
+ c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
+ 7 + j, 8 + j, 9 + j, 10 + j, 11 + j, 12 + j,
+ 13 + j, 14 + j, 15 + j, 16 + j);
+ mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
+ _mm256_srli_epi16(min_base_y256, 1));
+ y_c256 = _mm256_sub_epi16(r6, mul16);
+ base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ a1_y = _mm256_setr_epi16(
+ left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+ left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1], left[base_y_c[8] + 1],
+ left[base_y_c[9] + 1], left[base_y_c[10] + 1],
+ left[base_y_c[11] + 1], left[base_y_c[12] + 1],
+ left[base_y_c[13] + 1], left[base_y_c[14] + 1],
+ left[base_y_c[15] + 1]);
+
+ shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shifty);
+ res = _mm256_add_epi16(a32, b);
+ resy = _mm256_srli_epi16(res, 5);
+ } else {
+ resy = _mm256_setzero_si256();
+ }
+
+ resxy = _mm256_blendv_epi8(resx, resy,
+ *(__m256i *)HighbdBaseMask[base_min_diff]);
+ _mm256_storeu_si256((__m256i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ assert(dx > 0);
+ assert(dy > 0);
+ switch (bw) {
+ case 4:
+ highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ break;
+ case 8:
+ if (bd < 12) {
+ highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ } else {
+ highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
+ upsample_above, upsample_left,
+ dx, dy);
+ }
+ break;
+ default:
+ if (bd < 12) {
+ highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ } else {
+ highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+ upsample_above, upsample_left,
+ dx, dy);
+ }
+ break;
+ }
+}
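The bd < 12 split above tracks lane widths: with pixels bounded by (1 << bd) - 1 and shift at most 31, the worst-case intermediate of the interpolation fits a 16-bit lane for 8- and 10-bit input (10-bit is the worst case) but not for 12-bit, hence the 32-bit variants. A compile-time check of those bounds (C11 static_assert; the constants are the worst cases implied by the formula, not taken from the library):

    #include <assert.h>

    int main(void) {
      /* worst case of a0 * 32 + 16 + (a1 - a0) * shift per lane, shift <= 31 */
      static_assert(1023 * 32 + 16 + 1023 * 31 <= 65535,
                    "10-bit input fits 16-bit lanes");
      static_assert(4095 * 32 + 16 > 65535, "12-bit input needs 32-bit lanes");
      return 0;
    }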
+
+static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
+ uint16_t *dst, ptrdiff_t pitchDst, int width,
+ int height) {
for (int j = 0; j < height; j += 8)
for (int i = 0; i < width; i += 8)
- transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc, dst + j * pitchDst + i,
- pitchDst);
+ highbd_transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc,
+ dst + j * pitchDst + i, pitchDst);
}
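Element-wise, highbd_transpose is the plain matrix transpose below; the SIMD version just realizes it one 8x8 tile at a time with the same index convention. A scalar sketch:

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar equivalent of highbd_transpose (sketch): dst row j, column i
       receives src row i, column j. */
    static void transpose_scalar(const uint16_t *src, ptrdiff_t pitch_src,
                                 uint16_t *dst, ptrdiff_t pitch_dst,
                                 int width, int height) {
      for (int j = 0; j < height; ++j)
        for (int i = 0; i < width; ++i)
          dst[j * pitch_dst + i] = src[i * pitch_src + j];
    }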
static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
@@ -1649,7 +2559,7 @@ static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *left,
int upsample_left, int dy) {
- __m256i dstvec[8], d[16];
+ __m256i dstvec[8], d[8];
highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
dy);
@@ -1818,9 +2728,9 @@ static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *left,
int upsample_left, int dy) {
- uint16_t dstT[64 * 64];
+ DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
- transpose(dstT, 64, dst, stride, 64, 64);
+ highbd_transpose(dstT, 64, dst, stride, 64, 64);
}
static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
@@ -1872,24 +2782,24 @@ static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
int upsample_left, int dy) {
uint16_t dstT[64 * 32];
highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
- transpose(dstT, 64, dst, stride, 32, 64);
+ highbd_transpose(dstT, 64, dst, stride, 32, 64);
}
static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *left,
int upsample_left, int dy) {
- uint16_t dstT[32 * 64];
+ DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
- transpose(dstT, 32, dst, stride, 64, 32);
+ highbd_transpose(dstT, 32, dst, stride, 64, 32);
return;
}
static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
const uint16_t *left,
int upsample_left, int dy) {
- uint16_t dstT[64 * 16];
+ DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
- transpose(dstT, 64, dst, stride, 16, 64);
+ highbd_transpose(dstT, 64, dst, stride, 16, 64);
}
static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
@@ -1910,9 +2820,10 @@ static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
int bh, const uint16_t *above,
const uint16_t *left, int upsample_left,
- int dx, int dy) {
+ int dx, int dy, int bd) {
(void)above;
(void)dx;
+ (void)bd;
assert(dx == 1);
assert(dy > 0);
if (bw == bh) {
@@ -2013,3 +2924,1716 @@ void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
}
return;
}
+
+// Low bit depth functions
+static uint8_t BaseMask[33][32] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+};
+
+static AOM_FORCE_INLINE void dr_prediction_z1_4xN_internal_avx2(
+ int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((N + 4) - 1) << upsample_above;
+ int x;
+  // assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+  // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i diff, c3f;
+ __m128i a_mbase_x;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i res1, a0_128, a1_128;
+
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = a_mbase_x; // save 4 values
+ }
+ return;
+ }
+ if (base_max_diff > 4) base_max_diff = 4;
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base));
+ a1_128 = _mm_srli_si128(a0_128, 1);
+
+ if (upsample_above) {
+ a0_128 = _mm_shuffle_epi8(
+ a0_128,
+ _mm_setr_epi8(0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15));
+ a1_128 = _mm_srli_si128(a0_128, 4);
+
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(
+ _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
+ 1);
+ } else {
+ shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+ }
+ a0 = _mm256_cvtepu8_epi16(a0_128);
+ a1 = _mm256_cvtepu8_epi16(a1_128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ res1 = _mm256_castsi256_si128(res);
+ res1 = _mm_packus_epi16(res1, res1);
+
+ dst[r] =
+ _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m128i dstvec[16];
+
+ dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_8xN_internal_avx2(
+ int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((8 + N) - 1) << upsample_above;
+
+ int x;
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+  // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a0_1, a1_1, a32, a16, diff, c3f;
+ __m128i a_mbase_x;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm_set1_epi8(above[max_base_x]);
+ c3f = _mm256_set1_epi32(0x3f);
+
+ x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, res1, shift;
+ __m128i res128;
+
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+        dst[i] = a_mbase_x; // save 16 values, 8 to be used further
+ }
+ return;
+ }
+ if (base_max_diff > 8) base_max_diff = 8;
+
+ a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+ a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+ if (upsample_above) {
+ a0 = _mm256_permutevar8x32_epi32(
+ a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+ a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
+
+ a0_1 =
+ _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
+ a0_1 = _mm256_permutevar8x32_epi32(
+ a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+ a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
+
+ a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
+ a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
+
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), c3f),
+ 1);
+ } else {
+ shift = _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+ }
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ res1 = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(
+                 _mm256_extracti128_si256(res, 1))); // narrow to 16 bit
+
+ res128 = _mm_packus_epi16(_mm256_castsi256_si128(res1),
+                              _mm256_castsi256_si128(res1)); // narrow to 8 bit
+
+ res128 =
+ _mm_blendv_epi8(a_mbase_x, res128, *(__m128i *)BaseMask[base_max_diff]);
+ dst[r] = res128;
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m128i dstvec[32];
+
+ dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_16xN_internal_avx2(
+ int N, __m128i *dstvec, const uint8_t *above, int upsample_above, int dx) {
+ int x;
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((16 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+  // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a0_1, a1, a1_1, diff, a32, a16, c3f;
+ __m128i a_mbase_x;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm_set1_epi8((uint8_t)above[max_base_x]);
+ c3f = _mm256_set1_epi32(0x3f);
+
+ x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res[2];
+ __m128i res128[2];
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base);
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 16 values
+ }
+ return;
+ }
+ __m256i shift =
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+
+ a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+ a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[0] = _mm256_add_epi32(a32, b);
+ res[0] = _mm256_srli_epi32(res[0], 5);
+ res[0] = _mm256_packus_epi32(
+ res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+ res128[0] = _mm_packus_epi16(_mm256_castsi256_si128(res[0]),
+                                 _mm256_castsi256_si128(res[0])); // narrow to 8 bit
+
+ if (base_max_diff > 8) {
+ if (base_max_diff > 16) base_max_diff = 16;
+ a0_1 =
+ _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
+ a1_1 =
+ _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
+
+ diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[1] = _mm256_add_epi32(a32, b);
+ res[1] = _mm256_srli_epi32(res[1], 5);
+ res[1] = _mm256_packus_epi32(
+ res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+ res128[1] =
+ _mm_packus_epi16(_mm256_castsi256_si128(res[1]),
+                           _mm256_castsi256_si128(res[1])); // narrow to 8 bit
+
+ } else {
+ res128[1] = a_mbase_x;
+ }
+ res128[0] = _mm_unpacklo_epi64(res128[0], res128[1]); // 16 8bit values
+
+ dstvec[r] = _mm_blendv_epi8(a_mbase_x, res128[0],
+ *(__m128i *)BaseMask[base_max_diff]);
+ x += dx;
+ }
+}
+static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m128i dstvec[64];
+
+ dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
+ int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
+ int x;
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+  // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a0_1, a1, a1_1, a32, a16;
+ __m256i a_mbase_x, diff, c3f;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
+ c3f = _mm256_set1_epi32(0x3f);
+
+ x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res[2], res16[2];
+
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base);
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ }
+ return;
+ }
+ if (base_max_diff > 32) base_max_diff = 32;
+ __m256i shift =
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+
+ for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+ int mdiff = base_max_diff - j;
+ if (mdiff <= 0) {
+ res16[jj] = a_mbase_x;
+ } else {
+ a0 = _mm256_cvtepu8_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + j)));
+ a1 = _mm256_cvtepu8_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[0] = _mm256_add_epi32(a32, b);
+ res[0] = _mm256_srli_epi32(res[0], 5);
+ res[0] = _mm256_packus_epi32(
+ res[0],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+
+        // narrow to 8 bit
+ res[0] = _mm256_packus_epi16(res[0], res[0]);
+
+ if (mdiff > 8) {
+ a0_1 = _mm256_cvtepu8_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
+ a1_1 = _mm256_cvtepu8_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
+
+ diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[1] = _mm256_add_epi32(a32, b);
+ res[1] = _mm256_srli_epi32(res[1], 5);
+ res[1] = _mm256_packus_epi32(
+ res[1],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+ res[1] = _mm256_packus_epi16(res[1], res[1]);
+          // narrow to 8 bit
+ } else {
+ res[1] = a_mbase_x;
+ }
+ res16[jj] = _mm256_unpacklo_epi64(res[0], res[1]); // 16 8bit values
+ }
+ }
+ res16[1] =
+ _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
+ 1); // 32 8bit values
+
+ dstvec[r] = _mm256_blendv_epi8(
+ a_mbase_x, res16[1],
+ *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m256i dstvec[64];
+ dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ int x;
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+  // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a0_1, a1, a1_1, a32, a16;
+ __m256i a_mbase_x, diff, c3f;
+ __m128i max_base_x128, base_inc128, mask128;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
+ max_base_x128 = _mm_set1_epi8(max_base_x);
+ c3f = _mm256_set1_epi32(0x3f);
+
+ x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m256i b, res[2];
+ __m128i res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
+ _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+
+ __m128i a0_128, a0_1_128, a1_128, a1_1_128;
+ for (int j = 0; j < 64; j += 16) {
+ int mdiff = max_base_x - (base + j);
+ if (mdiff <= 0) {
+ _mm_storeu_si128((__m128i *)(dst + j),
+ _mm256_castsi256_si128(a_mbase_x));
+ } else {
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+ a0 = _mm256_cvtepu8_epi32(a0_128);
+ a1 = _mm256_cvtepu8_epi32(a1_128);
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[0] = _mm256_add_epi32(a32, b);
+ res[0] = _mm256_srli_epi32(res[0], 5);
+ res[0] = _mm256_packus_epi32(
+ res[0],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+ // pack down to 8 bit
+ res[0] = _mm256_packus_epi16(res[0], res[0]);
+
+ if (mdiff > 8) {
+ a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
+ a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
+ a0_1 = _mm256_cvtepu8_epi32(a0_1_128);
+ a1_1 = _mm256_cvtepu8_epi32(a1_1_128);
+
+ diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[1] = _mm256_add_epi32(a32, b);
+ res[1] = _mm256_srli_epi32(res[1], 5);
+ res[1] = _mm256_packus_epi32(
+ res[1],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+ res[1] = _mm256_packus_epi16(res[1], res[1]);
+
+ } else {
+ res[1] = a_mbase_x;
+ }
+ res1 = _mm_unpacklo_epi64(
+ _mm256_castsi256_si128(res[0]),
+ _mm256_castsi256_si128(res[1])); // 16 8bit values
+
+ base_inc128 = _mm_setr_epi8(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
+ _mm_setzero_si128());
+ res1 =
+ _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), res1, mask128);
+ _mm_storeu_si128((__m128i *)(dst + j), res1);
+ }
+ }
+ x += dx;
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int dx, int dy) {
+ (void)left;
+ (void)dy;
+ switch (bw) {
+ case 4:
+ dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 8:
+ dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 16:
+ dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 32:
+ dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 64:
+ dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ default: break;
+ }
+}
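+
+// Editor's sketch (hypothetical name, not part of libaom): the scalar
+// interpolation every zone-1 kernel above vectorizes, with upsampling
+// omitted so frac_bits is fixed at 6. Note that
+// (a * 32 + 16 + (b - a) * shift) >> 5 is the rounded blend
+// a * (32 - shift) + b * shift with a 5-bit weight.
+static void dr_prediction_z1_sketch(uint8_t *dst, ptrdiff_t stride, int bw,
+                                    int bh, const uint8_t *above, int dx) {
+  const int max_base_x = bw + bh - 1;
+  int x = dx;
+  for (int r = 0; r < bh; ++r, x += dx, dst += stride) {
+    const int shift = (x & 0x3f) >> 1;  // 5-bit fractional weight, 0..31
+    int base = x >> 6;
+    for (int c = 0; c < bw; ++c, ++base) {
+      if (base < max_base_x) {
+        dst[c] = (uint8_t)((above[base] * 32 + 16 +
+                            (above[base + 1] - above[base]) * shift) >> 5);
+      } else {
+        dst[c] = above[max_base_x];  // past the edge: replicate last sample
+      }
+    }
+  }
+}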
+
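+// Rows of LoadMaskx are shuffle controls for _mm_shuffle_epi8: row k shifts
+// a vector toward higher byte lanes by k while replicating byte 0 into the
+// vacated lanes. The z2 kernels below advance their loads by base_shift to
+// stay inside `above`, then use this mask to move each sample back into its
+// lane (editor's note).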
+static uint8_t LoadMaskx[8][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+ { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+ { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
+ { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+ { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
+ { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
+};
+
+static uint8_t EvenOddMaskx4[8][16] = {
+ { 0, 2, 4, 6, 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 1, 3, 5, 7, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 2, 4, 6, 8, 3, 5, 7, 9, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 3, 5, 7, 9, 4, 6, 8, 10, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 4, 6, 8, 10, 5, 7, 9, 11, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 5, 7, 9, 11, 6, 8, 10, 12, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 7, 9, 11, 13, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 8, 10, 12, 14, 0 }
+};
+
+static uint8_t EvenOddMaskx[8][16] = {
+ { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 0, 0, 0, 0 },
+ { 0, 1, 3, 5, 7, 9, 11, 13, 15, 2, 4, 6, 8, 0, 0, 0 },
+ { 0, 0, 2, 4, 6, 8, 10, 12, 14, 3, 5, 7, 9, 0, 0, 0 },
+ { 0, 0, 0, 3, 5, 7, 9, 11, 13, 15, 4, 6, 8, 10, 0, 0 },
+ { 0, 0, 0, 0, 4, 6, 8, 10, 12, 14, 5, 7, 9, 11, 0, 0 },
+ { 0, 0, 0, 0, 0, 5, 7, 9, 11, 13, 15, 6, 8, 10, 12, 0 },
+ { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 14, 7, 9, 11, 13, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 15, 8, 10, 12, 14 }
+};
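+
+// With a 2x upsampled edge, even bytes hold the a[x] samples and odd bytes
+// the a[x+1] samples. Row k of these tables deinterleaves a vector whose
+// first k lanes are padding into its even group followed by its odd group,
+// so the two interpolation operands can be separated after loading
+// (editor's note).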
+
+static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a32, a16, diff;
+ __m128i c3f, min_base_y128;
+
+ a16 = _mm256_set1_epi32(16);
+ c3f = _mm_set1_epi32(0x3f);
+ min_base_y128 = _mm_set1_epi32(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ __m128i a0_x128, a1_x128;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 3) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx4[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 4);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi32(
+ _mm_and_si128(
+ _mm_slli_epi32(
+ _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 1);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi32(
+ _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_cvtepu8_epi32(a0_x128);
+ a1_x = _mm256_cvtepu8_epi32(a1_x128);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int, base_y_c[4]);
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ r6 = _mm_set1_epi32(r << 6);
+ dy128 = _mm_set1_epi32(dy);
+ c1234 = _mm_setr_epi32(1, 2, 3, 4);
+ y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
+ base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]]);
+ a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi32(
+ _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resx = _mm_packus_epi32(resx, resx);
+ resx = _mm_packus_epi16(resx, resx);
+
+ resy = _mm256_extracti128_si256(res, 1);
+ resy = _mm_packus_epi32(resy, resy);
+ resy = _mm_packus_epi16(resy, resy);
+
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+ *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy);
+ dst += stride;
+ }
+}
+
+static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i diff, a32, a16;
+ __m256i a0_x, a1_x;
+ __m128i a0_x128, a1_x128, min_base_y128, c3f;
+
+ a16 = _mm256_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+ if (upsample_above) {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(
+ _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
+ a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
+ }
+
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ r6 = _mm_set1_epi16(r << 6);
+ dy128 = _mm_set1_epi16(dy);
+ c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1],
+ left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
+ _mm256_castsi256_si128(res));
+ resy = _mm256_extracti128_si256(res, 1);
+ resy = _mm_packus_epi16(resy, resy);
+
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+ _mm_storel_epi64((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
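+
+// Editor's note: the Nx4 kernel above widens to 32-bit lanes, while this
+// Nx8 kernel and the HxW kernel below keep the whole
+// a[x] * 32 + 16 + (a[x+1] - a[x]) * shift expression in 16-bit lanes; for
+// 8-bit pixels and a 5-bit shift its magnitude stays below 2^15, so twice
+// as many pixels fit per vector.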
+
+static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
+ ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a16;
+ __m256i diff, min_base_y256, c3f, shifty;
+ __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, a0_1_x, a1_1_x, shiftx;
+
+ a16 = _mm256_set1_epi16(16);
+ min_base_y256 = _mm256_set1_epi16(min_base_y);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ for (int r = 0; r < H; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy;
+ __m128i resxy;
+ for (int j = 0; j < W; j += 16) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+
+ int base_shift = 0;
+ if ((base_x + j) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x + j) - 1);
+ }
+ int base_min_diff = (min_base_x - base_x - j);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+ if (base_shift > 7) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+ a1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+ a0_x = _mm256_cvtepu8_epi16(a0_x128);
+ a1_x = _mm256_cvtepu8_epi16(a1_x128);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(_mm_setr_epi16(
+ ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
+ ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
+ ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
+ ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
+ _mm256_castsi256_si128(c3f)),
+ 1));
+ }
+
+ base_shift = 0;
+ if ((base_x + j + 8) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x + j + 8) - 1);
+ }
+ if (base_shift <= 7) {
+ a0_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j));
+ a1_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j));
+ a0_1_x128 =
+ _mm_shuffle_epi8(a0_1_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_1_x128 =
+ _mm_shuffle_epi8(a1_1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+ a0_1_x = _mm_cvtepu8_epi16(a0_1_x128);
+ a1_1_x = _mm_cvtepu8_epi16(a1_1_x128);
+
+ shiftx = _mm_srli_epi16(
+ _mm_and_si128(
+ _mm_setr_epi16(
+ ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
+ ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
+ ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
+ ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
+ _mm256_castsi256_si128(c3f)),
+ 1);
+
+ a0_x = _mm256_inserti128_si256(a0_x, a0_1_x, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_1_x, 1);
+ shift = _mm256_inserti128_si256(shift, shiftx, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5); // 16 16-bit values
+ resx = _mm256_castsi256_si128(_mm256_packus_epi16(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+
+ // y calc
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16;
+ r6 = _mm256_set1_epi16(r << 6);
+ dy256 = _mm256_set1_epi16(dy);
+ c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
+ 7 + j, 8 + j, 9 + j, 10 + j, 11 + j, 12 + j,
+ 13 + j, 14 + j, 15 + j, 16 + j);
+ mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
+ _mm256_srli_epi16(min_base_y256, 1));
+ y_c256 = _mm256_sub_epi16(r6, mul16);
+
+ base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ a1_y = _mm256_setr_epi16(
+ left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+ left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1], left[base_y_c[8] + 1],
+ left[base_y_c[9] + 1], left[base_y_c[10] + 1],
+ left[base_y_c[11] + 1], left[base_y_c[12] + 1],
+ left[base_y_c[13] + 1], left[base_y_c[14] + 1],
+ left[base_y_c[15] + 1]);
+
+ shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shifty);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5); // 16 16-bit values
+ resy = _mm256_castsi256_si128(_mm256_packus_epi16(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+
+ } else {
+ resy = _mm_setzero_si128();
+ }
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx,
+ int dy) {
+ assert(dx > 0);
+ assert(dy > 0);
+ switch (bw) {
+ case 4:
+ dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ case 8:
+ dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ default:
+ dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ break;
+ }
+}
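+
+// Editor's sketch (hypothetical name, not part of libaom): scalar form of
+// the zone-2 selection vectorized above, with upsampling omitted so
+// frac_bits is fixed at 6 and min_base_x = min_base_y = -1. A column whose
+// projected x coordinate falls left of the block interpolates from `left`
+// instead of `above`; index -1 is the top-left corner sample.
+static void dr_prediction_z2_sketch(uint8_t *dst, ptrdiff_t stride, int bw,
+                                    int bh, const uint8_t *above,
+                                    const uint8_t *left, int dx, int dy) {
+  for (int r = 0; r < bh; ++r, dst += stride) {
+    for (int c = 0; c < bw; ++c) {
+      const int x = (c << 6) - (r + 1) * dx;
+      const int base_x = x >> 6;
+      if (base_x >= -1) {
+        const int shift = (x & 0x3f) >> 1;
+        dst[c] = (uint8_t)((above[base_x] * 32 + 16 +
+                            (above[base_x + 1] - above[base_x]) * shift) >> 5);
+      } else {
+        // by construction this pixel projects into the left column
+        const int y = (r << 6) - (c + 1) * dy;
+        const int base_y = y >> 6;
+        const int shift = (y & 0x3f) >> 1;
+        dst[c] = (uint8_t)((left[base_y] * 32 + 16 +
+                            (left[base_y + 1] - left[base_y]) * shift) >> 5);
+      }
+    }
+  }
+}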
+
+// z3 functions
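+// Editor's note: the transpose helpers below use the usual SIMD butterfly,
+// successive unpacklo/unpackhi rounds at byte, word, dword and qword width,
+// so an NxN byte transpose costs on the order of N*log2(N) shuffles rather
+// than N*N scalar moves.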
+static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) {
+ __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
+ w0 = _mm_unpacklo_epi8(x[0], x[1]);
+ w1 = _mm_unpacklo_epi8(x[2], x[3]);
+ w2 = _mm_unpackhi_epi8(x[0], x[1]);
+ w3 = _mm_unpackhi_epi8(x[2], x[3]);
+
+ ww0 = _mm_unpacklo_epi16(w0, w1);
+ ww1 = _mm_unpacklo_epi16(w2, w3);
+ ww2 = _mm_unpackhi_epi16(w0, w1);
+ ww3 = _mm_unpackhi_epi16(w2, w3);
+
+ w0 = _mm_unpacklo_epi32(ww0, ww1);
+ w2 = _mm_unpacklo_epi32(ww2, ww3);
+ w1 = _mm_unpackhi_epi32(ww0, ww1);
+ w3 = _mm_unpackhi_epi32(ww2, ww3);
+
+ d[0] = _mm_unpacklo_epi64(w0, w2);
+ d[1] = _mm_unpackhi_epi64(w0, w2);
+ d[2] = _mm_unpacklo_epi64(w1, w3);
+ d[3] = _mm_unpackhi_epi64(w1, w3);
+
+ d[4] = _mm_srli_si128(d[0], 8);
+ d[5] = _mm_srli_si128(d[1], 8);
+ d[6] = _mm_srli_si128(d[2], 8);
+ d[7] = _mm_srli_si128(d[3], 8);
+
+ d[8] = _mm_srli_si128(d[0], 4);
+ d[9] = _mm_srli_si128(d[1], 4);
+ d[10] = _mm_srli_si128(d[2], 4);
+ d[11] = _mm_srli_si128(d[3], 4);
+
+ d[12] = _mm_srli_si128(d[0], 12);
+ d[13] = _mm_srli_si128(d[1], 12);
+ d[14] = _mm_srli_si128(d[2], 12);
+ d[15] = _mm_srli_si128(d[3], 12);
+}
+
+static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
+ __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m256i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm256_unpacklo_epi8(x[0], x[1]);
+ w1 = _mm256_unpacklo_epi8(x[2], x[3]);
+ w2 = _mm256_unpacklo_epi8(x[4], x[5]);
+ w3 = _mm256_unpacklo_epi8(x[6], x[7]);
+
+ w8 = _mm256_unpacklo_epi8(x[8], x[9]);
+ w9 = _mm256_unpacklo_epi8(x[10], x[11]);
+ w10 = _mm256_unpacklo_epi8(x[12], x[13]);
+ w11 = _mm256_unpacklo_epi8(x[14], x[15]);
+
+ w4 = _mm256_unpacklo_epi16(w0, w1);
+ w5 = _mm256_unpacklo_epi16(w2, w3);
+ w12 = _mm256_unpacklo_epi16(w8, w9);
+ w13 = _mm256_unpacklo_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ d[0] = _mm256_unpacklo_epi64(w6, w14);
+ d[1] = _mm256_unpackhi_epi64(w6, w14);
+ d[2] = _mm256_unpacklo_epi64(w7, w15);
+ d[3] = _mm256_unpackhi_epi64(w7, w15);
+
+ w4 = _mm256_unpackhi_epi16(w0, w1);
+ w5 = _mm256_unpackhi_epi16(w2, w3);
+ w12 = _mm256_unpackhi_epi16(w8, w9);
+ w13 = _mm256_unpackhi_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ d[4] = _mm256_unpacklo_epi64(w6, w14);
+ d[5] = _mm256_unpackhi_epi64(w6, w14);
+ d[6] = _mm256_unpacklo_epi64(w7, w15);
+ d[7] = _mm256_unpackhi_epi64(w7, w15);
+
+ // upper half
+ w0 = _mm256_unpackhi_epi8(x[0], x[1]);
+ w1 = _mm256_unpackhi_epi8(x[2], x[3]);
+ w2 = _mm256_unpackhi_epi8(x[4], x[5]);
+ w3 = _mm256_unpackhi_epi8(x[6], x[7]);
+
+ w8 = _mm256_unpackhi_epi8(x[8], x[9]);
+ w9 = _mm256_unpackhi_epi8(x[10], x[11]);
+ w10 = _mm256_unpackhi_epi8(x[12], x[13]);
+ w11 = _mm256_unpackhi_epi8(x[14], x[15]);
+
+ w4 = _mm256_unpacklo_epi16(w0, w1);
+ w5 = _mm256_unpacklo_epi16(w2, w3);
+ w12 = _mm256_unpacklo_epi16(w8, w9);
+ w13 = _mm256_unpacklo_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ d[8] = _mm256_unpacklo_epi64(w6, w14);
+ d[9] = _mm256_unpackhi_epi64(w6, w14);
+ d[10] = _mm256_unpacklo_epi64(w7, w15);
+ d[11] = _mm256_unpackhi_epi64(w7, w15);
+
+ w4 = _mm256_unpackhi_epi16(w0, w1);
+ w5 = _mm256_unpackhi_epi16(w2, w3);
+ w12 = _mm256_unpackhi_epi16(w8, w9);
+ w13 = _mm256_unpackhi_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ d[12] = _mm256_unpacklo_epi64(w6, w14);
+ d[13] = _mm256_unpackhi_epi64(w6, w14);
+ d[14] = _mm256_unpacklo_epi64(w7, w15);
+ d[15] = _mm256_unpackhi_epi64(w7, w15);
+}
+
+static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(x[0], x[1]);
+ w1 = _mm_unpacklo_epi8(x[2], x[3]);
+ w2 = _mm_unpacklo_epi8(x[4], x[5]);
+ w3 = _mm_unpacklo_epi8(x[6], x[7]);
+
+ w8 = _mm_unpacklo_epi8(x[8], x[9]);
+ w9 = _mm_unpacklo_epi8(x[10], x[11]);
+ w10 = _mm_unpacklo_epi8(x[12], x[13]);
+ w11 = _mm_unpacklo_epi8(x[14], x[15]);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ d[0] = _mm_unpacklo_epi64(w6, w14);
+ d[1] = _mm_unpackhi_epi64(w6, w14);
+ d[2] = _mm_unpacklo_epi64(w7, w15);
+ d[3] = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ d[4] = _mm_unpacklo_epi64(w6, w14);
+ d[5] = _mm_unpackhi_epi64(w6, w14);
+ d[6] = _mm_unpacklo_epi64(w7, w15);
+ d[7] = _mm_unpackhi_epi64(w7, w15);
+
+ // upper half
+ w0 = _mm_unpackhi_epi8(x[0], x[1]);
+ w1 = _mm_unpackhi_epi8(x[2], x[3]);
+ w2 = _mm_unpackhi_epi8(x[4], x[5]);
+ w3 = _mm_unpackhi_epi8(x[6], x[7]);
+
+ w8 = _mm_unpackhi_epi8(x[8], x[9]);
+ w9 = _mm_unpackhi_epi8(x[10], x[11]);
+ w10 = _mm_unpackhi_epi8(x[12], x[13]);
+ w11 = _mm_unpackhi_epi8(x[14], x[15]);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ d[8] = _mm_unpacklo_epi64(w6, w14);
+ d[9] = _mm_unpackhi_epi64(w6, w14);
+ d[10] = _mm_unpacklo_epi64(w7, w15);
+ d[11] = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ d[12] = _mm_unpacklo_epi64(w6, w14);
+ d[13] = _mm_unpackhi_epi64(w6, w14);
+ d[14] = _mm_unpacklo_epi64(w7, w15);
+ d[15] = _mm_unpackhi_epi64(w7, w15);
+}
+
+static void transpose_TX_8X8(const uint8_t *src, ptrdiff_t pitchSrc,
+ uint8_t *dst, ptrdiff_t pitchDst) {
+ __m128i r0, r1, r2, r3, r4, r5, r6, r7;
+ __m128i d0d1, d2d3, d4d5, d6d7;
+ r0 = _mm_loadl_epi64((__m128i *)(src + 0 * pitchSrc));
+ r1 = _mm_loadl_epi64((__m128i *)(src + 1 * pitchSrc));
+ r2 = _mm_loadl_epi64((__m128i *)(src + 2 * pitchSrc));
+ r3 = _mm_loadl_epi64((__m128i *)(src + 3 * pitchSrc));
+ r4 = _mm_loadl_epi64((__m128i *)(src + 4 * pitchSrc));
+ r5 = _mm_loadl_epi64((__m128i *)(src + 5 * pitchSrc));
+ r6 = _mm_loadl_epi64((__m128i *)(src + 6 * pitchSrc));
+ r7 = _mm_loadl_epi64((__m128i *)(src + 7 * pitchSrc));
+
+ transpose8x8_sse2(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7, &d0d1, &d2d3, &d4d5,
+ &d6d7);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * pitchDst), d0d1);
+ _mm_storel_epi64((__m128i *)(dst + 1 * pitchDst), _mm_srli_si128(d0d1, 8));
+ _mm_storel_epi64((__m128i *)(dst + 2 * pitchDst), d2d3);
+ _mm_storel_epi64((__m128i *)(dst + 3 * pitchDst), _mm_srli_si128(d2d3, 8));
+ _mm_storel_epi64((__m128i *)(dst + 4 * pitchDst), d4d5);
+ _mm_storel_epi64((__m128i *)(dst + 5 * pitchDst), _mm_srli_si128(d4d5, 8));
+ _mm_storel_epi64((__m128i *)(dst + 6 * pitchDst), d6d7);
+ _mm_storel_epi64((__m128i *)(dst + 7 * pitchDst), _mm_srli_si128(d6d7, 8));
+}
+
+static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
+ ptrdiff_t pitchDst, int width, int height) {
+ for (int j = 0; j < height; j += 8)
+ for (int i = 0; i < width; i += 8)
+ transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc, dst + j * pitchDst + i,
+ pitchDst);
+}
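+
+// Editor's note: transpose() assumes width and height are multiples of 8,
+// which holds for every block size it is called with; the 8x8 tile at
+// source position (i, j) lands at destination position (j, i).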
+
+static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[4];
+
+ dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy);
+ transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &d[0], &d[1], &d[2], &d[3]);
+
+ *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
+ *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
+ *(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
+ *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
+}
+
+static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[8];
+
+ dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+ transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
+ &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
+ &d[3]);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
+ _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
+ _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
+}
+
+static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[8];
+
+ dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy);
+ transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
+ &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ for (int i = 0; i < 8; i++) {
+ *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[4];
+
+ dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+ transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
+ &d[1], &d[2], &d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
+}
+
+static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[8];
+
+ dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+ transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
+ dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
+ d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
+ for (int i = 0; i < 8; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+ _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
+ _mm_srli_si128(d[i], 8));
+ }
+}
+
+static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[16];
+
+ dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[16];
+
+ dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, dy);
+ transpose4x16_sse2(dstvec, d);
+ for (int i = 0; i < 16; i++) {
+ *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+ }
+}
+
+static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[8];
+
+ dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+ for (int i = 4; i < 8; i++) {
+ d[i] = _mm_setzero_si128();
+ }
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+ for (int i = 0; i < 4; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m256i dstvec[16], d[16];
+
+ dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+ for (int i = 8; i < 16; i++) {
+ dstvec[i] = _mm256_setzero_si256();
+ }
+ transpose16x32_avx2(dstvec, d);
+
+ for (int i = 0; i < 16; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride),
+ _mm256_castsi256_si128(d[i]));
+ }
+ for (int i = 0; i < 16; i++) {
+ _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
+ _mm256_extracti128_si256(d[i], 1));
+ }
+}
+
+static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[32], d[16];
+
+ dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, dy);
+
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+ transpose16x8_8x16_sse2(
+ &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
+ &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
+ &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
+ &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
+ &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
+ &d[6 + 8], &d[7 + 8]);
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
+ }
+}
+
+static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[16];
+
+ dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+ transpose16x16_sse2(dstvec, d);
+
+ for (int i = 0; i < 16; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m256i dstvec[32], d[32];
+
+ dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
+ transpose16x32_avx2(dstvec, d);
+ transpose16x32_avx2(dstvec + 16, d + 16);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride),
+ _mm256_castsi256_si128(d[j]));
+ _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
+ _mm256_castsi256_si128(d[j + 16]));
+ }
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
+ _mm256_extracti128_si256(d[j], 1));
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
+ _mm256_extracti128_si256(d[j + 16], 1));
+ }
+}
+
+static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
+ dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 64, 64);
+}
+
+static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m256i dstvec[16], d[16];
+
+ dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+ transpose16x32_avx2(dstvec, d);
+ // store
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride),
+ _mm256_castsi256_si128(d[j]));
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
+ _mm256_extracti128_si256(d[j], 1));
+ }
+}
+
+static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[32], d[16];
+
+ dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 32; i += 16) {
+ transpose16x16_sse2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8_t dstT[64 * 32];
+ dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8_t dstT[32 * 64];
+ dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
+ transpose(dstT, 32, dst, stride, 64, 32);
+}
+
+static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8_t dstT[64 * 16];
+ dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 16, 64);
+}
+
+static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[64], d[16];
+
+ dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 64; i += 16) {
+ transpose16x16_sse2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_left, int dx, int dy) {
+ (void)above;
+ (void)dx;
+ assert(dx == 1);
+ assert(dy > 0);
+
+ if (bw == bh) {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 64:
+ dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ } else {
+ if (bw < bh) {
+ if (bw + bw == bh) {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ } else {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ }
+ } else {
+ if (bh + bh == bw) {
+ switch (bh) {
+ case 4:
+ dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ } else {
+ switch (bh) {
+ case 4:
+ dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ }
+ }
+ }
+}
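+
+// Editor's sketch (hypothetical names, not part of libaom): zone 3 reads
+// only the left column, so every kernel above runs the zone-1 row predictor
+// with `left` as its source and transposes the result. Reusing the
+// dr_prediction_z1_sketch() helper illustrated after
+// av1_dr_prediction_z1_avx2 above:
+static void dr_prediction_z3_sketch(uint8_t *dst, ptrdiff_t stride, int bw,
+                                    int bh, const uint8_t *left, int dy) {
+  uint8_t tmp[64 * 64];  // bw rows of bh pixels, predicted from `left`
+  dr_prediction_z1_sketch(tmp, bh, bh, bw, left, dy);
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) dst[r * stride + c] = tmp[c * bh + r];
+}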
diff --git a/libaom/aom_dsp/x86/jnt_sad_ssse3.c b/libaom/aom_dsp/x86/jnt_sad_ssse3.c
index c3c8824..2e3e2be 100644
--- a/libaom/aom_dsp/x86/jnt_sad_ssse3.c
+++ b/libaom/aom_dsp/x86/jnt_sad_ssse3.c
@@ -192,47 +192,47 @@ unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
return res;
}
-#define jnt_sadMxN_sse2(m, n) \
- unsigned int aom_jnt_sad##m##x##n##_avg_ssse3( \
+#define dist_wtd_sadMxN_sse2(m, n) \
+ unsigned int aom_dist_wtd_sad##m##x##n##_avg_ssse3( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
uint8_t comp_pred[m * n]; \
- aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
- jcp_param); \
+ aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
+ jcp_param); \
return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \
}
-#define jnt_sadMxN_avx2(m, n) \
- unsigned int aom_jnt_sad##m##x##n##_avg_avx2( \
+#define dist_wtd_sadMxN_avx2(m, n) \
+ unsigned int aom_dist_wtd_sad##m##x##n##_avg_avx2( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
uint8_t comp_pred[m * n]; \
- aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
- jcp_param); \
+ aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
+ jcp_param); \
return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \
}
/* clang-format off */
-jnt_sadMxN_sse2(128, 128)
-jnt_sadMxN_sse2(128, 64)
-jnt_sadMxN_sse2(64, 128)
-jnt_sadMxN_sse2(64, 64)
-jnt_sadMxN_sse2(64, 32)
-jnt_sadMxN_sse2(32, 64)
-jnt_sadMxN_sse2(32, 32)
-jnt_sadMxN_sse2(32, 16)
-jnt_sadMxN_sse2(16, 32)
-jnt_sadMxN_sse2(16, 16)
-jnt_sadMxN_sse2(16, 8)
-jnt_sadMxN_sse2(8, 16)
-jnt_sadMxN_sse2(8, 8)
-jnt_sadMxN_sse2(8, 4)
-jnt_sadMxN_sse2(4, 8)
-jnt_sadMxN_sse2(4, 4)
-jnt_sadMxN_sse2(4, 16)
-jnt_sadMxN_sse2(16, 4)
-jnt_sadMxN_sse2(8, 32)
-jnt_sadMxN_sse2(32, 8)
-jnt_sadMxN_sse2(16, 64)
-jnt_sadMxN_sse2(64, 16)
+dist_wtd_sadMxN_sse2(128, 128)
+dist_wtd_sadMxN_sse2(128, 64)
+dist_wtd_sadMxN_sse2(64, 128)
+dist_wtd_sadMxN_sse2(64, 64)
+dist_wtd_sadMxN_sse2(64, 32)
+dist_wtd_sadMxN_sse2(32, 64)
+dist_wtd_sadMxN_sse2(32, 32)
+dist_wtd_sadMxN_sse2(32, 16)
+dist_wtd_sadMxN_sse2(16, 32)
+dist_wtd_sadMxN_sse2(16, 16)
+dist_wtd_sadMxN_sse2(16, 8)
+dist_wtd_sadMxN_sse2(8, 16)
+dist_wtd_sadMxN_sse2(8, 8)
+dist_wtd_sadMxN_sse2(8, 4)
+dist_wtd_sadMxN_sse2(4, 8)
+dist_wtd_sadMxN_sse2(4, 4)
+dist_wtd_sadMxN_sse2(4, 16)
+dist_wtd_sadMxN_sse2(16, 4)
+dist_wtd_sadMxN_sse2(8, 32)
+dist_wtd_sadMxN_sse2(32, 8)
+dist_wtd_sadMxN_sse2(16, 64)
+dist_wtd_sadMxN_sse2(64, 16)
/* clang-format on */
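+
+/* Editor's note: "jnt" (joint) and "dist_wtd" (distance-weighted) name the
+ * same compound-prediction average; this change is terminology only. In
+ * scalar form the average these wrappers rely on is (weights come from
+ * DIST_WTD_COMP_PARAMS and sum to 16, i.e. 1 << DIST_PRECISION_BITS):
+ *
+ *   comp_pred[i] = (fwd_offset * ref[i] + bck_offset * pred[i] + 8) >> 4;
+ */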
diff --git a/libaom/aom_dsp/x86/jnt_variance_ssse3.c b/libaom/aom_dsp/x86/jnt_variance_ssse3.c
index f9a41a2..c8b02f5 100644
--- a/libaom/aom_dsp/x86/jnt_variance_ssse3.c
+++ b/libaom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -29,7 +29,7 @@ void aom_var_filter_block2d_bil_second_pass_ssse3(
unsigned int pixel_step, unsigned int output_height,
unsigned int output_width, const uint8_t *filter);
-static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
+static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
const __m128i *w, const __m128i *r,
void *const result) {
__m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
@@ -45,10 +45,10 @@ static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
}
-void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
- int width, int height, const uint8_t *ref,
- int ref_stride,
- const JNT_COMP_PARAMS *jcp_param) {
+void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
int i;
const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
@@ -67,7 +67,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
__m128i p0 = xx_loadu_128(ref);
__m128i p1 = xx_loadu_128(pred);
- compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
comp_pred += 16;
pred += 16;
@@ -85,7 +85,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
__m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
__m128i p1 = xx_loadu_128(pred);
- compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
comp_pred += 16;
pred += 16;
@@ -107,7 +107,7 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
row3[0], row3[1], row3[2], row3[3]);
__m128i p1 = xx_loadu_128(pred);
- compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
comp_pred += 16;
pred += 16;
@@ -116,11 +116,11 @@ void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
}
}
-void aom_jnt_comp_avg_upsampled_pred_ssse3(
+void aom_dist_wtd_comp_avg_upsampled_pred_ssse3(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search) {
int n;
int i;
aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
@@ -141,52 +141,52 @@ void aom_jnt_comp_avg_upsampled_pred_ssse3(
__m128i p0 = xx_loadu_128(comp_pred);
__m128i p1 = xx_loadu_128(pred);
- compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
comp_pred += 16;
pred += 16;
}
}
-#define JNT_SUBPIX_AVG_VAR(W, H) \
- uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3( \
- const uint8_t *a, int a_stride, int xoffset, int yoffset, \
- const uint8_t *b, int b_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
- \
- aom_var_filter_block2d_bil_first_pass_ssse3( \
- a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_var_filter_block2d_bil_second_pass_ssse3( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \
- jcp_param); \
- \
- return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
+#define DIST_WTD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_ssse3( \
+ a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_ssse3( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \
+ jcp_param); \
+ \
+ return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
}
-JNT_SUBPIX_AVG_VAR(128, 128)
-JNT_SUBPIX_AVG_VAR(128, 64)
-JNT_SUBPIX_AVG_VAR(64, 128)
-JNT_SUBPIX_AVG_VAR(64, 64)
-JNT_SUBPIX_AVG_VAR(64, 32)
-JNT_SUBPIX_AVG_VAR(32, 64)
-JNT_SUBPIX_AVG_VAR(32, 32)
-JNT_SUBPIX_AVG_VAR(32, 16)
-JNT_SUBPIX_AVG_VAR(16, 32)
-JNT_SUBPIX_AVG_VAR(16, 16)
-JNT_SUBPIX_AVG_VAR(16, 8)
-JNT_SUBPIX_AVG_VAR(8, 16)
-JNT_SUBPIX_AVG_VAR(8, 8)
-JNT_SUBPIX_AVG_VAR(8, 4)
-JNT_SUBPIX_AVG_VAR(4, 8)
-JNT_SUBPIX_AVG_VAR(4, 4)
-JNT_SUBPIX_AVG_VAR(4, 16)
-JNT_SUBPIX_AVG_VAR(16, 4)
-JNT_SUBPIX_AVG_VAR(8, 32)
-JNT_SUBPIX_AVG_VAR(32, 8)
-JNT_SUBPIX_AVG_VAR(16, 64)
-JNT_SUBPIX_AVG_VAR(64, 16)
+DIST_WTD_SUBPIX_AVG_VAR(128, 128)
+DIST_WTD_SUBPIX_AVG_VAR(128, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 128)
+DIST_WTD_SUBPIX_AVG_VAR(64, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 64)
+DIST_WTD_SUBPIX_AVG_VAR(32, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 32)
+DIST_WTD_SUBPIX_AVG_VAR(16, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 8)
+DIST_WTD_SUBPIX_AVG_VAR(8, 16)
+DIST_WTD_SUBPIX_AVG_VAR(8, 8)
+DIST_WTD_SUBPIX_AVG_VAR(8, 4)
+DIST_WTD_SUBPIX_AVG_VAR(4, 8)
+DIST_WTD_SUBPIX_AVG_VAR(4, 4)
+DIST_WTD_SUBPIX_AVG_VAR(4, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 4)
+DIST_WTD_SUBPIX_AVG_VAR(8, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 8)
+DIST_WTD_SUBPIX_AVG_VAR(16, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 16)
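+
+/* Editor's note: each instantiation above filters block `a` with a 2-tap
+ * bilinear kernel, horizontally by xoffset then vertically by yoffset,
+ * blends the result with second_pred using the distance weights, and
+ * returns the variance of the blend against the reference block `b`. */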
diff --git a/libaom/aom_dsp/x86/loopfilter_sse2.c b/libaom/aom_dsp/x86/loopfilter_sse2.c
index 26f249e..c021f50 100644
--- a/libaom/aom_dsp/x86/loopfilter_sse2.c
+++ b/libaom/aom_dsp/x86/loopfilter_sse2.c
@@ -16,237 +16,69 @@
#include "aom_dsp/x86/synonyms.h"
#include "aom_ports/mem.h"
#include "aom_ports/emmintrin_compat.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
static INLINE __m128i abs_diff(__m128i a, __m128i b) {
return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}
-static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
- __m128i *x2, __m128i *x3,
- __m128i *d0, __m128i *d1,
- __m128i *d2, __m128i *d3) {
- // input
- // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
- // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
- // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
- // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
- // output
- // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
- // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
- // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
- // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-
- __m128i w0, w1;
-
- w0 = _mm_unpacklo_epi8(
- *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
- *d0 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-
- *d1 = _mm_srli_si128(*d0,
- 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
- *d2 = _mm_srli_si128(*d0,
- 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
- *d3 = _mm_srli_si128(*d0,
- 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-}
-
-static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
- __m128i *x3, __m128i *d0, __m128i *d1,
- __m128i *d2, __m128i *d3, __m128i *d4,
- __m128i *d5, __m128i *d6,
- __m128i *d7) {
- // input
- // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
- // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
- // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
- // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
- // output
- // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
- // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
- // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
- // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
- // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
- // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
- // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
- // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
-
- __m128i w0, w1, ww0, ww1;
-
+// This function treats its input as 2 parallel 8x4 matrices, transposes each
+// of them to 4x8 independently while flipping the second matrix horizontally.
+// Used to create the p/q sample pairs for the 14-tap loop filter.
+static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *q0p0,
+ __m128i *q1p1, __m128i *q2p2,
+ __m128i *q3p3, __m128i *q4p4,
+ __m128i *q5p5, __m128i *q6p6,
+ __m128i *q7p7) {
+ __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
w0 = _mm_unpacklo_epi8(
*x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
w1 = _mm_unpacklo_epi8(
*x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpackhi_epi8(
+ *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
+ w3 = _mm_unpackhi_epi8(
+ *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
ww0 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
ww1 = _mm_unpackhi_epi16(
- w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-
- *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
- *d1 = _mm_srli_si128(ww0,
- 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
- *d2 = _mm_srli_si128(ww0,
- 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
- *d3 = _mm_srli_si128(ww0,
- 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-
- *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
- *d5 = _mm_srli_si128(ww1,
- 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
- *d6 = _mm_srli_si128(ww1,
- 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
- *d7 = _mm_srli_si128(ww1,
- 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
-}
-
-static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
- __m128i *x3, __m128i *x4, __m128i *x5,
- __m128i *x6, __m128i *x7, __m128i *d0,
- __m128i *d1, __m128i *d2,
- __m128i *d3) {
- // input
- // x0 00 01 02 03 04 05 06 07
- // x1 10 11 12 13 14 15 16 17
- // x2 20 21 22 23 24 25 26 27
- // x3 30 31 32 33 34 35 36 37
- // x4 40 41 42 43 44 45 46 47
- // x5 50 51 52 53 54 55 56 57
- // x6 60 61 62 63 64 65 66 67
- // x7 70 71 72 73 74 75 76 77
- // output
- // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
- // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
- // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
- // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
-
- __m128i w0, w1, w2, w3, w4, w5;
-
- w0 = _mm_unpacklo_epi8(
- *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
- w2 = _mm_unpacklo_epi8(
- *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-
- w3 = _mm_unpacklo_epi8(
- *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
- w4 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- w5 = _mm_unpacklo_epi16(
- w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
- *d0 = _mm_unpacklo_epi32(
- w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- *d1 = _mm_srli_si128(*d0, 8);
- *d2 = _mm_unpackhi_epi32(
- w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- *d3 = _mm_srli_si128(*d2, 8);
-}
-
-static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
- __m128i *x3, __m128i *x4, __m128i *x5,
- __m128i *x6, __m128i *x7, __m128i *d0d1,
- __m128i *d2d3, __m128i *d4d5,
- __m128i *d6d7) {
- __m128i w0, w1, w2, w3, w4, w5, w6, w7;
- // x0 00 01 02 03 04 05 06 07
- // x1 10 11 12 13 14 15 16 17
- w0 = _mm_unpacklo_epi8(
- *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-
- // x2 20 21 22 23 24 25 26 27
- // x3 30 31 32 33 34 35 36 37
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
- // x4 40 41 42 43 44 45 46 47
- // x5 50 51 52 53 54 55 56 57
- w2 = _mm_unpacklo_epi8(
- *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-
- // x6 60 61 62 63 64 65 66 67
- // x7 70 71 72 73 74 75 76 77
- w3 = _mm_unpacklo_epi8(
- *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
- w4 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- w5 = _mm_unpacklo_epi16(
- w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
- *d0d1 = _mm_unpacklo_epi32(
- w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- *d2d3 = _mm_unpackhi_epi32(
- w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
- w6 = _mm_unpackhi_epi16(
- w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- w7 = _mm_unpackhi_epi16(
- w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
-
- *d4d5 = _mm_unpacklo_epi32(
- w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- *d6d7 = _mm_unpackhi_epi32(
- w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-}
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ww2 = _mm_unpacklo_epi16(
+ w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311
+ ww3 = _mm_unpackhi_epi16(
+ w2,
+ w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315
-static INLINE void transpose16x8_8x16_sse2(
- __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
- __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
- __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
- __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
- __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
- __m128i w10, w11, w12, w13, w14, w15;
-
- w0 = _mm_unpacklo_epi8(*x0, *x1);
- w1 = _mm_unpacklo_epi8(*x2, *x3);
- w2 = _mm_unpacklo_epi8(*x4, *x5);
- w3 = _mm_unpacklo_epi8(*x6, *x7);
-
- w8 = _mm_unpacklo_epi8(*x8, *x9);
- w9 = _mm_unpacklo_epi8(*x10, *x11);
- w10 = _mm_unpacklo_epi8(*x12, *x13);
- w11 = _mm_unpacklo_epi8(*x14, *x15);
-
- w4 = _mm_unpacklo_epi16(w0, w1);
- w5 = _mm_unpacklo_epi16(w2, w3);
- w12 = _mm_unpacklo_epi16(w8, w9);
- w13 = _mm_unpacklo_epi16(w10, w11);
-
- w6 = _mm_unpacklo_epi32(w4, w5);
- w7 = _mm_unpackhi_epi32(w4, w5);
- w14 = _mm_unpacklo_epi32(w12, w13);
- w15 = _mm_unpackhi_epi32(w12, w13);
-
- // Store first 4-line result
- *d0 = _mm_unpacklo_epi64(w6, w14);
- *d1 = _mm_unpackhi_epi64(w6, w14);
- *d2 = _mm_unpacklo_epi64(w7, w15);
- *d3 = _mm_unpackhi_epi64(w7, w15);
-
- w4 = _mm_unpackhi_epi16(w0, w1);
- w5 = _mm_unpackhi_epi16(w2, w3);
- w12 = _mm_unpackhi_epi16(w8, w9);
- w13 = _mm_unpackhi_epi16(w10, w11);
-
- w6 = _mm_unpacklo_epi32(w4, w5);
- w7 = _mm_unpackhi_epi32(w4, w5);
- w14 = _mm_unpacklo_epi32(w12, w13);
- w15 = _mm_unpackhi_epi32(w12, w13);
-
- // Store second 4-line result
- *d4 = _mm_unpacklo_epi64(w6, w14);
- *d5 = _mm_unpackhi_epi64(w6, w14);
- *d6 = _mm_unpacklo_epi64(w7, w15);
- *d7 = _mm_unpackhi_epi64(w7, w15);
+ *q7p7 = _mm_unpacklo_epi32(
+ ww0,
+ _mm_srli_si128(
+ ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx
+ *q6p6 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww0, 4),
+ ww3); // 01 11 21 31 014 114 214 314 xx xx xx xx xx xx xx xx
+ *q5p5 = _mm_unpackhi_epi32(
+ ww0,
+ _mm_slli_si128(
+ ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx xx xx xx xx xx
+ *q4p4 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww0, 12),
+ ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx
+ *q3p3 = _mm_unpacklo_epi32(
+ ww1,
+ _mm_srli_si128(
+ ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx
+ *q2p2 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww1, 4),
+ ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx
+ *q1p1 = _mm_unpackhi_epi32(
+ ww1,
+ _mm_slli_si128(
+ ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ *q0p0 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww1, 12),
+ ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
}
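The byte shuffles above implement a mirrored transpose: output pair qKpK packs column 7-K of the four input rows (the p side) together with the horizontally flipped column 8+K (the q side). A scalar sketch of that mapping, under the assumption that each input row holds the 16 pixels p7..p0 q0..q7 across the filtered edge (the model function and array layout are illustrative, not part of the patch):

    #include <stdint.h>

    static void transpose_pq_14_model(const uint8_t x[4][16],
                                      uint8_t qkpk[8][8]) {
      // qkpk[7] models q7p7, ..., qkpk[0] models q0p0.
      for (int k = 0; k < 8; ++k) {
        for (int r = 0; r < 4; ++r) {
          qkpk[k][r] = x[r][7 - k];      // p-side column of every row
          qkpk[k][4 + r] = x[r][8 + k];  // mirrored q-side column
        }
      }
    }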
// this function treats its input as 2 parallel 8x4 matrices, transposes each of
@@ -306,116 +138,6 @@ static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
*pq3 = _mm_unpackhi_epi64(d2, d3); // pq
}
-static INLINE void transpose8x16_16x8_sse2(
- __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
- __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
- __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
- __m128i *d12d13, __m128i *d14d15) {
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
- __m128i w10, w11, w12, w13, w14, w15;
-
- w0 = _mm_unpacklo_epi8(*x0, *x1);
- w1 = _mm_unpacklo_epi8(*x2, *x3);
- w2 = _mm_unpacklo_epi8(*x4, *x5);
- w3 = _mm_unpacklo_epi8(*x6, *x7);
-
- w8 = _mm_unpackhi_epi8(*x0, *x1);
- w9 = _mm_unpackhi_epi8(*x2, *x3);
- w10 = _mm_unpackhi_epi8(*x4, *x5);
- w11 = _mm_unpackhi_epi8(*x6, *x7);
-
- w4 = _mm_unpacklo_epi16(w0, w1);
- w5 = _mm_unpacklo_epi16(w2, w3);
- w12 = _mm_unpacklo_epi16(w8, w9);
- w13 = _mm_unpacklo_epi16(w10, w11);
-
- w6 = _mm_unpacklo_epi32(w4, w5);
- w7 = _mm_unpackhi_epi32(w4, w5);
- w14 = _mm_unpacklo_epi32(w12, w13);
- w15 = _mm_unpackhi_epi32(w12, w13);
-
- // Store first 4-line result
- *d0d1 = _mm_unpacklo_epi64(w6, w14);
- *d2d3 = _mm_unpackhi_epi64(w6, w14);
- *d4d5 = _mm_unpacklo_epi64(w7, w15);
- *d6d7 = _mm_unpackhi_epi64(w7, w15);
-
- w4 = _mm_unpackhi_epi16(w0, w1);
- w5 = _mm_unpackhi_epi16(w2, w3);
- w12 = _mm_unpackhi_epi16(w8, w9);
- w13 = _mm_unpackhi_epi16(w10, w11);
-
- w6 = _mm_unpacklo_epi32(w4, w5);
- w7 = _mm_unpackhi_epi32(w4, w5);
- w14 = _mm_unpacklo_epi32(w12, w13);
- w15 = _mm_unpackhi_epi32(w12, w13);
-
- // Store second 4-line result
- *d8d9 = _mm_unpacklo_epi64(w6, w14);
- *d10d11 = _mm_unpackhi_epi64(w6, w14);
- *d12d13 = _mm_unpacklo_epi64(w7, w15);
- *d14d15 = _mm_unpackhi_epi64(w7, w15);
-}
-
-// this function treats its input as 2 parallel 8x4 matrices, transposes each of
-// them to 4x8 independently while flipping the second matrix horizontaly. Used
-// for 14 taps pq pairs creation
-static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
- __m128i *x3, __m128i *q0p0,
- __m128i *q1p1, __m128i *q2p2,
- __m128i *q3p3, __m128i *q4p4,
- __m128i *q5p5, __m128i *q6p6,
- __m128i *q7p7) {
- __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
- w0 = _mm_unpacklo_epi8(
- *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- w2 = _mm_unpackhi_epi8(
- *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
- w3 = _mm_unpackhi_epi8(
- *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
-
- ww0 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- ww1 = _mm_unpackhi_epi16(
- w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- ww2 = _mm_unpacklo_epi16(
- w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311
- ww3 = _mm_unpackhi_epi16(
- w2,
- w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315
-
- *q7p7 = _mm_unpacklo_epi32(
- ww0,
- _mm_srli_si128(
- ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx
- *q6p6 = _mm_unpackhi_epi32(
- _mm_slli_si128(ww0, 4),
- ww3); // 01 11 21 31 014 114 214 314 xx xx xx xxxx xx xx xx
- *q5p5 = _mm_unpackhi_epi32(
- ww0,
- _mm_slli_si128(
- ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx x xx xx xx xxx
- *q4p4 = _mm_unpacklo_epi32(
- _mm_srli_si128(ww0, 12),
- ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx
- *q3p3 = _mm_unpacklo_epi32(
- ww1,
- _mm_srli_si128(
- ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx
- *q2p2 = _mm_unpackhi_epi32(
- _mm_slli_si128(ww1, 4),
- ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx
- *q1p1 = _mm_unpackhi_epi32(
- ww1,
- _mm_slli_si128(
- ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
- *q0p0 = _mm_unpacklo_epi32(
- _mm_srli_si128(ww1, 12),
- ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
-}
-
static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
__m128i *hev, __m128i *mask,
__m128i *qs1qs0, __m128i *ps1ps0) {
diff --git a/libaom/aom_dsp/x86/lpf_common_sse2.h b/libaom/aom_dsp/x86/lpf_common_sse2.h
index 8970fe7..6ed2cbf 100644
--- a/libaom/aom_dsp/x86/lpf_common_sse2.h
+++ b/libaom/aom_dsp/x86/lpf_common_sse2.h
@@ -212,4 +212,284 @@ static INLINE void highbd_transpose8x16_sse2(
d4 + 1, d5 + 1, d6 + 1, d7 + 1);
}
+// Low bit depth functions
+static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ *d0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+
+ *d1 = _mm_srli_si128(*d0,
+ 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(*d0,
+ 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(*d0,
+ 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4,
+ __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1, ww0, ww1;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ ww0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+
+ *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(ww0,
+ 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(ww0,
+ 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(ww0,
+ 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d5 = _mm_srli_si128(ww1,
+ 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d6 = _mm_srli_si128(ww1,
+ 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d7 = _mm_srli_si128(ww1,
+ 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7, __m128i *d0,
+ __m128i *d1, __m128i *d2,
+ __m128i *d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+ // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1, w2, w3, w4, w5;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ *d0 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d1 = _mm_srli_si128(*d0, 8);
+ *d2 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ *d3 = _mm_srli_si128(*d2, 8);
+}
+
+static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7, __m128i *d0d1,
+ __m128i *d2d3, __m128i *d4d5,
+ __m128i *d6d7) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7;
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ *d0d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d2d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ w6 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ w7 = _mm_unpackhi_epi16(
+ w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+
+ *d4d5 = _mm_unpacklo_epi32(
+ w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ *d6d7 = _mm_unpackhi_epi32(
+ w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+}
+
+static INLINE void transpose16x8_8x16_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
+ __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
+ __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
+ __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(*x0, *x1);
+ w1 = _mm_unpacklo_epi8(*x2, *x3);
+ w2 = _mm_unpacklo_epi8(*x4, *x5);
+ w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+ w8 = _mm_unpacklo_epi8(*x8, *x9);
+ w9 = _mm_unpacklo_epi8(*x10, *x11);
+ w10 = _mm_unpacklo_epi8(*x12, *x13);
+ w11 = _mm_unpacklo_epi8(*x14, *x15);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ *d0 = _mm_unpacklo_epi64(w6, w14);
+ *d1 = _mm_unpackhi_epi64(w6, w14);
+ *d2 = _mm_unpacklo_epi64(w7, w15);
+ *d3 = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ *d4 = _mm_unpacklo_epi64(w6, w14);
+ *d5 = _mm_unpackhi_epi64(w6, w14);
+ *d6 = _mm_unpacklo_epi64(w7, w15);
+ *d7 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static INLINE void transpose8x16_16x8_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
+ __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
+ __m128i *d12d13, __m128i *d14d15) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(*x0, *x1);
+ w1 = _mm_unpacklo_epi8(*x2, *x3);
+ w2 = _mm_unpacklo_epi8(*x4, *x5);
+ w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+ w8 = _mm_unpackhi_epi8(*x0, *x1);
+ w9 = _mm_unpackhi_epi8(*x2, *x3);
+ w10 = _mm_unpackhi_epi8(*x4, *x5);
+ w11 = _mm_unpackhi_epi8(*x6, *x7);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ *d0d1 = _mm_unpacklo_epi64(w6, w14);
+ *d2d3 = _mm_unpackhi_epi64(w6, w14);
+ *d4d5 = _mm_unpacklo_epi64(w7, w15);
+ *d6d7 = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ *d8d9 = _mm_unpacklo_epi64(w6, w14);
+ *d10d11 = _mm_unpackhi_epi64(w6, w14);
+ *d12d13 = _mm_unpacklo_epi64(w7, w15);
+ *d14d15 = _mm_unpackhi_epi64(w7, w15);
+}
+
#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
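For orientation, a usage sketch of the relocated 8x8 byte transpose: load eight 8-pixel rows with 64-bit loads and hand them to transpose8x8_sse2, which returns the transposed block as four registers of two output rows each (the wrapper name, src, and stride are illustrative):

    #include <emmintrin.h>
    #include <stdint.h>
    // Assumes "aom_dsp/x86/lpf_common_sse2.h" is included for
    // transpose8x8_sse2().

    static void transpose_8x8_bytes(const uint8_t *src, int stride,
                                    __m128i out[4] /* d0d1..d6d7 */) {
      __m128i row[8];
      for (int i = 0; i < 8; ++i)  // one 8-pixel row per 64-bit load
        row[i] = _mm_loadl_epi64((const __m128i *)(src + i * stride));
      transpose8x8_sse2(&row[0], &row[1], &row[2], &row[3], &row[4], &row[5],
                        &row[6], &row[7], &out[0], &out[1], &out[2], &out[3]);
    }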
diff --git a/libaom/aom_dsp/x86/quantize_sse2.c b/libaom/aom_dsp/x86/quantize_sse2.c
index d3de6e2..ebef1fb 100644
--- a/libaom/aom_dsp/x86/quantize_sse2.c
+++ b/libaom/aom_dsp/x86/quantize_sse2.c
@@ -18,28 +18,6 @@
#include "aom/aom_integer.h"
#include "aom_dsp/x86/quantize_x86.h"
-static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
- assert(sizeof(tran_low_t) == 4);
-
- return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
- (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
- (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
- (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-}
-
-static INLINE void store_coefficients(__m128i coeff_vals,
- tran_low_t *coeff_ptr) {
- assert(sizeof(tran_low_t) == 4);
-
- __m128i one = _mm_set1_epi16(1);
- __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
- __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
- __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
- __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
- _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-}
-
void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
diff --git a/libaom/aom_dsp/x86/quantize_ssse3.c b/libaom/aom_dsp/x86/quantize_ssse3.c
new file mode 100644
index 0000000..25980a0
--- /dev/null
+++ b/libaom/aom_dsp/x86/quantize_ssse3.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round,
+ const __m128i quant,
+ const __m128i *shift) {
+ __m128i tmp, qcoeff, tmp1;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ tmp = _mm_mullo_epi16(qcoeff, *shift);
+ tmp = _mm_srli_epi16(tmp, 14);
+ tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+ tmp1 = _mm_slli_epi16(tmp1, 2);
+ *coeff = _mm_or_si128(tmp, tmp1);
+}
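The closing mullo/mulhi pair synthesizes the full 32-bit product qcoeff * shift and extracts it from bit 14 upward: (lo >> 14) | (hi << 2) equals (prod >> 14) truncated to 16 bits, i.e. a shift by 16 - log_scale with log_scale = 2 for 64x64 transforms. A one-lane scalar model (the helper name is illustrative; note the SIMD add saturates where this model wraps):

    #include <stdint.h>

    static int16_t qcoeff_64x64_model(int16_t abs_coeff, int16_t round,
                                      int16_t quant, int16_t shift) {
      const int16_t tmp = (int16_t)(abs_coeff + round);  // _mm_adds_epi16
      const int16_t tmp2 =
          (int16_t)((((int32_t)tmp * quant) >> 16) + tmp);  // mulhi + add
      return (int16_t)(((int32_t)tmp2 * shift) >> 14);  // (lo >> 14) | (hi << 2)
    }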
+
+static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff,
+ const __m128i dequant,
+ const __m128i zero,
+ tran_low_t *dqcoeff) {
+ // Un-sign to bias rounding like C.
+ const __m128i coeff = _mm_abs_epi16(qcoeff);
+
+ const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
+ const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
+
+ const __m128i low = _mm_mullo_epi16(coeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+ __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ // "Divide" by 4.
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2);
+ dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2);
+
+ dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
+ dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
+void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i two = _mm_set1_epi16(2);
+ int index;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan;
+ (void)n_coeffs;
+
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, two);
+ round = _mm_add_epi16(round, two);
+ zbin = _mm_srli_epi16(zbin, 2);
+ round = _mm_srli_epi16(round, 2);
+ zbin = _mm_sub_epi16(zbin, one);
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < 1024; index += 16) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ continue;
+ }
+ calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+ calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8 + index);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/libaom/aom_dsp/x86/quantize_x86.h b/libaom/aom_dsp/x86/quantize_x86.h
index 4eed7dd..b2de01b 100644
--- a/libaom/aom_dsp/x86/quantize_x86.h
+++ b/libaom/aom_dsp/x86/quantize_x86.h
@@ -32,6 +32,11 @@ static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
return _mm_sub_epi16(a, sign);
}
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi32(a, sign);
+}
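invert_sign_32_sse2 is the standard two's-complement conditional negate, widened here to 32-bit lanes: with sign == 0 the value passes through, and with sign == -1 the result is (a ^ -1) - (-1) = ~a + 1 = -a. The scalar identity:

    // sign must be 0 (keep) or -1 (negate), as produced by a compare or srai.
    static int32_t invert_sign_model(int32_t a, int32_t sign) {
      return (a ^ sign) - sign;
    }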
+
static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
const __m128i quant, const __m128i shift) {
__m128i tmp, qcoeff;
@@ -41,10 +46,53 @@ static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
*coeff = _mm_mulhi_epi16(qcoeff, shift);
}
+static INLINE void calculate_qcoeff_log_scale(__m128i *coeff,
+ const __m128i round,
+ const __m128i quant,
+ const __m128i *shift,
+ const int *log_scale) {
+ __m128i tmp, tmp1, qcoeff;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ tmp = _mm_mullo_epi16(qcoeff, *shift);
+ tmp = _mm_srli_epi16(tmp, (16 - *log_scale));
+ tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+ tmp1 = _mm_slli_epi16(tmp1, *log_scale);
+ *coeff = _mm_or_si128(tmp, tmp1);
+}
+
static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
return _mm_mullo_epi16(qcoeff, dequant);
}
+static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff,
+ __m128i dequant,
+ const __m128i zero,
+ tran_low_t *dqcoeff,
+ const int *log_scale) {
+ // Compute absolute values; keep the sign words to restore signs below.
+ __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15);
+ __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign);
+
+ const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, zero);
+ const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, zero);
+
+ const __m128i low = _mm_mullo_epi16(coeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+ __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale);
+ dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale);
+
+ dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0);
+ dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
// to zbin to add 1 to the index in 'scan'.
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
@@ -75,3 +123,23 @@ static INLINE int16_t accumulate_eob(__m128i eob) {
eob = _mm_max_epi16(eob, eob_shuffled);
return _mm_extract_epi16(eob, 1);
}
+
+static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+ assert(sizeof(tran_low_t) == 4);
+ const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr));
+ const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+ return _mm_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void store_coefficients(__m128i coeff_vals,
+ tran_low_t *coeff_ptr) {
+ assert(sizeof(tran_low_t) == 4);
+
+ __m128i one = _mm_set1_epi16(1);
+ __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+ __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+ __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+ __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+ _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
+}
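These relocated helpers bridge the 32-bit tran_low_t storage format and the 16-bit SIMD pipeline: load_coefficients saturating-packs eight 32-bit coefficients into one register, and store_coefficients sign-extends them back (_mm_mulhi_epi16 against 1 yields the 0/-1 sign word that the unpacks interleave in). Per lane they reduce to this scalar sketch (the model names are illustrative):

    #include <stdint.h>
    typedef int32_t tran_low_t;  // libaom stores coefficients as 32-bit values

    // store: plain 16 -> 32 sign extension.
    static void store_coefficients_model(const int16_t v[8], tran_low_t *p) {
      for (int i = 0; i < 8; ++i) p[i] = v[i];
    }

    // load: lanes saturate to int16_t, a no-op for in-range coefficients.
    static void load_coefficients_model(const tran_low_t *p, int16_t v[8]) {
      for (int i = 0; i < 8; ++i) {
        const tran_low_t c = p[i];
        v[i] = (int16_t)(c > INT16_MAX ? INT16_MAX
                                       : (c < INT16_MIN ? INT16_MIN : c));
      }
    }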
diff --git a/libaom/aom_dsp/x86/sse_avx2.c b/libaom/aom_dsp/x86/sse_avx2.c
index fa45687..42df981 100644
--- a/libaom/aom_dsp/x86/sse_avx2.c
+++ b/libaom/aom_dsp/x86/sse_avx2.c
@@ -21,12 +21,11 @@ static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
const uint8_t *b) {
const __m256i v_a0 = yy_loadu_256(a);
const __m256i v_b0 = yy_loadu_256(b);
- const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0));
- const __m256i v_a01_w =
- _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1));
- const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0));
- const __m256i v_b01_w =
- _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1));
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
+ const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
+ const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
+ const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
*sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
@@ -35,15 +34,13 @@ static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
int64_t sum;
- const __m256i sum0_4x64 =
- _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all));
- const __m256i sum1_4x64 =
- _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1));
+ __m256i zero = _mm256_setzero_si256();
+ const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
+ const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
_mm256_extracti128_si256(sum_4x64, 1));
const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
-
xx_storel_64(&sum, sum_1x64);
return sum;
}
@@ -86,7 +83,6 @@ static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
*sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
}
-
static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride, __m256i *sum) {
const __m128i v_a0 = xx_loadl_64(a);
@@ -98,12 +94,12 @@ static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
*sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
}
-
int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int width, int height) {
int32_t y = 0;
int64_t sse = 0;
__m256i sum = _mm256_setzero_si256();
+ __m256i zero = _mm256_setzero_si256();
switch (width) {
case 4:
do {
@@ -126,14 +122,26 @@ int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
case 16:
do {
const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_a1 = xx_loadu_128(a + a_stride);
const __m128i v_b0 = xx_loadu_128(b);
- const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0);
- const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0);
- const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
- sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
- a += a_stride;
- b += b_stride;
- y += 1;
+ const __m128i v_b1 = xx_loadu_128(b + b_stride);
+ const __m256i v_a =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
+ const __m256i v_b =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
+ const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
+ const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
+ const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
+ const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
+ const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
+ const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
+ const __m256i temp =
+ _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
+ _mm256_madd_epi16(v_bsub, v_bsub));
+ sum = _mm256_add_epi32(sum, temp);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
} while (y < height);
sse = summary_all_avx2(&sum);
break;
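Two notes on this rework: AVX2 unpacklo/unpackhi interleave within each 128-bit lane, so the widened words come out permuted relative to the cvtepu8 version, which is harmless because every product is only summed; and the 16-wide case now consumes two rows per iteration by stacking them into one 256-bit register (height is even for these block sizes). The quantity computed is unchanged, per this scalar reference:

    #include <stdint.h>

    static int64_t sse_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                           int b_stride, int width, int height) {
      int64_t sse = 0;
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
          const int d = a[x] - b[x];  // pixel difference
          sse += d * d;               // accumulate squared error
        }
        a += a_stride;
        b += b_stride;
      }
      return sse;
    }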
diff --git a/libaom/aom_dsp/x86/txfm_common_avx2.h b/libaom/aom_dsp/x86/txfm_common_avx2.h
index 8a40508..06a77e7 100644
--- a/libaom/aom_dsp/x86/txfm_common_avx2.h
+++ b/libaom/aom_dsp/x86/txfm_common_avx2.h
@@ -168,6 +168,36 @@ static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
}
+static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in,
+ __m256i *const out) {
+ const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]);
+ const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]);
+ const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]);
+ const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]);
+ const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]);
+ const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]);
+ const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ const __m256i b0 = _mm256_unpacklo_epi32(a0, a1);
+ const __m256i b1 = _mm256_unpacklo_epi32(a2, a3);
+ const __m256i b2 = _mm256_unpacklo_epi32(a4, a5);
+ const __m256i b3 = _mm256_unpacklo_epi32(a6, a7);
+ const __m256i b4 = _mm256_unpackhi_epi32(a0, a1);
+ const __m256i b5 = _mm256_unpackhi_epi32(a2, a3);
+ const __m256i b6 = _mm256_unpackhi_epi32(a4, a5);
+ const __m256i b7 = _mm256_unpackhi_epi32(a6, a7);
+
+ out[0] = _mm256_unpacklo_epi64(b0, b1);
+ out[1] = _mm256_unpackhi_epi64(b0, b1);
+ out[2] = _mm256_unpacklo_epi64(b4, b5);
+ out[3] = _mm256_unpackhi_epi64(b4, b5);
+ out[4] = _mm256_unpacklo_epi64(b2, b3);
+ out[5] = _mm256_unpackhi_epi64(b2, b3);
+ out[6] = _mm256_unpacklo_epi64(b6, b7);
+ out[7] = _mm256_unpackhi_epi64(b6, b7);
+}
+
static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
for (int i = 0; i < size; ++i) {
out[size - i - 1] = in[i];
@@ -236,6 +266,66 @@ static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input,
}
}
+static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
+ const __m256i scale_rounding =
+ pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
+ const __m256i b = _mm256_madd_epi16(a, scale_rounding);
+ return _mm256_srai_epi32(b, NewSqrt2Bits);
+}
+
+static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a,
+ int32_t *const b) {
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a_lo = _mm256_unpacklo_epi16(a, one);
+ const __m256i a_hi = _mm256_unpackhi_epi16(a, one);
+ const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
+ const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31);
+ _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo));
+ _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi));
+ _mm256_store_si256((__m256i *)(b + 64), temp);
+}
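The rounding trick in store_rect_16bit_to_32bit_w8_avx2: each 16-bit value is interleaved with the constant 1, so the madd against the (NewSqrt2, 1 << (NewSqrt2Bits - 1)) pair computes a * NewSqrt2 + 2^(NewSqrt2Bits - 1) in a single instruction before the arithmetic shift. Assuming libaom's usual constants (NewSqrt2Bits = 12, NewSqrt2 = 5793 ~= sqrt(2) * 4096), one lane reduces to:

    // Fixed-point multiply by sqrt(2) with round-to-nearest.
    static int32_t rect_scale_model(int16_t a) {
      const int kNewSqrt2Bits = 12;  // assumed value, per av1_txfm.h
      const int kNewSqrt2 = 5793;    // assumed: round(sqrt(2) * (1 << 12))
      return (a * kNewSqrt2 + (1 << (kNewSqrt2Bits - 1))) >> kNewSqrt2Bits;
    }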
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2(
+ const __m256i *const in, int32_t *const out, const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride);
+ }
+}
+
+static INLINE void pack_reg(const __m128i *in1, const __m128i *in2,
+ __m256i *out) {
+ out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1);
+ out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1);
+ out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1);
+ out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1);
+ out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1);
+ out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1);
+ out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1);
+ out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1);
+}
+
+static INLINE void extract_reg(const __m256i *in, __m128i *out1) {
+ out1[0] = _mm256_castsi256_si128(in[0]);
+ out1[1] = _mm256_castsi256_si128(in[1]);
+ out1[2] = _mm256_castsi256_si128(in[2]);
+ out1[3] = _mm256_castsi256_si128(in[3]);
+ out1[4] = _mm256_castsi256_si128(in[4]);
+ out1[5] = _mm256_castsi256_si128(in[5]);
+ out1[6] = _mm256_castsi256_si128(in[6]);
+ out1[7] = _mm256_castsi256_si128(in[7]);
+
+ out1[8] = _mm256_extracti128_si256(in[0], 0x01);
+ out1[9] = _mm256_extracti128_si256(in[1], 0x01);
+ out1[10] = _mm256_extracti128_si256(in[2], 0x01);
+ out1[11] = _mm256_extracti128_si256(in[3], 0x01);
+ out1[12] = _mm256_extracti128_si256(in[4], 0x01);
+ out1[13] = _mm256_extracti128_si256(in[5], 0x01);
+ out1[14] = _mm256_extracti128_si256(in[6], 0x01);
+ out1[15] = _mm256_extracti128_si256(in[7], 0x01);
+}
+
#ifdef __cplusplus
}
#endif
diff --git a/libaom/aom_dsp/x86/variance_sse2.c b/libaom/aom_dsp/x86/variance_sse2.c
index c831e3e..f3efc15 100644
--- a/libaom/aom_dsp/x86/variance_sse2.c
+++ b/libaom/aom_dsp/x86/variance_sse2.c
@@ -494,7 +494,7 @@ void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
const int ref_num = 0;
const int is_intrabc = is_intrabc_block(mi);
const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref_num];
const int is_scaled = av1_is_scaled(sf);
if (is_scaled) {
diff --git a/libaom/aom_ports/mem.h b/libaom/aom_ports/mem.h
index 3ffea3c..9e3d424 100644
--- a/libaom/aom_ports/mem.h
+++ b/libaom/aom_ports/mem.h
@@ -66,4 +66,34 @@
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))
+/*!\brief force enum to be unsigned 1 byte*/
+#define UENUM1BYTE(enumvar) \
+ ; \
+ typedef uint8_t enumvar
+
+/*!\brief force enum to be signed 1 byte*/
+#define SENUM1BYTE(enumvar) \
+ ; \
+ typedef int8_t enumvar
+
+/*!\brief force enum to be unsigned 2 byte*/
+#define UENUM2BYTE(enumvar) \
+ ; \
+ typedef uint16_t enumvar
+
+/*!\brief force enum to be signed 2 byte*/
+#define SENUM2BYTE(enumvar) \
+ ; \
+ typedef int16_t enumvar
+
+/*!\brief force enum to be unsigned 4 byte*/
+#define UENUM4BYTE(enumvar) \
+ ; \
+ typedef uint32_t enumvar
+
+/*!\brief force enum to be signed 4 byte*/
+#define SENUM4BYTE(enumvar) \
+ ; \
+ typedef int32_t enumvar
+
#endif // AOM_AOM_PORTS_MEM_H_
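The stray-looking leading semicolon is what makes these macros work: placed after the closing brace of an (anonymous) enum, the macro terminates the enum declaration and then typedefs a fixed-width integer under the intended name. A usage sketch (DEMO_FLAG is an illustrative name, not from the patch):

    enum { DEMO_OFF = 0, DEMO_ON = 1 } UENUM1BYTE(DEMO_FLAG);
    // expands to:
    //   enum { DEMO_OFF = 0, DEMO_ON = 1 };
    //   typedef uint8_t DEMO_FLAG;
    // so a DEMO_FLAG field occupies exactly one byte instead of an int.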
diff --git a/libaom/aom_ports/x86.h b/libaom/aom_ports/x86.h
index 52ee49c..8c18448 100644
--- a/libaom/aom_ports/x86.h
+++ b/libaom/aom_ports/x86.h
@@ -222,11 +222,26 @@ static INLINE int x86_simd_caps(void) {
return flags & mask;
}
-// Note:
-// 32-bit CPU cycle counter is light-weighted for most function performance
-// measurement. For large function (CPU time > a couple of seconds), 64-bit
-// counter should be used.
-// 32-bit CPU cycle counter
+// Fine-Grain Measurement Functions
+//
+// If you are timing a small region of code, access the timestamp counter
+// (TSC) via:
+//
+// unsigned int start = x86_tsc_start();
+// ...
+// unsigned int end = x86_tsc_end();
+// unsigned int diff = end - start;
+//
+// The start/end functions introduce a few more instructions than using
+// x86_readtsc directly, but prevent the CPU's out-of-order execution from
+// affecting the measurement (by keeping earlier/later instructions from being
+// evaluated inside the timed interval). See the white paper, "How to Benchmark Code
+// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by
+// Gabriele Paoloni for more information.
+//
+// If you are timing a large function (CPU time > a couple of seconds), use
+// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The
+// out-of-order leakage that can occur is minimal compared to total runtime.
static INLINE unsigned int x86_readtsc(void) {
#if defined(__GNUC__) && __GNUC__
unsigned int tsc;
@@ -263,6 +278,41 @@ static INLINE uint64_t x86_readtsc64(void) {
#endif
}
+// 32-bit CPU cycle counter with a partial fence against out-of-order execution.
+static INLINE unsigned int x86_readtscp(void) {
+#if defined(__GNUC__) && __GNUC__
+ unsigned int tscp;
+ __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :);
+ return tscp;
+#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+ unsigned int tscp;
+ asm volatile("rdtscp\n\t" : "=a"(tscp) :);
+ return tscp;
+#elif defined(_MSC_VER)
+ unsigned int ui;
+ return (unsigned int)__rdtscp(&ui);
+#else
+#if ARCH_X86_64
+ return (unsigned int)__rdtscp();
+#else
+ __asm rdtscp;
+#endif
+#endif
+}
+
+static INLINE unsigned int x86_tsc_start(void) {
+ unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+ cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ return x86_readtsc();
+}
+
+static INLINE unsigned int x86_tsc_end(void) {
+ uint32_t v = x86_readtscp();
+ unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx;
+ cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ return v;
+}
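Putting the two fenced readers together, timing a region looks like the sketch below (the callback is a placeholder; cycle counts wrap modulo 2^32, which the unsigned subtraction handles for intervals shorter than one wrap):

    static unsigned int measure_cycles(void (*fn)(void)) {
      const unsigned int start = x86_tsc_start();  // cpuid fence, then rdtsc
      fn();                                        // region under test
      const unsigned int end = x86_tsc_end();      // rdtscp, then cpuid fence
      return end - start;
    }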
+
#if defined(__GNUC__) && __GNUC__
#define x86_pause_hint() __asm__ __volatile__("pause \n\t")
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
diff --git a/libaom/aom_scale/aom_scale.cmake b/libaom/aom_scale/aom_scale.cmake
index 197dea6..3199733 100644
--- a/libaom/aom_scale/aom_scale.cmake
+++ b/libaom/aom_scale/aom_scale.cmake
@@ -34,5 +34,9 @@ function(setup_aom_scale_targets)
"AOM_SCALE_INTRIN_DSPR2" "aom")
endif()
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
+
+ # Pass the new lib targets up to the parent scope instance of
+ # $AOM_LIB_TARGETS.
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_scale PARENT_SCOPE)
endfunction()
diff --git a/libaom/aom_scale/aom_scale_rtcd.pl b/libaom/aom_scale/aom_scale_rtcd.pl
index 27378c7..eef6f16 100644
--- a/libaom/aom_scale/aom_scale_rtcd.pl
+++ b/libaom/aom_scale/aom_scale_rtcd.pl
@@ -26,6 +26,8 @@ if (aom_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") {
add_proto qw/void aom_vertical_band_2_1_scale_i/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width";
}
+add_proto qw/int aom_yv12_realloc_with_new_border/, "struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_planes";
+
add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes";
add_proto qw/void aom_yv12_copy_frame/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes";
diff --git a/libaom/aom_scale/generic/yv12config.c b/libaom/aom_scale/generic/yv12config.c
index 7cf3c4f..a5ad1a7 100644
--- a/libaom/aom_scale/generic/yv12config.c
+++ b/libaom/aom_scale/generic/yv12config.c
@@ -46,37 +46,16 @@ int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
return 0;
}
-int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
- int ss_x, int ss_y, int use_highbitdepth,
- int border, int byte_alignment,
- aom_codec_frame_buffer_t *fb,
- aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
-#if CONFIG_SIZE_LIMIT
- if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
-#endif
-
- /* Only support allocating buffers that have a border that's a multiple
- * of 32. The border restriction is required to get 16-byte alignment of
- * the start of the chroma rows without introducing an arbitrary gap
- * between planes, which would break the semantics of things like
- * aom_img_set_rect(). */
- if (border & 0x1f) return -3;
-
+static int realloc_frame_buffer_aligned(
+ YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y,
+ int use_highbitdepth, int border, int byte_alignment,
+ aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb,
+ void *cb_priv, const int y_stride, const uint64_t yplane_size,
+ const uint64_t uvplane_size, const int aligned_width,
+ const int aligned_height, const int uv_width, const int uv_height,
+ const int uv_stride, const int uv_border_w, const int uv_border_h) {
if (ybf) {
const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment;
- const int aligned_width = (width + 7) & ~7;
- const int aligned_height = (height + 7) & ~7;
- const int y_stride = ((aligned_width + 2 * border) + 31) & ~31;
- const uint64_t yplane_size =
- (aligned_height + 2 * border) * (uint64_t)y_stride + byte_alignment;
- const int uv_width = aligned_width >> ss_x;
- const int uv_height = aligned_height >> ss_y;
- const int uv_stride = y_stride >> ss_x;
- const int uv_border_w = border >> ss_x;
- const int uv_border_h = border >> ss_y;
- const uint64_t uvplane_size =
- (uv_height + 2 * uv_border_h) * (uint64_t)uv_stride + byte_alignment;
-
const uint64_t frame_size =
(1 + use_highbitdepth) * (yplane_size + 2 * uvplane_size);
@@ -120,6 +99,7 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
// Allocation to hold larger frame, or first allocation.
aom_free(ybf->buffer_alloc);
ybf->buffer_alloc = NULL;
+ ybf->buffer_alloc_sz = 0;
if (frame_size != (size_t)frame_size) return -1;
@@ -190,6 +170,111 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
return -2;
}
+static int calc_stride_and_planesize(const int ss_x, const int ss_y,
+ const int aligned_width,
+ const int aligned_height, const int border,
+ const int byte_alignment, int *y_stride,
+ int *uv_stride, uint64_t *yplane_size,
+ uint64_t *uvplane_size,
+ const int uv_height) {
+ /* Only support allocating buffers that have a border that's a multiple
+ * of 32. The border restriction is required to get 16-byte alignment of
+ * the start of the chroma rows without introducing an arbitrary gap
+ * between planes, which would break the semantics of things like
+ * aom_img_set_rect(). */
+ if (border & 0x1f) return -3;
+ *y_stride = ((aligned_width + 2 * border) + 31) & ~31;
+ *yplane_size =
+ (aligned_height + 2 * border) * (uint64_t)(*y_stride) + byte_alignment;
+
+ *uv_stride = *y_stride >> ss_x;
+ *uvplane_size = (uv_height + 2 * (border >> ss_y)) * (uint64_t)(*uv_stride) +
+ byte_alignment;
+ return 0;
+}
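For concreteness, plugging a 1920x1080 4:2:0 frame with border = 288 (AOM_BORDER_IN_PIXELS) and byte_alignment = 0 into the factored-out formulas gives:

    // aligned_width  = (1920 + 7) & ~7             = 1920
    // aligned_height = (1080 + 7) & ~7             = 1080
    // y_stride       = ((1920 + 2*288) + 31) & ~31 = 2496
    // yplane_size    = (1080 + 2*288) * 2496       = 4133376 bytes
    // uv_stride      = 2496 >> 1                   = 1248
    // uvplane_size   = (540 + 2*(288 >> 1)) * 1248 = 1033344 bytes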
+
+int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+ int ss_x, int ss_y, int use_highbitdepth,
+ int border, int byte_alignment,
+ aom_codec_frame_buffer_t *fb,
+ aom_get_frame_buffer_cb_fn_t cb, void *cb_priv) {
+#if CONFIG_SIZE_LIMIT
+ if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1;
+#endif
+
+ if (ybf) {
+ int y_stride = 0;
+ int uv_stride = 0;
+ uint64_t yplane_size = 0;
+ uint64_t uvplane_size = 0;
+ const int aligned_width = (width + 7) & ~7;
+ const int aligned_height = (height + 7) & ~7;
+ const int uv_width = aligned_width >> ss_x;
+ const int uv_height = aligned_height >> ss_y;
+ const int uv_border_w = border >> ss_x;
+ const int uv_border_h = border >> ss_y;
+
+ int error = calc_stride_and_planesize(
+ ss_x, ss_y, aligned_width, aligned_height, border, byte_alignment,
+ &y_stride, &uv_stride, &yplane_size, &uvplane_size, uv_height);
+ if (error) return error;
+ return realloc_frame_buffer_aligned(
+ ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
+ byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size,
+ aligned_width, aligned_height, uv_width, uv_height, uv_stride,
+ uv_border_w, uv_border_h);
+ }
+ return -2;
+}
+
+// TODO(anyone): This function allocates memory for the
+// lookahead buffer, assuming the height and width are
+// aligned to 128. Currently, the variance calculation in
+// simple_motion_search_get_best_ref() is done over full
+// superblocks (i.e. integral multiples of the max sb
+// size, 128 or 64), so partial sbs need up to 127 pixels
+// beyond the frame boundary. The 128-alignment requirement
+// on the lookahead buffer can be removed once the variance
+// calculation is adjusted for partial sbs.
+
+// NOTE: Chroma width and height need not be aligned to 128, since the
+// variance calculation happens only on the luma plane.
+int aom_realloc_lookahead_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+ int ss_x, int ss_y, int use_highbitdepth,
+ int border, int byte_alignment,
+ aom_codec_frame_buffer_t *fb,
+ aom_get_frame_buffer_cb_fn_t cb,
+ void *cb_priv) {
+ if (ybf) {
+ int y_stride = 0;
+ int uv_stride = 0;
+ uint64_t yplane_size = 0;
+ uint64_t uvplane_size = 0;
+ const int aligned_128_width = (width + 127) & ~127;
+ const int aligned_128_height = (height + 127) & ~127;
+ const int aligned_width = (width + 7) & ~7;
+ const int aligned_height = (height + 7) & ~7;
+ const int uv_64_height = aligned_128_height >> ss_y;
+ const int uv_width = aligned_width >> ss_x;
+ const int uv_height = aligned_height >> ss_y;
+ const int uv_border_w = border >> ss_x;
+ const int uv_border_h = border >> ss_y;
+
+ int error = calc_stride_and_planesize(
+ ss_x, ss_y, aligned_128_width, aligned_128_height, border,
+ byte_alignment, &y_stride, &uv_stride, &yplane_size, &uvplane_size,
+ uv_64_height);
+ if (error) return error;
+
+ return realloc_frame_buffer_aligned(
+ ybf, width, height, ss_x, ss_y, use_highbitdepth, border,
+ byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size,
+ aligned_width, aligned_height, uv_width, uv_height, uv_stride,
+ uv_border_w, uv_border_h);
+ }
+ return -2;
+}
+
int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
int ss_x, int ss_y, int use_highbitdepth, int border,
int byte_alignment) {
diff --git a/libaom/aom_scale/generic/yv12extend.c b/libaom/aom_scale/generic/yv12extend.c
index 127ca23..6e9cfff 100644
--- a/libaom/aom_scale/generic/yv12extend.c
+++ b/libaom/aom_scale/generic/yv12extend.c
@@ -434,3 +434,28 @@ void aom_yv12_partial_coloc_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
aom_yv12_partial_copy_v_c(src_bc, hstart, hend, vstart, vend, dst_bc, hstart,
vstart);
}
+
+int aom_yv12_realloc_with_new_border_c(YV12_BUFFER_CONFIG *ybf, int new_border,
+ int byte_alignment, int num_planes) {
+ if (ybf) {
+ if (new_border == ybf->border) return 0;
+ YV12_BUFFER_CONFIG new_buf;
+ memset(&new_buf, 0, sizeof(new_buf));
+ const int error = aom_alloc_frame_buffer(
+ &new_buf, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x,
+ ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border,
+ byte_alignment);
+ if (error) return error;
+ // Copy image buffer
+ aom_yv12_copy_frame(ybf, &new_buf, num_planes);
+
+ // Extend up to new border
+ aom_extend_frame_borders(&new_buf, num_planes);
+
+ // Now free the old buffer and replace with the new
+ aom_free_frame_buffer(ybf);
+ memcpy(ybf, &new_buf, sizeof(new_buf));
+ return 0;
+ }
+ return -2;
+}
diff --git a/libaom/aom_scale/yv12config.h b/libaom/aom_scale/yv12config.h
index 10c6ad5..04a1c04 100644
--- a/libaom/aom_scale/yv12config.h
+++ b/libaom/aom_scale/yv12config.h
@@ -24,15 +24,10 @@ extern "C" {
#define AOMINNERBORDERINPIXELS 160
#define AOM_INTERP_EXTEND 4
-
-// TODO(jingning): Use unified inter predictor for encoder and
-// decoder during the development process. Revisit the frame border
-// to improve the decoder performance.
-#if CONFIG_REDUCED_ENCODER_BORDER
-#define AOM_BORDER_IN_PIXELS 160
-#else
#define AOM_BORDER_IN_PIXELS 288
-#endif // CONFIG_REDUCED_ENCODER_BORDER
+#define AOM_ENC_NO_SCALE_BORDER 160
+#define AOM_ENC_LOOKAHEAD_BORDER 64
+#define AOM_DEC_BORDER_IN_PIXELS 64
typedef struct yv12_buffer_config {
union {
@@ -102,7 +97,7 @@ typedef struct yv12_buffer_config {
aom_color_primaries_t color_primaries;
aom_transfer_characteristics_t transfer_characteristics;
aom_matrix_coefficients_t matrix_coefficients;
- int monochrome;
+ uint8_t monochrome;
aom_chroma_sample_position_t chroma_sample_position;
aom_color_range_t color_range;
int render_width;
@@ -130,6 +125,14 @@ int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
int border, int byte_alignment,
aom_codec_frame_buffer_t *fb,
aom_get_frame_buffer_cb_fn_t cb, void *cb_priv);
+
+int aom_realloc_lookahead_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height,
+ int ss_x, int ss_y, int use_highbitdepth,
+ int border, int byte_alignment,
+ aom_codec_frame_buffer_t *fb,
+ aom_get_frame_buffer_cb_fn_t cb,
+ void *cb_priv);
+
int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
#ifdef __cplusplus
diff --git a/libaom/apps/aomdec.c b/libaom/apps/aomdec.c
index 58ac172..549c4da 100644
--- a/libaom/apps/aomdec.c
+++ b/libaom/apps/aomdec.c
@@ -484,6 +484,7 @@ static int main_loop(int argc, const char **argv_) {
input.webm_ctx = &webm_ctx;
#endif
struct ObuDecInputContext obu_ctx = { NULL, NULL, 0, 0, 0 };
+ int is_ivf = 0;
obu_ctx.avx_ctx = &aom_input_ctx;
input.obu_ctx = &obu_ctx;
@@ -610,8 +611,10 @@ static int main_loop(int argc, const char **argv_) {
#endif
input.aom_input_ctx->filename = fn;
input.aom_input_ctx->file = infile;
- if (file_is_ivf(input.aom_input_ctx))
+ if (file_is_ivf(input.aom_input_ctx)) {
input.aom_input_ctx->file_type = FILE_TYPE_IVF;
+ is_ivf = 1;
+ }
#if CONFIG_WEBM_IO
else if (file_is_webm(input.webm_ctx, input.aom_input_ctx))
input.aom_input_ctx->file_type = FILE_TYPE_WEBM;
@@ -661,6 +664,10 @@ static int main_loop(int argc, const char **argv_) {
}
fourcc_interface = get_aom_decoder_by_fourcc(aom_input_ctx.fourcc);
+
+ if (is_ivf && !fourcc_interface)
+ fatal("Unsupported fourcc: %x\n", aom_input_ctx.fourcc);
+
if (interface && fourcc_interface && interface != fourcc_interface)
warn("Header indicates codec: %s\n", fourcc_interface->name);
else
@@ -844,7 +851,7 @@ static int main_loop(int argc, const char **argv_) {
}
// Default to codec bit depth if output bit depth not set
unsigned int output_bit_depth;
- if (!fixed_output_bit_depth && single_file && !do_md5) {
+ if (!fixed_output_bit_depth && single_file) {
output_bit_depth = img->bit_depth;
} else {
output_bit_depth = fixed_output_bit_depth;
diff --git a/libaom/apps/aomenc.c b/libaom/apps/aomenc.c
index 4680d3a..08bf08d 100644
--- a/libaom/apps/aomenc.c
+++ b/libaom/apps/aomenc.c
@@ -144,16 +144,14 @@ static const arg_def_t pass_arg =
ARG_DEF(NULL, "pass", 1, "Pass to execute (1/2)");
static const arg_def_t fpf_name =
ARG_DEF(NULL, "fpf", 1, "First pass statistics file name");
-#if CONFIG_FP_MB_STATS
-static const arg_def_t fpmbf_name =
- ARG_DEF(NULL, "fpmbf", 1, "First pass block statistics file name");
-#endif
static const arg_def_t limit =
ARG_DEF(NULL, "limit", 1, "Stop encoding after n input frames");
static const arg_def_t skip =
ARG_DEF(NULL, "skip", 1, "Skip the first n input frames");
static const arg_def_t good_dl =
ARG_DEF(NULL, "good", 0, "Use Good Quality Deadline");
+static const arg_def_t rt_dl =
+ ARG_DEF(NULL, "rt", 0, "Use Realtime Quality Deadline");
static const arg_def_t quietarg =
ARG_DEF("q", "quiet", 0, "Do not print encode progress");
static const arg_def_t verbosearg =
@@ -219,6 +217,7 @@ static const arg_def_t *main_args[] = { &help,
&limit,
&skip,
&good_dl,
+ &rt_dl,
&quietarg,
&verbosearg,
&psnrarg,
@@ -263,9 +262,9 @@ static const arg_def_t global_error_resilient =
"Enable global error resiliency features");
static const arg_def_t lag_in_frames =
ARG_DEF(NULL, "lag-in-frames", 1, "Max number of frames to lag");
-static const arg_def_t large_scale_tile =
- ARG_DEF(NULL, "large-scale-tile", 1,
- "Large scale tile coding (0: off (default), 1: on)");
+static const arg_def_t large_scale_tile = ARG_DEF(
+ NULL, "large-scale-tile", 1,
+ "Large scale tile coding (0: off (default), 1: on (ivf output only))");
static const arg_def_t monochrome =
ARG_DEF(NULL, "monochrome", 0, "Monochrome video (no chroma planes)");
static const arg_def_t full_still_picture_hdr = ARG_DEF(
@@ -415,7 +414,7 @@ static const arg_def_t cpu_used_av1 =
ARG_DEF(NULL, "cpu-used", 1, "CPU Used (0..8)");
static const arg_def_t rowmtarg =
ARG_DEF(NULL, "row-mt", 1,
- "Enable row based multi-threading (0: off (default), 1: on)");
+ "Enable row based multi-threading (0: off, 1: on (default))");
static const arg_def_t tile_cols =
ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2");
static const arg_def_t tile_rows =
@@ -437,10 +436,121 @@ static const arg_def_t enable_restoration =
ARG_DEF(NULL, "enable-restoration", 1,
"Enable the loop restoration filter (0: false, "
"1: true (default))");
+static const arg_def_t enable_rect_partitions =
+ ARG_DEF(NULL, "enable-rect-partitions", 1,
+ "Enable rectangular partitions "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_ab_partitions =
+ ARG_DEF(NULL, "enable-ab-partitions", 1,
+ "Enable ab partitions (0: false, 1: true (default))");
+static const arg_def_t enable_1to4_partitions =
+ ARG_DEF(NULL, "enable-1to4-partitions", 1,
+ "Enable 1:4 and 4:1 partitions "
+ "(0: false, 1: true (default))");
+static const arg_def_t min_partition_size =
+ ARG_DEF(NULL, "min-partition-size", 4,
+ "Set min partition size "
+ "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)");
+static const arg_def_t max_partition_size =
+ ARG_DEF(NULL, "max-partition-size", 128,
+ "Set max partition size "
+ "(4:4x4, 8:8x8, 16:16x16, 32:32x32, 64:64x64, 128:128x128)");
+static const arg_def_t enable_dual_filter =
+ ARG_DEF(NULL, "enable-dual-filter", 1,
+ "Enable dual filter "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_intra_edge_filter =
+ ARG_DEF(NULL, "enable-intra-edge-filter", 1,
+ "Enable intra edge filtering "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_order_hint =
+ ARG_DEF(NULL, "enable-order-hint", 1,
+ "Enable order hint "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_tx64 =
+ ARG_DEF(NULL, "enable-tx64", 1,
+ "Enable 64-pt transform (0: false, 1: true (default))");
+static const arg_def_t tx_size_search_method =
+ ARG_DEF(NULL, "tx-size-search-method", 0,
+ "Set transform block size search method "
+ "(0: Full RD (default), 1: Fast RD, 2: use largest allowed)");
+static const arg_def_t enable_flip_idtx =
+ ARG_DEF(NULL, "enable-flip-idtx", 1,
+ "Enable extended transform type (0: false, 1: true (default)) "
+ "including FLIPADST_DCT, DCT_FLIPADST, FLIPADST_FLIPADST, "
+ "ADST_FLIPADST, FLIPADST_ADST, IDTX, V_DCT, H_DCT, V_ADST, "
+ "H_ADST, V_FLIPADST, H_FLIPADST");
+static const arg_def_t enable_dist_wtd_comp =
+ ARG_DEF(NULL, "enable-dist-wtd-comp", 1,
+ "Enable distance-weighted compound "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_masked_comp =
+ ARG_DEF(NULL, "enable-masked-comp", 1,
+ "Enable masked (wedge/diff-wtd) compound "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_onesided_comp =
+ ARG_DEF(NULL, "enable-onesided-comp", 1,
+ "Enable one sided compound "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_interintra_comp =
+ ARG_DEF(NULL, "enable-interintra-comp", 1,
+ "Enable interintra compound "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_smooth_interintra =
+ ARG_DEF(NULL, "enable-smooth-interintra", 1,
+ "Enable smooth interintra mode "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_diff_wtd_comp =
+ ARG_DEF(NULL, "enable-diff-wtd-comp", 1,
+ "Enable difference-weighted compound "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_interinter_wedge =
+ ARG_DEF(NULL, "enable-interinter-wedge", 1,
+ "Enable interinter wedge compound "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_interintra_wedge =
+ ARG_DEF(NULL, "enable-interintra-wedge", 1,
+ "Enable interintra wedge compound "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_global_motion =
+ ARG_DEF(NULL, "enable-global-motion", 1,
+ "Enable global motion "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_warped_motion =
+ ARG_DEF(NULL, "enable-warped-motion", 1,
+ "Enable local warped motion "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_filter_intra =
+ ARG_DEF(NULL, "enable-filter-intra", 1,
+ "Enable filter intra prediction mode "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_smooth_intra =
+ ARG_DEF(NULL, "enable-smooth-intra", 1,
+ "Enable smooth intra prediction modes "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_paeth_intra =
+ ARG_DEF(NULL, "enable-paeth-intra", 1,
+ "Enable Paeth intra prediction mode (0: false, 1: true (default))");
+static const arg_def_t enable_cfl_intra =
+ ARG_DEF(NULL, "enable-cfl-intra", 1,
+ "Enable chroma from luma intra prediction mode "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_obmc = ARG_DEF(
+ NULL, "enable-obmc", 1, "Enable OBMC (0: false, 1: true (default))");
+static const arg_def_t enable_palette =
+ ARG_DEF(NULL, "enable-palette", 1,
+ "Enable palette prediction mode (0: false, 1: true (default))");
+static const arg_def_t enable_intrabc =
+ ARG_DEF(NULL, "enable-intrabc", 1,
+ "Enable intra block copy prediction mode "
+ "(0: false, 1: true (default))");
+static const arg_def_t enable_angle_delta =
+ ARG_DEF(NULL, "enable-angle-delta", 1,
+ "Enable intra angle delta (0: false, 1: true (default))");
static const arg_def_t disable_trellis_quant =
ARG_DEF(NULL, "disable-trellis-quant", 1,
"Disable trellis optimization of quantized coefficients (0: false ("
- "default) 1: true)");
+ "default) 1: true 2: partial true)");
static const arg_def_t enable_qm =
ARG_DEF(NULL, "enable-qm", 1,
"Enable quantisation matrices (0: false (default), 1: true)");
@@ -448,6 +558,25 @@ static const arg_def_t qm_min = ARG_DEF(
NULL, "qm-min", 1, "Min quant matrix flatness (0..15), default is 8");
static const arg_def_t qm_max = ARG_DEF(
NULL, "qm-max", 1, "Max quant matrix flatness (0..15), default is 15");
+static const arg_def_t reduced_tx_type_set = ARG_DEF(
+ NULL, "reduced-tx-type-set", 1, "Use reduced set of transform types");
+static const arg_def_t use_intra_dct_only =
+ ARG_DEF(NULL, "use-intra-dct-only", 1, "Use DCT only for INTRA modes");
+static const arg_def_t use_inter_dct_only =
+ ARG_DEF(NULL, "use-inter-dct-only", 1, "Use DCT only for INTER modes");
+static const arg_def_t use_intra_default_tx_only =
+ ARG_DEF(NULL, "use-intra-default-tx-only", 1,
+ "Use Default-transform only for INTRA modes");
+static const arg_def_t quant_b_adapt =
+ ARG_DEF(NULL, "quant-b-adapt", 1, "Use adaptive quantize_b");
+static const arg_def_t coeff_cost_upd_freq =
+ ARG_DEF(NULL, "coeff-cost-upd-freq", 1,
+ "Update freq for coeff costs"
+ "0: SB, 1: SB Row per Tile, 2: Tile");
+static const arg_def_t mode_cost_upd_freq =
+ ARG_DEF(NULL, "mode-cost-upd-freq", 1,
+ "Update freq for mode costs"
+ "0: SB, 1: SB Row per Tile, 2: Tile");
#if CONFIG_DIST_8X8
static const arg_def_t enable_dist_8x8 =
ARG_DEF(NULL, "enable-dist-8x8", 1,
@@ -515,6 +644,25 @@ static const arg_def_t min_gf_interval = ARG_DEF(
static const arg_def_t max_gf_interval = ARG_DEF(
NULL, "max-gf-interval", 1,
"max gf/arf frame interval (default 0, indicating in-built behavior)");
+static const arg_def_t gf_max_pyr_height =
+ ARG_DEF(NULL, "gf-max-pyr-height", 1,
+ "maximum height for GF group pyramid structure (0 to 4 (default))");
+static const arg_def_t max_reference_frames = ARG_DEF(
+ NULL, "max-reference-frames", 1,
+ "maximum number of reference frames allowed per frame (3 to 7 (default))");
+static const arg_def_t reduced_reference_set =
+ ARG_DEF(NULL, "reduced-reference-set", 1,
+ "Use reduced set of single and compound references (0: off "
+ "(default), 1: on)");
+static const arg_def_t target_seq_level_idx =
+ ARG_DEF(NULL, "target-seq-level-idx", 1,
+ "Target sequence level index. "
+ "Possible values are in the form of \"ABxy\"(pad leading zeros if "
+ "less than 4 digits). "
+ "AB: Operating point(OP) index; "
+ "xy: Target level index for the OP. "
+ "E.g. \"0\" means target level index 0 for the 0th OP; "
+ "\"1021\" means target level index 21 for the 10th OP.");
static const struct arg_enum_list color_primaries_enum[] = {
{ "bt709", AOM_CICP_CP_BT_709 },
@@ -620,6 +768,12 @@ static const struct arg_enum_list superblock_size_enum[] = {
static const arg_def_t superblock_size = ARG_DEF_ENUM(
NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum);
+static const arg_def_t set_tier_mask =
+ ARG_DEF(NULL, "set-tier-mask", 1,
+ "Set bit mask to specify which tier each of the 32 possible "
+ "operating points conforms to. "
+ "Bit value 0(defualt): Main Tier; 1: High Tier.");
+
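
// Sketch of how a --set-tier-mask value could be read back, assuming bit i
// of the mask selects the tier of operating point i (the help text above
// specifies one bit per operating point; the exact bit order is an
// assumption). tier_of_op is a hypothetical helper.
static int tier_of_op(unsigned int tier_mask, int op_idx) {
  return (tier_mask >> op_idx) & 1;  // 0: Main Tier, 1: High Tier
}
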
static const arg_def_t *av1_args[] = { &cpu_used_av1,
&auto_altref,
&sharpness,
@@ -638,10 +792,46 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
&lossless,
&enable_cdef,
&enable_restoration,
+ &enable_rect_partitions,
+ &enable_ab_partitions,
+ &enable_1to4_partitions,
+ &min_partition_size,
+ &max_partition_size,
+ &enable_dual_filter,
+ &enable_intra_edge_filter,
+ &enable_order_hint,
+ &enable_tx64,
+ &tx_size_search_method,
+ &enable_flip_idtx,
+ &enable_dist_wtd_comp,
+ &enable_masked_comp,
+ &enable_onesided_comp,
+ &enable_interintra_comp,
+ &enable_smooth_interintra,
+ &enable_diff_wtd_comp,
+ &enable_interinter_wedge,
+ &enable_interintra_wedge,
+ &enable_global_motion,
+ &enable_warped_motion,
+ &enable_filter_intra,
+ &enable_smooth_intra,
+ &enable_paeth_intra,
+ &enable_cfl_intra,
+ &enable_obmc,
+ &enable_palette,
+ &enable_intrabc,
+ &enable_angle_delta,
&disable_trellis_quant,
&enable_qm,
&qm_min,
&qm_max,
+ &reduced_tx_type_set,
+ &use_intra_dct_only,
+ &use_inter_dct_only,
+ &use_intra_default_tx_only,
+ &quant_b_adapt,
+ &coeff_cost_upd_freq,
+ &mode_cost_upd_freq,
#if CONFIG_DIST_8X8
&enable_dist_8x8,
#endif
@@ -659,6 +849,7 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
&input_chroma_sample_position,
&min_gf_interval,
&max_gf_interval,
+ &gf_max_pyr_height,
&superblock_size,
&num_tg,
&mtu_size,
@@ -668,8 +859,12 @@ static const arg_def_t *av1_args[] = { &cpu_used_av1,
#if CONFIG_DENOISE
&denoise_noise_level,
&denoise_block_size,
-#endif
+#endif // CONFIG_DENOISE
+ &max_reference_frames,
+ &reduced_reference_set,
&enable_ref_frame_mvs,
+ &target_seq_level_idx,
+ &set_tier_mask,
&bitdeptharg,
&inbitdeptharg,
&input_chroma_subsampling_x,
@@ -696,10 +891,46 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
AV1E_SET_LOSSLESS,
AV1E_SET_ENABLE_CDEF,
AV1E_SET_ENABLE_RESTORATION,
+ AV1E_SET_ENABLE_RECT_PARTITIONS,
+ AV1E_SET_ENABLE_AB_PARTITIONS,
+ AV1E_SET_ENABLE_1TO4_PARTITIONS,
+ AV1E_SET_MIN_PARTITION_SIZE,
+ AV1E_SET_MAX_PARTITION_SIZE,
+ AV1E_SET_ENABLE_DUAL_FILTER,
+ AV1E_SET_ENABLE_INTRA_EDGE_FILTER,
+ AV1E_SET_ENABLE_ORDER_HINT,
+ AV1E_SET_ENABLE_TX64,
+ AV1E_SET_TX_SIZE_SEARCH_METHOD,
+ AV1E_SET_ENABLE_FLIP_IDTX,
+ AV1E_SET_ENABLE_DIST_WTD_COMP,
+ AV1E_SET_ENABLE_MASKED_COMP,
+ AV1E_SET_ENABLE_ONESIDED_COMP,
+ AV1E_SET_ENABLE_INTERINTRA_COMP,
+ AV1E_SET_ENABLE_SMOOTH_INTERINTRA,
+ AV1E_SET_ENABLE_DIFF_WTD_COMP,
+ AV1E_SET_ENABLE_INTERINTER_WEDGE,
+ AV1E_SET_ENABLE_INTERINTRA_WEDGE,
+ AV1E_SET_ENABLE_GLOBAL_MOTION,
+ AV1E_SET_ENABLE_WARPED_MOTION,
+ AV1E_SET_ENABLE_FILTER_INTRA,
+ AV1E_SET_ENABLE_SMOOTH_INTRA,
+ AV1E_SET_ENABLE_PAETH_INTRA,
+ AV1E_SET_ENABLE_CFL_INTRA,
+ AV1E_SET_ENABLE_OBMC,
+ AV1E_SET_ENABLE_PALETTE,
+ AV1E_SET_ENABLE_INTRABC,
+ AV1E_SET_ENABLE_ANGLE_DELTA,
AV1E_SET_DISABLE_TRELLIS_QUANT,
AV1E_SET_ENABLE_QM,
AV1E_SET_QM_MIN,
AV1E_SET_QM_MAX,
+ AV1E_SET_REDUCED_TX_TYPE_SET,
+ AV1E_SET_INTRA_DCT_ONLY,
+ AV1E_SET_INTER_DCT_ONLY,
+ AV1E_SET_INTRA_DEFAULT_TX_ONLY,
+ AV1E_SET_QUANT_B_ADAPT,
+ AV1E_SET_COEFF_COST_UPD_FREQ,
+ AV1E_SET_MODE_COST_UPD_FREQ,
#if CONFIG_DIST_8X8
AV1E_SET_ENABLE_DIST_8X8,
#endif
@@ -717,6 +948,7 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
AV1E_SET_CHROMA_SAMPLE_POSITION,
AV1E_SET_MIN_GF_INTERVAL,
AV1E_SET_MAX_GF_INTERVAL,
+ AV1E_SET_GF_MAX_PYRAMID_HEIGHT,
AV1E_SET_SUPERBLOCK_SIZE,
AV1E_SET_NUM_TG,
AV1E_SET_MTU,
@@ -726,12 +958,12 @@ static const int av1_arg_ctrl_map[] = { AOME_SET_CPUUSED,
#if CONFIG_DENOISE
AV1E_SET_DENOISE_NOISE_LEVEL,
AV1E_SET_DENOISE_BLOCK_SIZE,
-#endif
+#endif // CONFIG_DENOISE
+ AV1E_SET_MAX_REFERENCE_FRAMES,
+ AV1E_SET_REDUCED_REFERENCE_SET,
AV1E_SET_ENABLE_REF_FRAME_MVS,
- AV1E_SET_ENABLE_DF,
- AV1E_SET_ENABLE_ORDER_HINT,
- AV1E_SET_ENABLE_JNT_COMP,
- AV1E_SET_ENABLE_SUPERRES,
+ AV1E_SET_TARGET_SEQ_LEVEL_IDX,
+ AV1E_SET_TIER_MASK,
0 };
#endif // CONFIG_AV1_ENCODER
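
// The two tables above stay in lockstep: each flag av1_args[i] maps to the
// control ID av1_arg_ctrl_map[i], and the map ends with a 0 sentinel. A
// hedged sketch of forwarding parsed values by index via the varargs
// aom_codec_control_() entry point; apply_parsed_args and parsed_vals are
// illustrative, not aomenc's actual plumbing.
#include "aom/aom_codec.h"

static void apply_parsed_args(aom_codec_ctx_t *codec, const int *ctrl_map,
                              const int *parsed_vals) {
  for (int i = 0; ctrl_map[i] != 0; ++i)
    aom_codec_control_(codec, ctrl_map[i], parsed_vals[i]);
}
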
@@ -798,9 +1030,6 @@ struct stream_config {
struct aom_codec_enc_cfg cfg;
const char *out_fn;
const char *stats_fn;
-#if CONFIG_FP_MB_STATS
- const char *fpmb_stats_fn;
-#endif
stereo_format_t stereo_fmt;
int arg_ctrls[ARG_CTRL_CNT_MAX][2];
int arg_ctrl_cnt;
@@ -828,9 +1057,6 @@ struct stream_state {
uint64_t cx_time;
size_t nbytes;
stats_io_t stats;
-#if CONFIG_FP_MB_STATS
- stats_io_t fpmb_stats;
-#endif
struct aom_image *img;
aom_codec_ctx_t decoder;
int mismatch_seen;
@@ -916,7 +1142,9 @@ static void parse_global_config(struct AvxEncoderConfig *global, int argc,
} else if (arg_match(&arg, &usage, argi))
global->usage = arg_parse_uint(&arg);
else if (arg_match(&arg, &good_dl, argi))
- warn("Deprecated --good option! Ignoring\n");
+ global->usage = AOM_USAGE_GOOD_QUALITY; // Good quality usage
+ else if (arg_match(&arg, &rt_dl, argi))
+ global->usage = AOM_USAGE_REALTIME; // Real-time usage
else if (arg_match(&arg, &use_yv12, argi))
global->color_type = YV12;
else if (arg_match(&arg, &use_i420, argi))
@@ -969,11 +1197,19 @@ static void parse_global_config(struct AvxEncoderConfig *global, int argc,
// Make default AV1 passes = 2 until there is a better quality 1-pass
// encoder
if (global->codec != NULL && global->codec->name != NULL)
- global->passes = (strcmp(global->codec->name, "av1") == 0) ? 2 : 1;
+ global->passes = (strcmp(global->codec->name, "av1") == 0 &&
+ global->usage != AOM_USAGE_REALTIME)
+ ? 2
+ : 1;
#else
global->passes = 1;
#endif
}
+
+ if (global->usage == AOM_USAGE_REALTIME && global->passes > 1) {
+ warn("Enforcing one-pass encoding in realtime mode\n");
+ global->passes = 1;
+ }
}
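
// With the new --rt flag, usage 1 selects the realtime entry added to
// encoder_usage_cfg_map at the end of this patch. A minimal sketch of
// requesting those defaults through the public API; error handling is
// trimmed and init_realtime is a hypothetical helper.
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

static aom_codec_err_t init_realtime(aom_codec_ctx_t *codec,
                                     aom_codec_enc_cfg_t *cfg) {
  aom_codec_iface_t *iface = aom_codec_av1_cx();
  aom_codec_err_t res =
      aom_codec_enc_config_default(iface, cfg, AOM_USAGE_REALTIME);
  if (res != AOM_CODEC_OK) return res;
  cfg->g_lag_in_frames = 0;  // matches the realtime CBR lag check below
  return aom_codec_enc_init(codec, iface, cfg, 0);
}
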
static void open_input_file(struct AvxInputContext *input,
@@ -1090,6 +1326,17 @@ static void set_config_arg_ctrls(struct stream_config *config, int key,
return;
}
+ // For target level, the settings should accumulate rather than overwrite,
+ // so we simply append it.
+ if (key == AV1E_SET_TARGET_SEQ_LEVEL_IDX) {
+ j = config->arg_ctrl_cnt;
+ assert(j < (int)ARG_CTRL_CNT_MAX);
+ config->arg_ctrls[j][0] = key;
+ config->arg_ctrls[j][1] = arg_parse_enum_or_int(arg);
+ ++config->arg_ctrl_cnt;
+ return;
+ }
+
/* Point either to the next free element or the first instance of this
* control.
*/
@@ -1159,10 +1406,6 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
}
} else if (arg_match(&arg, &fpf_name, argi)) {
config->stats_fn = arg.val;
-#if CONFIG_FP_MB_STATS
- } else if (arg_match(&arg, &fpmbf_name, argi)) {
- config->fpmb_stats_fn = arg.val;
-#endif
} else if (arg_match(&arg, &use_webm, argi)) {
#if CONFIG_WEBM_IO
config->write_webm = 1;
@@ -1207,8 +1450,15 @@ static int parse_stream_params(struct AvxEncoderConfig *global,
config->cfg.g_error_resilient = arg_parse_uint(&arg);
} else if (arg_match(&arg, &lag_in_frames, argi)) {
config->cfg.g_lag_in_frames = arg_parse_uint(&arg);
+ if (global->usage == AOM_USAGE_REALTIME &&
+ config->cfg.rc_end_usage == AOM_CBR &&
+ config->cfg.g_lag_in_frames != 0) {
+ warn("non-zero %s option ignored in realtime CBR mode.\n", arg.name);
+ config->cfg.g_lag_in_frames = 0;
+ }
} else if (arg_match(&arg, &large_scale_tile, argi)) {
config->cfg.large_scale_tile = arg_parse_uint(&arg);
+ if (config->cfg.large_scale_tile) global->codec = get_aom_lst_encoder();
} else if (arg_match(&arg, &monochrome, argi)) {
config->cfg.monochrome = 1;
} else if (arg_match(&arg, &full_still_picture_hdr, argi)) {
@@ -1349,17 +1599,6 @@ static void validate_stream_config(const struct stream_state *stream,
fatal("Stream %d: duplicate stats file (from stream %d)",
streami->index, stream->index);
}
-
-#if CONFIG_FP_MB_STATS
- /* Check for two streams sharing a mb stats file. */
- if (streami != stream) {
- const char *a = stream->config.fpmb_stats_fn;
- const char *b = streami->config.fpmb_stats_fn;
- if (a && b && !strcmp(a, b))
- fatal("Stream %d: duplicate mb stats file (from stream %d)",
- streami->index, stream->index);
- }
-#endif
}
}
@@ -1524,26 +1763,11 @@ static void setup_pass(struct stream_state *stream,
fatal("Failed to open statistics store");
}
-#if CONFIG_FP_MB_STATS
- if (stream->config.fpmb_stats_fn) {
- if (!stats_open_file(&stream->fpmb_stats, stream->config.fpmb_stats_fn,
- pass))
- fatal("Failed to open mb statistics store");
- } else {
- if (!stats_open_mem(&stream->fpmb_stats, pass))
- fatal("Failed to open mb statistics store");
- }
-#endif
-
stream->config.cfg.g_pass = global->passes == 2
? pass ? AOM_RC_LAST_PASS : AOM_RC_FIRST_PASS
: AOM_RC_ONE_PASS;
if (pass) {
stream->config.cfg.rc_twopass_stats_in = stats_get(&stream->stats);
-#if CONFIG_FP_MB_STATS
- stream->config.cfg.rc_firstpass_mb_stats_in =
- stats_get(&stream->fpmb_stats);
-#endif
}
stream->cx_time = 0;
@@ -1772,13 +1996,6 @@ static void get_cx_data(struct stream_state *stream,
pkt->data.twopass_stats.sz);
stream->nbytes += pkt->data.raw.sz;
break;
-#if CONFIG_FP_MB_STATS
- case AOM_CODEC_FPMB_STATS_PKT:
- stats_write(&stream->fpmb_stats, pkt->data.firstpass_mb_stats.buf,
- pkt->data.firstpass_mb_stats.sz);
- stream->nbytes += pkt->data.raw.sz;
- break;
-#endif
case AOM_CODEC_PSNR_PKT:
if (global->show_psnr) {
@@ -1966,6 +2183,10 @@ int main(int argc, const char **argv_) {
FOREACH_STREAM(stream, streams) {
check_encoder_config(global.disable_warning_prompt, &global,
&stream->config.cfg);
+
+      // If large_scale_tile = 1, only output to the ivf format is supported.
+      if (stream->config.cfg.large_scale_tile && !stream->config.write_ivf)
+        die("Only ivf output is supported when large-scale-tile=1\n");
}
/* Handle non-option arguments */
@@ -2371,12 +2592,6 @@ int main(int argc, const char **argv_) {
stats_close(&stream->stats, global.passes - 1);
}
-#if CONFIG_FP_MB_STATS
- FOREACH_STREAM(stream, streams) {
- stats_close(&stream->fpmb_stats, global.passes - 1);
- }
-#endif
-
if (global.pass) break;
}
diff --git a/libaom/av1/av1.cmake b/libaom/av1/av1.cmake
index 8c92615..fb9678a 100644
--- a/libaom/av1/av1.cmake
+++ b/libaom/av1/av1.cmake
@@ -137,6 +137,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/encodemb.h"
"${AOM_ROOT}/av1/encoder/encodemv.c"
"${AOM_ROOT}/av1/encoder/encodemv.h"
+ "${AOM_ROOT}/av1/encoder/encode_strategy.c"
+ "${AOM_ROOT}/av1/encoder/encode_strategy.h"
"${AOM_ROOT}/av1/encoder/encoder.c"
"${AOM_ROOT}/av1/encoder/encoder.h"
"${AOM_ROOT}/av1/encoder/encodetxb.c"
@@ -149,6 +151,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/firstpass.h"
"${AOM_ROOT}/av1/encoder/global_motion.c"
"${AOM_ROOT}/av1/encoder/global_motion.h"
+ "${AOM_ROOT}/av1/encoder/gop_structure.c"
+ "${AOM_ROOT}/av1/encoder/gop_structure.h"
"${AOM_ROOT}/av1/encoder/grain_test_vectors.h"
"${AOM_ROOT}/av1/encoder/hash.c"
"${AOM_ROOT}/av1/encoder/hash.h"
@@ -156,6 +160,8 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/hash_motion.h"
"${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
"${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
+ "${AOM_ROOT}/av1/encoder/level.c"
+ "${AOM_ROOT}/av1/encoder/level.h"
"${AOM_ROOT}/av1/encoder/lookahead.c"
"${AOM_ROOT}/av1/encoder/lookahead.h"
"${AOM_ROOT}/av1/encoder/mbgraph.c"
@@ -166,6 +172,10 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/ml.h"
"${AOM_ROOT}/av1/encoder/palette.c"
"${AOM_ROOT}/av1/encoder/palette.h"
+ "${AOM_ROOT}/av1/encoder/partition_strategy.h"
+ "${AOM_ROOT}/av1/encoder/partition_strategy.c"
+ "${AOM_ROOT}/av1/encoder/pass2_strategy.h"
+ "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
"${AOM_ROOT}/av1/encoder/pickcdef.c"
"${AOM_ROOT}/av1/encoder/picklpf.c"
"${AOM_ROOT}/av1/encoder/picklpf.h"
@@ -189,7 +199,11 @@ list(APPEND AOM_AV1_ENCODER_SOURCES
"${AOM_ROOT}/av1/encoder/temporal_filter.h"
"${AOM_ROOT}/av1/encoder/tokenize.c"
"${AOM_ROOT}/av1/encoder/tokenize.h"
+ "${AOM_ROOT}/av1/encoder/tpl_model.c"
+ "${AOM_ROOT}/av1/encoder/tpl_model.h"
"${AOM_ROOT}/av1/encoder/wedge_utils.c"
+ "${AOM_ROOT}/av1/encoder/var_based_part.c"
+ "${AOM_ROOT}/av1/encoder/var_based_part.h"
"${AOM_ROOT}/third_party/fastfeat/fast.c"
"${AOM_ROOT}/third_party/fastfeat/fast.h"
"${AOM_ROOT}/third_party/fastfeat/fast_9.c"
@@ -253,8 +267,7 @@ list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
"${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c")
list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm"
- "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm"
- "${AOM_ROOT}/av1/encoder/x86/temporal_filter_apply_sse2.asm")
+ "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm")
list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c"
@@ -277,14 +290,20 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
"${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/temporal_filter_constants.h"
+ "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse4.c"
"${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
"${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/corner_match_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
"${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
"${AOM_ROOT}/av1/encoder/x86/rdopt_avx2.c"
@@ -340,15 +359,7 @@ endif()
function(setup_av1_targets)
add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES})
list(APPEND AOM_LIB_TARGETS aom_av1_common)
-
- create_dummy_source_file("aom_av1" "c" "dummy_source_file")
- add_library(aom_av1 OBJECT "${dummy_source_file}")
target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
- list(APPEND AOM_LIB_TARGETS aom_av1)
-
- # Not all generators support libraries consisting only of object files. Add a
- # dummy source file to the aom_av1 target.
- add_dummy_source_file_to_target("aom_av1" "c")
if(CONFIG_AV1_DECODER)
add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES})
@@ -446,13 +457,13 @@ function(setup_av1_targets)
if(HAVE_NEON)
if(AOM_AV1_COMMON_INTRIN_NEON)
- add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon"
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
"aom_av1_common"
"AOM_AV1_COMMON_INTRIN_NEON" "aom")
endif()
if(AOM_AV1_ENCODER_INTRIN_NEON)
- add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon"
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
"aom_av1_encoder"
"AOM_AV1_ENCODER_INTRIN_NEON" "aom")
endif()
@@ -470,13 +481,7 @@ function(setup_av1_targets)
"AOM_AV1_ENCODER_INTRIN_MSA" "aom")
endif()
- target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
- target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>)
-
# Pass the new lib targets up to the parent scope instance of
# $AOM_LIB_TARGETS.
set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
endfunction()
-
-function(setup_av1_test_targets)
-endfunction()
diff --git a/libaom/av1/av1_cx_iface.c b/libaom/av1/av1_cx_iface.c
index 43a6028..e8cd508 100644
--- a/libaom/av1/av1_cx_iface.c
+++ b/libaom/av1/av1_cx_iface.c
@@ -26,10 +26,6 @@
#include "av1/encoder/encoder.h"
#include "av1/encoder/firstpass.h"
-#if CONFIG_REDUCED_ENCODER_BORDER
-#include "common/tools_common.h"
-#endif // CONFIG_REDUCED_ENCODER_BORDER
-
#define MAG_SIZE (4)
#define MAX_NUM_ENHANCEMENT_LAYERS 3
@@ -48,6 +44,7 @@ struct av1_extracfg {
unsigned int arnr_strength;
unsigned int min_gf_interval;
unsigned int max_gf_interval;
+ unsigned int gf_max_pyr_height;
aom_tune_metric tuning;
unsigned int cq_level; // constrained quality level
unsigned int rc_max_intra_bitrate_pct;
@@ -56,6 +53,7 @@ struct av1_extracfg {
unsigned int lossless;
unsigned int enable_cdef;
unsigned int enable_restoration;
+ unsigned int enable_obmc;
unsigned int disable_trellis_quant;
unsigned int enable_qm;
unsigned int qm_y;
@@ -71,7 +69,7 @@ struct av1_extracfg {
aom_timing_info_type_t timing_info_type;
unsigned int frame_parallel_decoding_mode;
- int use_dual_filter;
+ int enable_dual_filter;
AQ_MODE aq_mode;
DELTAQ_MODE deltaq_mode;
unsigned int frame_periodic_boost;
@@ -93,13 +91,39 @@ struct av1_extracfg {
const char *film_grain_table_filename;
unsigned int motion_vector_unit_test;
unsigned int cdf_update_mode;
- int enable_order_hint;
- int enable_jnt_comp;
- int enable_ref_frame_mvs; // sequence level
- int allow_ref_frame_mvs; // frame level
- int enable_warped_motion; // sequence level
- int allow_warped_motion; // frame level
+ int enable_rect_partitions; // enable rectangular partitions for sequence
+ int enable_ab_partitions; // enable AB partitions for sequence
+ int enable_1to4_partitions; // enable 1:4 and 4:1 partitions for sequence
+ int min_partition_size; // min partition size [4,8,16,32,64,128]
+ int max_partition_size; // max partition size [4,8,16,32,64,128]
+ int enable_intra_edge_filter; // enable intra-edge filter for sequence
+ int enable_order_hint; // enable order hint for sequence
+ int enable_tx64; // enable 64-pt transform usage for sequence
+ int tx_size_search_method; // set transform block size search method
+ int enable_flip_idtx; // enable flip and identity transform types
+ int enable_dist_wtd_comp; // enable dist wtd compound for sequence
+ int max_reference_frames; // maximum number of references per frame
+ int enable_reduced_reference_set; // enable reduced set of references
+ int enable_ref_frame_mvs; // sequence level
+ int allow_ref_frame_mvs; // frame level
+ int enable_masked_comp; // enable masked compound for sequence
+ int enable_onesided_comp; // enable one sided compound for sequence
+ int enable_interintra_comp; // enable interintra compound for sequence
+ int enable_smooth_interintra; // enable smooth interintra mode usage
+ int enable_diff_wtd_comp; // enable diff-wtd compound usage
+ int enable_interinter_wedge; // enable interinter-wedge compound usage
+ int enable_interintra_wedge; // enable interintra-wedge compound usage
+ int enable_global_motion; // enable global motion usage for sequence
+ int enable_warped_motion; // sequence level
+ int allow_warped_motion; // frame level
+ int enable_filter_intra; // enable filter intra for sequence
+ int enable_smooth_intra; // enable smooth intra modes for sequence
+ int enable_paeth_intra; // enable Paeth intra mode for sequence
+ int enable_cfl_intra; // enable CFL uv intra mode for sequence
int enable_superres;
+ int enable_palette;
+ int enable_intrabc;
+ int enable_angle_delta;
#if CONFIG_DENOISE
float noise_level;
int noise_block_size;
@@ -107,6 +131,17 @@ struct av1_extracfg {
unsigned int chroma_subsampling_x;
unsigned int chroma_subsampling_y;
+ int reduced_tx_type_set;
+ int use_intra_dct_only;
+ int use_inter_dct_only;
+ int use_intra_default_tx_only;
+ int quant_b_adapt;
+ AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+ // Bit mask to specify which tier each of the 32 possible operating points
+ // conforms to.
+ unsigned int tier_mask;
+ COST_UPDATE_TYPE coeff_cost_upd_freq;
+ COST_UPDATE_TYPE mode_cost_upd_freq;
};
static struct av1_extracfg default_extra_cfg = {
@@ -116,7 +151,7 @@ static struct av1_extracfg default_extra_cfg = {
0, // noise_sensitivity
CONFIG_SHARP_SETTINGS, // sharpness
0, // static_thresh
- 0, // row_mt
+ 1, // row_mt
0, // tile_columns
0, // tile_rows
0, // enable_tpl_model
@@ -124,6 +159,7 @@ static struct av1_extracfg default_extra_cfg = {
5, // arnr_strength
0, // min_gf_interval; 0 -> default decision
0, // max_gf_interval; 0 -> default decision
+ 4, // gf_max_pyr_height
AOM_TUNE_PSNR, // tuning
10, // cq_level
0, // rc_max_intra_bitrate_pct
@@ -132,6 +168,7 @@ static struct av1_extracfg default_extra_cfg = {
0, // lossless
!CONFIG_SHARP_SETTINGS, // enable_cdef
1, // enable_restoration
+ 1, // enable_obmc
0, // disable_trellis_quant
0, // enable_qm
DEFAULT_QM_Y, // qm_y
@@ -145,7 +182,7 @@ static struct av1_extracfg default_extra_cfg = {
1, // max number of tile groups
0, // mtu_size
AOM_TIMING_UNSPECIFIED, // No picture timing signaling in bitstream
- 1, // frame_parallel_decoding_mode
+ 0, // frame_parallel_decoding_mode
1, // enable dual filter
NO_AQ, // aq_mode
NO_DELTA_Q, // deltaq_mode
@@ -167,19 +204,57 @@ static struct av1_extracfg default_extra_cfg = {
0, // film_grain_table_filename
0, // motion_vector_unit_test
1, // CDF update mode
+ 1, // enable rectangular partitions
+ 1, // enable ab shape partitions
+ 1, // enable 1:4 and 4:1 partitions
+ 4, // min_partition_size
+ 128, // max_partition_size
+ 1, // enable intra edge filter
1, // frame order hint
- 1, // jnt_comp
+ 1, // enable 64-pt transform usage
+ 0, // transform block size search method
+ 1, // enable flip and identity transform
+ 1, // dist-wtd compound
+ 7, // max_reference_frames
+ 0, // enable_reduced_reference_set
1, // enable_ref_frame_mvs sequence level
1, // allow ref_frame_mvs frame level
+ 1, // enable masked compound at sequence level
+ 1, // enable one sided compound at sequence level
+ 1, // enable interintra compound at sequence level
+ 1, // enable smooth interintra mode
+ 1, // enable difference-weighted compound
+ 1, // enable interinter wedge compound
+ 1, // enable interintra wedge compound
+ 1, // enable_global_motion usage
1, // enable_warped_motion at sequence level
1, // allow_warped_motion at frame level
+ 1, // enable filter intra at sequence level
+ 1, // enable smooth intra modes usage for sequence
+ 1, // enable Paeth intra mode usage for sequence
+ 1, // enable CFL uv intra mode usage for sequence
1, // superres
+ 1, // enable palette
+ !CONFIG_SHARP_SETTINGS, // enable intrabc
+ 1, // enable angle delta
#if CONFIG_DENOISE
0, // noise_level
32, // noise_block_size
#endif
0, // chroma_subsampling_x
0, // chroma_subsampling_y
+ 0, // reduced_tx_type_set
+ 0, // use_intra_dct_only
+ 0, // use_inter_dct_only
+ 0, // use_intra_default_tx_only
+ 0, // quant_b_adapt
+ {
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ }, // target_seq_level_idx
+ 0, // tier_mask
+ COST_UPD_SB, // coeff_cost_upd_freq
+ COST_UPD_SB, // mode_cost_upd_freq
};
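
// default_extra_cfg is a positional initializer, so every field added to
// struct av1_extracfg in this patch needs a default at exactly the matching
// position. A sketch of the trade-off with C99 designated initializers;
// example_cfg is a hypothetical stand-in for the real struct.
struct example_cfg {
  unsigned int row_mt;
  unsigned int gf_max_pyr_height;
  int max_reference_frames;
};

// Positional: values must track declaration order.
static const struct example_cfg positional = { 1, 4, 7 };
// Designated: order-independent and self-documenting.
static const struct example_cfg designated = { .row_mt = 1,
                                               .gf_max_pyr_height = 4,
                                               .max_reference_frames = 7 };
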
struct aom_codec_alg_priv {
@@ -251,6 +326,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1);
RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTAQ_MODE_COUNT - 1);
RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
+ RANGE_CHECK_HI(cfg, g_usage, 1);
RANGE_CHECK_HI(cfg, g_threads, MAX_NUM_THREADS);
RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
@@ -266,6 +342,7 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
RANGE_CHECK(extra_cfg, max_gf_interval, MAX(2, extra_cfg->min_gf_interval),
(MAX_LAG_BUFFERS - 1));
}
+ RANGE_CHECK_HI(extra_cfg, gf_max_pyr_height, 4);
RANGE_CHECK_HI(cfg, rc_resize_mode, RESIZE_MODES - 1);
RANGE_CHECK(cfg, rc_resize_denominator, SCALE_NUMERATOR,
@@ -382,9 +459,26 @@ static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
#endif
}
+ RANGE_CHECK(extra_cfg, max_reference_frames, 3, 7);
+ RANGE_CHECK(extra_cfg, enable_reduced_reference_set, 0, 1);
RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1);
RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1);
+ RANGE_CHECK_HI(extra_cfg, disable_trellis_quant, 3);
+ RANGE_CHECK(extra_cfg, coeff_cost_upd_freq, 0, 2);
+ RANGE_CHECK(extra_cfg, mode_cost_upd_freq, 0, 2);
+
+ RANGE_CHECK(extra_cfg, min_partition_size, 4, 128);
+ RANGE_CHECK(extra_cfg, max_partition_size, 4, 128);
+ RANGE_CHECK_HI(extra_cfg, min_partition_size, extra_cfg->max_partition_size);
+
+ RANGE_CHECK(extra_cfg, tx_size_search_method, 0, 2);
+
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ if (!is_valid_seq_level_idx(extra_cfg->target_seq_level_idx[i]))
+ ERROR("Target sequence level index is invalid");
+ }
+
return AOM_CODEC_OK;
}
@@ -452,6 +546,7 @@ static aom_codec_err_t set_encoder_config(
oxcf->profile = cfg->g_profile;
oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled;
oxcf->max_threads = (int)cfg->g_threads;
+ oxcf->mode = (cfg->g_usage == 1) ? REALTIME : GOOD;
oxcf->width = cfg->g_w;
oxcf->height = cfg->g_h;
oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width;
@@ -494,7 +589,6 @@ static aom_codec_err_t set_encoder_config(
oxcf->init_framerate = 30;
oxcf->timing_info_present = 0;
}
- oxcf->mode = GOOD;
oxcf->cfg = &cfg->cfg;
switch (cfg->g_pass) {
@@ -522,6 +616,10 @@ static aom_codec_err_t set_encoder_config(
oxcf->enable_cdef = extra_cfg->enable_cdef;
oxcf->enable_restoration = extra_cfg->enable_restoration;
+ oxcf->enable_obmc = extra_cfg->enable_obmc;
+ oxcf->enable_palette = extra_cfg->enable_palette;
+ oxcf->enable_intrabc = extra_cfg->enable_intrabc;
+ oxcf->enable_angle_delta = extra_cfg->enable_angle_delta;
oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant;
oxcf->using_qm = extra_cfg->enable_qm;
oxcf->qm_y = extra_cfg->qm_y;
@@ -529,6 +627,13 @@ static aom_codec_err_t set_encoder_config(
oxcf->qm_v = extra_cfg->qm_v;
oxcf->qm_minlevel = extra_cfg->qm_min;
oxcf->qm_maxlevel = extra_cfg->qm_max;
+ oxcf->reduced_tx_type_set = extra_cfg->reduced_tx_type_set;
+ oxcf->use_intra_dct_only = extra_cfg->use_intra_dct_only;
+ oxcf->use_inter_dct_only = extra_cfg->use_inter_dct_only;
+ oxcf->use_intra_default_tx_only = extra_cfg->use_intra_default_tx_only;
+ oxcf->quant_b_adapt = extra_cfg->quant_b_adapt;
+ oxcf->coeff_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->coeff_cost_upd_freq;
+ oxcf->mode_cost_upd_freq = (COST_UPDATE_TYPE)extra_cfg->mode_cost_upd_freq;
#if CONFIG_DIST_8X8
oxcf->using_dist_8x8 = extra_cfg->enable_dist_8x8;
if (extra_cfg->tuning == AOM_TUNE_CDEF_DIST ||
@@ -539,7 +644,6 @@ static aom_codec_err_t set_encoder_config(
// In large-scale tile encoding mode, num_tile_groups is always 1.
if (cfg->large_scale_tile) oxcf->num_tile_groups = 1;
oxcf->mtu = extra_cfg->mtu_size;
- oxcf->enable_tpl_model = extra_cfg->enable_tpl_model;
// FIXME(debargha): Should this be:
// oxcf->allow_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs &
@@ -579,6 +683,9 @@ static aom_codec_err_t set_encoder_config(
}
}
+ oxcf->enable_tpl_model =
+ extra_cfg->enable_tpl_model && (oxcf->superres_mode == SUPERRES_NONE);
+
oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
@@ -604,10 +711,6 @@ static aom_codec_err_t set_encoder_config(
oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in;
-#if CONFIG_FP_MB_STATS
- oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
-#endif
-
oxcf->color_primaries = extra_cfg->color_primaries;
oxcf->transfer_characteristics = extra_cfg->transfer_characteristics;
oxcf->matrix_coefficients = extra_cfg->matrix_coefficients;
@@ -623,6 +726,7 @@ static aom_codec_err_t set_encoder_config(
oxcf->arnr_strength = extra_cfg->arnr_strength;
oxcf->min_gf_interval = extra_cfg->min_gf_interval;
oxcf->max_gf_interval = extra_cfg->max_gf_interval;
+ oxcf->gf_max_pyr_height = extra_cfg->gf_max_pyr_height;
oxcf->tuning = extra_cfg->tuning;
oxcf->content = extra_cfg->content;
@@ -659,16 +763,43 @@ static aom_codec_err_t set_encoder_config(
oxcf->monochrome = cfg->monochrome;
oxcf->full_still_picture_hdr = cfg->full_still_picture_hdr;
- oxcf->enable_dual_filter = extra_cfg->use_dual_filter;
+ oxcf->enable_dual_filter = extra_cfg->enable_dual_filter;
+ oxcf->enable_rect_partitions = extra_cfg->enable_rect_partitions;
+ oxcf->enable_ab_partitions = extra_cfg->enable_ab_partitions;
+ oxcf->enable_1to4_partitions = extra_cfg->enable_1to4_partitions;
+ oxcf->min_partition_size = extra_cfg->min_partition_size;
+ oxcf->max_partition_size = extra_cfg->max_partition_size;
+ oxcf->enable_intra_edge_filter = extra_cfg->enable_intra_edge_filter;
+ oxcf->enable_tx64 = extra_cfg->enable_tx64;
+ oxcf->tx_size_search_method = extra_cfg->tx_size_search_method;
+ oxcf->enable_flip_idtx = extra_cfg->enable_flip_idtx;
oxcf->enable_order_hint = extra_cfg->enable_order_hint;
- oxcf->enable_jnt_comp =
- extra_cfg->enable_jnt_comp & extra_cfg->enable_order_hint;
+ oxcf->enable_dist_wtd_comp =
+ extra_cfg->enable_dist_wtd_comp & extra_cfg->enable_order_hint;
+ oxcf->max_reference_frames = extra_cfg->max_reference_frames;
+ oxcf->enable_reduced_reference_set = extra_cfg->enable_reduced_reference_set;
+ oxcf->enable_masked_comp = extra_cfg->enable_masked_comp;
+ oxcf->enable_onesided_comp = extra_cfg->enable_onesided_comp;
+ oxcf->enable_diff_wtd_comp =
+ extra_cfg->enable_masked_comp & extra_cfg->enable_diff_wtd_comp;
+ oxcf->enable_interinter_wedge =
+ extra_cfg->enable_masked_comp & extra_cfg->enable_interinter_wedge;
+ oxcf->enable_interintra_comp = extra_cfg->enable_interintra_comp;
+ oxcf->enable_smooth_interintra =
+ extra_cfg->enable_interintra_comp && extra_cfg->enable_smooth_interintra;
+ oxcf->enable_interintra_wedge =
+ extra_cfg->enable_interintra_comp & extra_cfg->enable_interintra_wedge;
oxcf->enable_ref_frame_mvs =
extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint;
+ oxcf->enable_global_motion = extra_cfg->enable_global_motion;
oxcf->enable_warped_motion = extra_cfg->enable_warped_motion;
oxcf->allow_warped_motion =
extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion;
+ oxcf->enable_filter_intra = extra_cfg->enable_filter_intra;
+ oxcf->enable_smooth_intra = extra_cfg->enable_smooth_intra;
+ oxcf->enable_paeth_intra = extra_cfg->enable_paeth_intra;
+ oxcf->enable_cfl_intra = extra_cfg->enable_cfl_intra;
oxcf->enable_superres =
(oxcf->superres_mode != SUPERRES_NONE) && extra_cfg->enable_superres;
@@ -710,23 +841,14 @@ static aom_codec_err_t set_encoder_config(
oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
-#if CONFIG_REDUCED_ENCODER_BORDER
- if (oxcf->superres_mode != SUPERRES_NONE ||
- oxcf->resize_mode != RESIZE_NONE) {
- warn(
- "Superres / resize cannot be used with CONFIG_REDUCED_ENCODER_BORDER. "
- "Disabling superres/resize.\n");
- // return AOM_CODEC_INVALID_PARAM;
- disable_superres(oxcf);
- oxcf->resize_mode = RESIZE_NONE;
- oxcf->resize_scale_denominator = SCALE_NUMERATOR;
- oxcf->resize_kf_scale_denominator = SCALE_NUMERATOR;
- }
-#endif // CONFIG_REDUCED_ENCODER_BORDER
-
oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
-
+ oxcf->border_in_pixels = (oxcf->resize_mode || oxcf->superres_mode)
+ ? AOM_BORDER_IN_PIXELS
+ : AOM_ENC_NO_SCALE_BORDER;
+ memcpy(oxcf->target_seq_level_idx, extra_cfg->target_seq_level_idx,
+ sizeof(oxcf->target_seq_level_idx));
+ oxcf->tier_mask = extra_cfg->tier_mask;
return AOM_CODEC_OK;
}
@@ -939,6 +1061,13 @@ static aom_codec_err_t ctrl_set_enable_restoration(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_enable_obmc(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_obmc = CAST(AV1E_SET_ENABLE_OBMC, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static aom_codec_err_t ctrl_set_disable_trellis_quant(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1007,10 +1136,55 @@ static aom_codec_err_t ctrl_set_timing_info_type(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
-static aom_codec_err_t ctrl_set_enable_df(aom_codec_alg_priv_t *ctx,
- va_list args) {
+static aom_codec_err_t ctrl_set_enable_dual_filter(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_dual_filter = CAST(AV1E_SET_ENABLE_DUAL_FILTER, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_rect_partitions(
+ aom_codec_alg_priv_t *ctx, va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
- extra_cfg.use_dual_filter = CAST(AV1E_SET_ENABLE_DF, args);
+ extra_cfg.enable_rect_partitions =
+ CAST(AV1E_SET_ENABLE_RECT_PARTITIONS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_ab_partitions(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_ab_partitions = CAST(AV1E_SET_ENABLE_AB_PARTITIONS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_1to4_partitions(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_1to4_partitions =
+ CAST(AV1E_SET_ENABLE_1TO4_PARTITIONS, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_min_partition_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.min_partition_size = CAST(AV1E_SET_MIN_PARTITION_SIZE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_max_partition_size(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.max_partition_size = CAST(AV1E_SET_MAX_PARTITION_SIZE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_intra_edge_filter(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_intra_edge_filter =
+ CAST(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, args);
return update_extra_cfg(ctx, &extra_cfg);
}
@@ -1021,10 +1195,46 @@ static aom_codec_err_t ctrl_set_enable_order_hint(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
-static aom_codec_err_t ctrl_set_enable_jnt_comp(aom_codec_alg_priv_t *ctx,
- va_list args) {
+static aom_codec_err_t ctrl_set_enable_tx64(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_tx64 = CAST(AV1E_SET_ENABLE_TX64, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tx_size_search_method(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.tx_size_search_method = CAST(AV1E_SET_TX_SIZE_SEARCH_METHOD, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_flip_idtx(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_flip_idtx = CAST(AV1E_SET_ENABLE_FLIP_IDTX, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_dist_wtd_comp(aom_codec_alg_priv_t *ctx,
+ va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
- extra_cfg.enable_jnt_comp = CAST(AV1E_SET_ENABLE_JNT_COMP, args);
+ extra_cfg.enable_dist_wtd_comp = CAST(AV1E_SET_ENABLE_DIST_WTD_COMP, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_max_reference_frames(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.max_reference_frames = CAST(AV1E_SET_MAX_REFERENCE_FRAMES, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_reduced_reference_set(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_reduced_reference_set =
+ CAST(AV1E_SET_REDUCED_REFERENCE_SET, args);
return update_extra_cfg(ctx, &extra_cfg);
}
@@ -1042,6 +1252,66 @@ static aom_codec_err_t ctrl_set_allow_ref_frame_mvs(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_enable_masked_comp(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_masked_comp = CAST(AV1E_SET_ENABLE_MASKED_COMP, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_onesided_comp(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_onesided_comp = CAST(AV1E_SET_ENABLE_ONESIDED_COMP, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_interintra_comp(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_interintra_comp =
+ CAST(AV1E_SET_ENABLE_INTERINTRA_COMP, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_smooth_interintra(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_smooth_interintra =
+ CAST(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_diff_wtd_comp(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_diff_wtd_comp = CAST(AV1E_SET_ENABLE_DIFF_WTD_COMP, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_interinter_wedge(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_interinter_wedge =
+ CAST(AV1E_SET_ENABLE_INTERINTER_WEDGE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_interintra_wedge(
+ aom_codec_alg_priv_t *ctx, va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_interintra_wedge =
+ CAST(AV1E_SET_ENABLE_INTERINTRA_WEDGE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_global_motion(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_global_motion = CAST(AV1E_SET_ENABLE_GLOBAL_MOTION, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static aom_codec_err_t ctrl_set_enable_warped_motion(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1056,6 +1326,34 @@ static aom_codec_err_t ctrl_set_allow_warped_motion(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_enable_filter_intra(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_filter_intra = CAST(AV1E_SET_ENABLE_FILTER_INTRA, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_smooth_intra(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_smooth_intra = CAST(AV1E_SET_ENABLE_SMOOTH_INTRA, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_paeth_intra(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_paeth_intra = CAST(AV1E_SET_ENABLE_PAETH_INTRA, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_cfl_intra(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_cfl_intra = CAST(AV1E_SET_ENABLE_CFL_INTRA, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1063,6 +1361,27 @@ static aom_codec_err_t ctrl_set_enable_superres(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_enable_palette(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_palette = CAST(AV1E_SET_ENABLE_PALETTE, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_intrabc(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_intrabc = CAST(AV1E_SET_ENABLE_INTRABC, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_enable_angle_delta(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.enable_angle_delta = CAST(AV1E_SET_ENABLE_ANGLE_DELTA, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static aom_codec_err_t ctrl_set_error_resilient_mode(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1099,6 +1418,56 @@ static aom_codec_err_t ctrl_set_aq_mode(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_reduced_tx_type_set(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.reduced_tx_type_set = CAST(AV1E_SET_REDUCED_TX_TYPE_SET, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_intra_dct_only(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.use_intra_dct_only = CAST(AV1E_SET_INTRA_DCT_ONLY, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_inter_dct_only(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.use_inter_dct_only = CAST(AV1E_SET_INTER_DCT_ONLY, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_intra_default_tx_only(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.use_intra_default_tx_only =
+ CAST(AV1E_SET_INTRA_DEFAULT_TX_ONLY, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_quant_b_adapt(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.quant_b_adapt = CAST(AV1E_SET_QUANT_B_ADAPT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_coeff_cost_upd_freq(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.coeff_cost_upd_freq = CAST(AV1E_SET_COEFF_COST_UPD_FREQ, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_mode_cost_upd_freq(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.mode_cost_upd_freq = CAST(AV1E_SET_MODE_COST_UPD_FREQ, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static aom_codec_err_t ctrl_set_film_grain_test_vector(
aom_codec_alg_priv_t *ctx, va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1152,6 +1521,13 @@ static aom_codec_err_t ctrl_set_max_gf_interval(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_gf_max_pyr_height(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.gf_max_pyr_height = CAST(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
static aom_codec_err_t ctrl_set_frame_periodic_boost(aom_codec_alg_priv_t *ctx,
va_list args) {
struct av1_extracfg extra_cfg = ctx->extra_cfg;
@@ -1167,6 +1543,26 @@ static aom_codec_err_t ctrl_enable_motion_vector_unit_test(
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_set_target_seq_level_idx(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ const int val = CAST(AV1E_SET_TARGET_SEQ_LEVEL_IDX, args);
+ const int level = val % 100;
+ const int operating_point_idx = val / 100;
+ if (operating_point_idx >= 0 &&
+ operating_point_idx < MAX_NUM_OPERATING_POINTS) {
+ extra_cfg.target_seq_level_idx[operating_point_idx] = (AV1_LEVEL)level;
+ }
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static aom_codec_err_t ctrl_set_tier_mask(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ struct av1_extracfg extra_cfg = ctx->extra_cfg;
+ extra_cfg.tier_mask = CAST(AV1E_SET_TIER_MASK, args);
+ return update_extra_cfg(ctx, &extra_cfg);
+}
+
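
// Sketch: exercising the two new controls from application code. 1021
// targets level index 21 for the 10th operating point, per the "ABxy"
// encoding documented in aomenc.c; the tier-mask value and the assumption
// that bit 10 governs operating point 10 are illustrative.
#include "aom/aomcx.h"

static void set_level_and_tier(aom_codec_ctx_t *codec) {
  aom_codec_control(codec, AV1E_SET_TARGET_SEQ_LEVEL_IDX, 1021);
  aom_codec_control(codec, AV1E_SET_TIER_MASK, 1u << 10);
}
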
static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx,
aom_codec_priv_enc_mr_cfg_t *data) {
aom_codec_err_t res = AOM_CODEC_OK;
@@ -1269,8 +1665,7 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
}
}
}
-
- if (ctx->oxcf.mode != GOOD) {
+ if (ctx->oxcf.mode != GOOD && ctx->oxcf.mode != REALTIME) {
ctx->oxcf.mode = GOOD;
av1_change_config(ctx->cpi, &ctx->oxcf);
}
@@ -1328,6 +1723,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
unsigned char *cx_data = ctx->cx_data;
size_t cx_data_sz = ctx->cx_data_sz;
+ assert(!(cx_data == NULL && cx_data_sz != 0));
+
/* Any pending invisible frames? */
if (ctx->pending_cx_data) {
memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);
@@ -1355,12 +1752,6 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
-1 != av1_get_compressed_data(cpi, &lib_flags, &frame_size, cx_data,
&dst_time_stamp, &dst_end_time_stamp,
!img, timebase)) {
- if (cpi->common.seq_params.frame_id_numbers_present_flag) {
- if (cpi->common.invalid_delta_frame_id_minus_1) {
- aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
- "Invalid delta_frame_id_minus_1");
- }
- }
cpi->seq_params_locked = 1;
if (frame_size) {
if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data;
@@ -1380,8 +1771,8 @@ static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
frame_size);
}
const uint32_t obu_header_offset = 0;
- obu_header_size = write_obu_header(
- OBU_TEMPORAL_DELIMITER, 0,
+ obu_header_size = av1_write_obu_header(
+ cpi, OBU_TEMPORAL_DELIMITER, 0,
(uint8_t *)(ctx->pending_cx_data + obu_header_offset));
// OBUs are preceded/succeeded by an unsigned leb128 coded integer.
@@ -1742,6 +2133,13 @@ static aom_codec_err_t ctrl_set_chroma_subsampling_y(aom_codec_alg_priv_t *ctx,
return update_extra_cfg(ctx, &extra_cfg);
}
+static aom_codec_err_t ctrl_get_seq_level_idx(aom_codec_alg_priv_t *ctx,
+ va_list args) {
+ int *const arg = va_arg(args, int *);
+ if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+ return av1_get_seq_level_idx(ctx->cpi, arg);
+}
+
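
// Sketch: reading the coded levels back via the new getter. The array
// length matches the 32 possible operating points mentioned in this patch;
// whether every entry is populated is an assumption, and report_levels is
// a hypothetical helper.
#include "aom/aomcx.h"

static void report_levels(aom_codec_ctx_t *codec) {
  int seq_level_idx[32];
  if (aom_codec_control(codec, AV1E_GET_SEQ_LEVEL_IDX, seq_level_idx) ==
      AOM_CODEC_OK) {
    // seq_level_idx[i] holds the level index for operating point i.
  }
}
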
static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1_COPY_REFERENCE, ctrl_copy_reference },
{ AOME_USE_REFERENCE, ctrl_use_reference },
@@ -1773,6 +2171,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1E_SET_LOSSLESS, ctrl_set_lossless },
{ AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef },
{ AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration },
+ { AV1E_SET_ENABLE_OBMC, ctrl_set_enable_obmc },
{ AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant },
{ AV1E_SET_ENABLE_QM, ctrl_set_enable_qm },
{ AV1E_SET_QM_Y, ctrl_set_qm_y },
@@ -1789,15 +2188,48 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode },
{ AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode },
{ AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode },
- { AV1E_SET_ENABLE_DF, ctrl_set_enable_df },
+ { AV1E_SET_ENABLE_RECT_PARTITIONS, ctrl_set_enable_rect_partitions },
+ { AV1E_SET_ENABLE_AB_PARTITIONS, ctrl_set_enable_ab_partitions },
+ { AV1E_SET_ENABLE_1TO4_PARTITIONS, ctrl_set_enable_1to4_partitions },
+ { AV1E_SET_MIN_PARTITION_SIZE, ctrl_set_min_partition_size },
+ { AV1E_SET_MAX_PARTITION_SIZE, ctrl_set_max_partition_size },
+ { AV1E_SET_ENABLE_DUAL_FILTER, ctrl_set_enable_dual_filter },
+ { AV1E_SET_ENABLE_INTRA_EDGE_FILTER, ctrl_set_enable_intra_edge_filter },
{ AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
- { AV1E_SET_ENABLE_JNT_COMP, ctrl_set_enable_jnt_comp },
+ { AV1E_SET_ENABLE_TX64, ctrl_set_enable_tx64 },
+ { AV1E_SET_TX_SIZE_SEARCH_METHOD, ctrl_set_tx_size_search_method },
+ { AV1E_SET_ENABLE_FLIP_IDTX, ctrl_set_enable_flip_idtx },
+ { AV1E_SET_ENABLE_DIST_WTD_COMP, ctrl_set_enable_dist_wtd_comp },
+ { AV1E_SET_MAX_REFERENCE_FRAMES, ctrl_set_max_reference_frames },
+ { AV1E_SET_REDUCED_REFERENCE_SET, ctrl_set_enable_reduced_reference_set },
{ AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs },
{ AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs },
+ { AV1E_SET_ENABLE_MASKED_COMP, ctrl_set_enable_masked_comp },
+ { AV1E_SET_ENABLE_ONESIDED_COMP, ctrl_set_enable_onesided_comp },
+ { AV1E_SET_ENABLE_INTERINTRA_COMP, ctrl_set_enable_interintra_comp },
+ { AV1E_SET_ENABLE_SMOOTH_INTERINTRA, ctrl_set_enable_smooth_interintra },
+ { AV1E_SET_ENABLE_DIFF_WTD_COMP, ctrl_set_enable_diff_wtd_comp },
+ { AV1E_SET_ENABLE_INTERINTER_WEDGE, ctrl_set_enable_interinter_wedge },
+ { AV1E_SET_ENABLE_INTERINTRA_WEDGE, ctrl_set_enable_interintra_wedge },
+ { AV1E_SET_ENABLE_GLOBAL_MOTION, ctrl_set_enable_global_motion },
{ AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion },
{ AV1E_SET_ALLOW_WARPED_MOTION, ctrl_set_allow_warped_motion },
+ { AV1E_SET_ENABLE_FILTER_INTRA, ctrl_set_enable_filter_intra },
+ { AV1E_SET_ENABLE_SMOOTH_INTRA, ctrl_set_enable_smooth_intra },
+ { AV1E_SET_ENABLE_PAETH_INTRA, ctrl_set_enable_paeth_intra },
+ { AV1E_SET_ENABLE_CFL_INTRA, ctrl_set_enable_cfl_intra },
{ AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres },
+ { AV1E_SET_ENABLE_PALETTE, ctrl_set_enable_palette },
+ { AV1E_SET_ENABLE_INTRABC, ctrl_set_enable_intrabc },
+ { AV1E_SET_ENABLE_ANGLE_DELTA, ctrl_set_enable_angle_delta },
{ AV1E_SET_AQ_MODE, ctrl_set_aq_mode },
+ { AV1E_SET_REDUCED_TX_TYPE_SET, ctrl_set_reduced_tx_type_set },
+ { AV1E_SET_INTRA_DCT_ONLY, ctrl_set_intra_dct_only },
+ { AV1E_SET_INTER_DCT_ONLY, ctrl_set_inter_dct_only },
+ { AV1E_SET_INTRA_DEFAULT_TX_ONLY, ctrl_set_intra_default_tx_only },
+ { AV1E_SET_QUANT_B_ADAPT, ctrl_set_quant_b_adapt },
+ { AV1E_SET_COEFF_COST_UPD_FREQ, ctrl_set_coeff_cost_upd_freq },
+ { AV1E_SET_MODE_COST_UPD_FREQ, ctrl_set_mode_cost_upd_freq },
{ AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
{ AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
{ AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
@@ -1810,6 +2242,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity },
{ AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval },
{ AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval },
+ { AV1E_SET_GF_MAX_PYRAMID_HEIGHT, ctrl_set_gf_max_pyr_height },
{ AV1E_SET_RENDER_SIZE, ctrl_set_render_size },
{ AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
{ AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
@@ -1820,6 +2253,8 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
#endif // CONFIG_FILM_GRAIN
{ AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
+ { AV1E_SET_TARGET_SEQ_LEVEL_IDX, ctrl_set_target_seq_level_idx },
+ { AV1E_SET_TIER_MASK, ctrl_set_tier_mask },
// Getters
{ AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
@@ -1830,6 +2265,7 @@ static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
{ AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
{ AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
+ { AV1E_GET_SEQ_LEVEL_IDX, ctrl_get_seq_level_idx },
{ -1, NULL },
};
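Each entry pairs a control ID with its handler, and aom_codec_control() walks this table to dispatch. A short usage sketch exercising a few of the newly wired entries (values are illustrative; error checking elided):

    aom_codec_ctx_t codec;  // assumed already initialized with aom_codec_enc_init()
    aom_codec_control(&codec, AV1E_SET_ENABLE_OBMC, 0);           // disable OBMC
    aom_codec_control(&codec, AV1E_SET_MAX_REFERENCE_FRAMES, 3);  // cap ref frames
    aom_codec_control(&codec, AV1E_SET_TARGET_SEQ_LEVEL_IDX, 8);  // target level 4.0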
@@ -1837,7 +2273,7 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
{ 0,
{
// NOLINT
- 0, // g_usage
+ 0, // g_usage - non-realtime usage
0, // g_threads
0, // g_profile
@@ -1862,11 +2298,11 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
SCALE_NUMERATOR, // rc_resize_denominator
SCALE_NUMERATOR, // rc_resize_kf_denominator
- 0, // rc_superres_mode
+ SUPERRES_NONE, // rc_superres_mode
SCALE_NUMERATOR, // rc_superres_denominator
SCALE_NUMERATOR, // rc_superres_kf_denominator
63, // rc_superres_qthresh
- 63, // rc_superres_kf_qthresh
+ 32, // rc_superres_kf_qthresh
AOM_VBR, // rc_end_usage
{ NULL, 0 }, // rc_twopass_stats_in
@@ -1902,6 +2338,74 @@ static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
{ 0 }, // tile_heights
{ 1 }, // config file
} },
+ { 1,
+ {
+ // NOLINT
+ 1, // g_usage - real-time usage
+ 0, // g_threads
+ 0, // g_profile
+
+ 320, // g_width
+ 240, // g_height
+ 0, // g_limit
+ 0, // g_forced_max_frame_width
+ 0, // g_forced_max_frame_height
+ AOM_BITS_8, // g_bit_depth
+ 8, // g_input_bit_depth
+
+ { 1, 30 }, // g_timebase
+
+ 0, // g_error_resilient
+
+ AOM_RC_ONE_PASS, // g_pass
+
+ 1, // g_lag_in_frames
+
+ 0, // rc_dropframe_thresh
+ RESIZE_NONE, // rc_resize_mode
+ SCALE_NUMERATOR, // rc_resize_denominator
+ SCALE_NUMERATOR, // rc_resize_kf_denominator
+
+ 0, // rc_superres_mode
+ SCALE_NUMERATOR, // rc_superres_denominator
+ SCALE_NUMERATOR, // rc_superres_kf_denominator
+ 63, // rc_superres_qthresh
+ 32, // rc_superres_kf_qthresh
+
+ AOM_CBR, // rc_end_usage
+ { NULL, 0 }, // rc_twopass_stats_in
+ { NULL, 0 }, // rc_firstpass_mb_stats_in
+ 256, // rc_target_bandwidth
+ 0, // rc_min_quantizer
+ 63, // rc_max_quantizer
+ 25, // rc_undershoot_pct
+ 25, // rc_overshoot_pct
+
+ 6000, // rc_max_buffer_size
+ 4000, // rc_buffer_initial_size
+ 5000, // rc_buffer_optimal_size
+
+ 50, // rc_two_pass_vbrbias
+ 0, // rc_two_pass_vbrmin_section
+ 2000, // rc_two_pass_vbrmax_section
+
+ // keyframing settings (kf)
+ 0, // fwd_kf_enabled
+ AOM_KF_AUTO, // g_kfmode
+ 0, // kf_min_dist
+ 9999, // kf_max_dist
+ 0, // sframe_dist
+ 1, // sframe_mode
+ 0, // large_scale_tile
+ 0, // monochrome
+ 0, // full_still_picture_hdr
+ 0, // save_as_annexb
+ 0, // tile_width_count
+ 0, // tile_height_count
+ { 0 }, // tile_widths
+ { 0 }, // tile_heights
+ { 1 }, // config file
+ } },
};
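With two rows in this table, the usage value passed to aom_codec_enc_config_default() picks the starting configuration: 0 keeps the original good-quality defaults, and 1 selects the new real-time row (one-pass, CBR, g_lag_in_frames of 1). A sketch:

    aom_codec_enc_cfg_t cfg;
    // usage = 1 requests the real-time defaults added above.
    aom_codec_enc_config_default(aom_codec_av1_cx(), &cfg, 1);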
#ifndef VERSION_STRING
@@ -1925,7 +2429,7 @@ CODEC_INTERFACE(aom_codec_av1_cx) = {
},
{
// NOLINT
- 1, // 1 cfg map
+ 2, // 2 cfg maps
encoder_usage_cfg_map, // aom_codec_enc_cfg_map_t
encoder_encode, // aom_codec_encode_fn_t
encoder_get_cxdata, // aom_codec_get_cx_data_fn_t
diff --git a/libaom/av1/av1_dx_iface.c b/libaom/av1/av1_dx_iface.c
index 08da650..ca872d7 100644
--- a/libaom/av1/av1_dx_iface.c
+++ b/libaom/av1/av1_dx_iface.c
@@ -44,7 +44,7 @@ struct aom_codec_alg_priv {
int img_avail;
int flushed;
int invert_tile_order;
- int last_show_frame; // Index of last output frame.
+ RefCntBuffer *last_show_frame; // Last output frame buffer
int byte_alignment;
int skip_loop_filter;
int skip_film_grain;
@@ -154,6 +154,49 @@ static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
return AOM_CODEC_OK;
}
+static aom_codec_err_t parse_timing_info(struct aom_read_bit_buffer *rb) {
+ const uint32_t num_units_in_display_tick =
+ aom_rb_read_unsigned_literal(rb, 32);
+ const uint32_t time_scale = aom_rb_read_unsigned_literal(rb, 32);
+ if (num_units_in_display_tick == 0 || time_scale == 0)
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ const uint8_t equal_picture_interval = aom_rb_read_bit(rb);
+ if (equal_picture_interval) {
+ const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
+ if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
+ // num_ticks_per_picture_minus_1 cannot be (1 << 32) - 1.
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ }
+ }
+ return AOM_CODEC_OK;
+}
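For fixed-rate streams these fields pin down the display rate: per the AV1 spec, when equal_picture_interval is set the picture rate is time_scale / (num_units_in_display_tick * (num_ticks_per_picture_minus_1 + 1)). For example, time_scale = 90000 with num_units_in_display_tick = 3000 and num_ticks_per_picture_minus_1 = 0 yields 30 frames per second.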
+
+static aom_codec_err_t parse_decoder_model_info(
+ struct aom_read_bit_buffer *rb, int *buffer_delay_length_minus_1) {
+ *buffer_delay_length_minus_1 = aom_rb_read_literal(rb, 5);
+ const uint32_t num_units_in_decoding_tick =
+ aom_rb_read_unsigned_literal(rb, 32);
+ const uint8_t buffer_removal_time_length_minus_1 = aom_rb_read_literal(rb, 5);
+ const uint8_t frame_presentation_time_length_minus_1 =
+ aom_rb_read_literal(rb, 5);
+ (void)num_units_in_decoding_tick;
+ (void)buffer_removal_time_length_minus_1;
+ (void)frame_presentation_time_length_minus_1;
+ return AOM_CODEC_OK;
+}
+
+static aom_codec_err_t parse_op_parameters_info(
+ struct aom_read_bit_buffer *rb, int buffer_delay_length_minus_1) {
+ const int n = buffer_delay_length_minus_1 + 1;
+ const uint32_t decoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
+ const uint32_t encoder_buffer_delay = aom_rb_read_unsigned_literal(rb, n);
+ const uint8_t low_delay_mode_flag = aom_rb_read_bit(rb);
+ (void)decoder_buffer_delay;
+ (void)encoder_buffer_delay;
+ (void)low_delay_mode_flag;
+ return AOM_CODEC_OK;
+}
+
// Parses the operating points (including operating_point_idc, seq_level_idx,
// and seq_tier) and then sets si->number_spatial_layers and
// si->number_temporal_layers based on operating_point_idc[0].
@@ -161,10 +204,23 @@ static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb,
int is_reduced_header,
aom_codec_stream_info_t *si) {
int operating_point_idc0 = 0;
-
if (is_reduced_header) {
aom_rb_read_literal(rb, LEVEL_BITS); // level
} else {
+ uint8_t decoder_model_info_present_flag = 0;
+ int buffer_delay_length_minus_1 = 0;
+ aom_codec_err_t status;
+ const uint8_t timing_info_present_flag = aom_rb_read_bit(rb);
+ if (timing_info_present_flag) {
+ if ((status = parse_timing_info(rb)) != AOM_CODEC_OK) return status;
+ decoder_model_info_present_flag = aom_rb_read_bit(rb);
+ if (decoder_model_info_present_flag) {
+ if ((status = parse_decoder_model_info(
+ rb, &buffer_delay_length_minus_1)) != AOM_CODEC_OK)
+ return status;
+ }
+ }
+ const uint8_t initial_display_delay_present_flag = aom_rb_read_bit(rb);
const uint8_t operating_points_cnt_minus_1 =
aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
for (int i = 0; i < operating_points_cnt_minus_1 + 1; i++) {
@@ -173,6 +229,20 @@ static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb,
if (i == 0) operating_point_idc0 = operating_point_idc;
int seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS); // level
if (seq_level_idx > 7) aom_rb_read_bit(rb); // tier
+ if (decoder_model_info_present_flag) {
+ const uint8_t decoder_model_present_for_this_op = aom_rb_read_bit(rb);
+ if (decoder_model_present_for_this_op) {
+ if ((status = parse_op_parameters_info(
+ rb, buffer_delay_length_minus_1)) != AOM_CODEC_OK)
+ return status;
+ }
+ }
+ if (initial_display_delay_present_flag) {
+ const uint8_t initial_display_delay_present_for_this_op =
+ aom_rb_read_bit(rb);
+ if (initial_display_delay_present_for_this_op)
+ aom_rb_read_literal(rb, 4); // initial_display_delay_minus_1
+ }
}
}
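operating_point_idc is a 12-bit mask: bits 0-7 mark the temporal layers and bits 8-11 the spatial layers an operating point decodes, with 0 meaning the full stream. The layer counts mentioned in the comment above can be derived along these lines (a sketch of the spec's bit layout, not necessarily this file's exact code):

    int n_temporal = 1, n_spatial = 1;
    if (operating_point_idc0 != 0) {
      n_temporal = n_spatial = 0;
      for (int j = 0; j < 8; j++) n_temporal += (operating_point_idc0 >> j) & 1;
      for (int j = 0; j < 4; j++) n_spatial += (operating_point_idc0 >> (8 + j)) & 1;
    }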
@@ -203,7 +273,7 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
memset(&obu_header, 0, sizeof(obu_header));
size_t payload_size = 0;
size_t bytes_read = 0;
- int reduced_still_picture_hdr = 0;
+ uint8_t reduced_still_picture_hdr = 0;
aom_codec_err_t status = aom_read_obu_header_and_size(
data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
if (status != AOM_CODEC_OK) return status;
@@ -232,7 +302,7 @@ static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
av1_read_profile(&rb); // profile
- const int still_picture = aom_rb_read_bit(&rb);
+ const uint8_t still_picture = aom_rb_read_bit(&rb);
reduced_still_picture_hdr = aom_rb_read_bit(&rb);
if (!still_picture && reduced_still_picture_hdr) {
@@ -317,7 +387,6 @@ static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
AV1_COMMON *const cm = &frame_worker_data->pbi->common;
BufferPool *const pool = cm->buffer_pool;
- cm->new_fb_idx = INVALID_IDX;
cm->cur_frame = NULL;
cm->byte_alignment = ctx->byte_alignment;
cm->skip_loop_filter = ctx->skip_loop_filter;
@@ -357,7 +426,6 @@ static int frame_worker_hook(void *arg1, void *arg2) {
if (result != 0) {
// Check decode result in serial decode.
- frame_worker_data->pbi->common.cur_frame->buf.corrupted = 1;
frame_worker_data->pbi->need_resync = 1;
}
return !result;
@@ -367,7 +435,7 @@ static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
int i;
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
- ctx->last_show_frame = -1;
+ ctx->last_show_frame = NULL;
ctx->next_output_worker_id = 0;
ctx->need_resync = 1;
ctx->num_frame_workers = 1;
@@ -449,8 +517,7 @@ static INLINE void check_resync(aom_codec_alg_priv_t *const ctx,
const AV1Decoder *const pbi) {
// Clear resync flag if worker got a key frame or intra only frame.
if (ctx->need_resync == 1 && pbi->need_resync == 0 &&
- (pbi->common.current_frame.intra_only ||
- pbi->common.current_frame.frame_type == KEY_FRAME))
+ frame_is_intra_only(&pbi->common))
ctx->need_resync = 0;
}
@@ -529,7 +596,7 @@ static aom_codec_err_t decoder_inspect(aom_codec_alg_priv_t *ctx,
data2->idx = -1;
for (int i = 0; i < REF_FRAMES; ++i)
- if (cm->ref_frame_map[i] == cm->new_fb_idx) data2->idx = i;
+ if (cm->ref_frame_map[i] == cm->cur_frame) data2->idx = i;
data2->buf = data;
data2->show_existing = cm->show_existing_frame;
return res;
@@ -551,7 +618,6 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
// arguments are invalid.
if (ctx->frame_workers) {
BufferPool *const pool = ctx->buffer_pool;
- RefCntBuffer *const frame_bufs = pool->frame_bufs;
lock_buffer_pool(pool);
for (int i = 0; i < ctx->num_frame_workers; ++i) {
AVxWorker *const worker = &ctx->frame_workers[i];
@@ -559,7 +625,7 @@ static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
(FrameWorkerData *)worker->data1;
struct AV1Decoder *pbi = frame_worker_data->pbi;
for (size_t j = 0; j < pbi->num_output_frames; j++) {
- decrease_ref_count(pbi->output_frame_index[j], frame_bufs, pool);
+ decrease_ref_count(pbi->output_frames[j], pool);
}
pbi->num_output_frames = 0;
}
@@ -696,7 +762,6 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
(FrameWorkerData *)worker->data1;
AV1Decoder *const pbi = frame_worker_data->pbi;
AV1_COMMON *const cm = &pbi->common;
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
ctx->next_output_worker_id =
(ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
// Wait for the frame from worker thread.
@@ -709,8 +774,8 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
aom_film_grain_t *grain_params;
if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
&grain_params) == 0) {
- const int buf_idx = pbi->output_frame_index[*index];
- ctx->last_show_frame = buf_idx;
+ RefCntBuffer *const output_frame_buf = pbi->output_frames[*index];
+ ctx->last_show_frame = output_frame_buf;
if (ctx->need_resync) return NULL;
yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
@@ -725,8 +790,10 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
const int num_planes = av1_num_planes(cm);
if (pbi->ext_tile_debug && cm->single_tile_decoding &&
pbi->dec_tile_row >= 0) {
+ int tile_width, tile_height;
+ av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
const int tile_row = AOMMIN(pbi->dec_tile_row, cm->tile_rows - 1);
- const int mi_row = tile_row * cm->tile_height;
+ const int mi_row = tile_row * tile_height;
const int ssy = ctx->img.y_chroma_shift;
int plane;
ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
@@ -736,14 +803,15 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
}
}
- ctx->img.d_h =
- AOMMIN(cm->tile_height, cm->mi_rows - mi_row) * MI_SIZE;
+ ctx->img.d_h = AOMMIN(tile_height, cm->mi_rows - mi_row) * MI_SIZE;
}
if (pbi->ext_tile_debug && cm->single_tile_decoding &&
pbi->dec_tile_col >= 0) {
+ int tile_width, tile_height;
+ av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1);
- const int mi_col = tile_col * cm->tile_width;
+ const int mi_col = tile_col * tile_width;
const int ssx = ctx->img.x_chroma_shift;
const int is_hbd =
(ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
@@ -755,11 +823,10 @@ static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
}
}
- ctx->img.d_w =
- AOMMIN(cm->tile_width, cm->mi_cols - mi_col) * MI_SIZE;
+ ctx->img.d_w = AOMMIN(tile_width, cm->mi_cols - mi_col) * MI_SIZE;
}
- ctx->img.fb_priv = frame_bufs[buf_idx].raw_frame_buffer.priv;
+ ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv;
img = &ctx->img;
img->temporal_id = cm->temporal_layer_id;
img->spatial_id = cm->spatial_layer_id;
@@ -911,7 +978,8 @@ static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx,
AVxWorker *const worker = ctx->frame_workers;
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
- *update_info = frame_worker_data->pbi->refresh_frame_flags;
+ *update_info =
+ frame_worker_data->pbi->common.current_frame.refresh_frame_flags;
return AOM_CODEC_OK;
} else {
return AOM_CODEC_ERROR;
@@ -940,11 +1008,10 @@ static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx,
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
AV1Decoder *const pbi = frame_worker_data->pbi;
- RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs;
if (pbi->seen_frame_header && pbi->num_output_frames == 0)
return AOM_CODEC_ERROR;
- if (ctx->last_show_frame >= 0)
- *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
+ if (ctx->last_show_frame != NULL)
+ *corrupted = ctx->last_show_frame->buf.corrupted;
return AOM_CODEC_OK;
} else {
return AOM_CODEC_ERROR;
@@ -1124,8 +1191,9 @@ static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx,
FrameWorkerData *const frame_worker_data =
(FrameWorkerData *)worker->data1;
const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
- *tile_size =
- ((cm->tile_width * MI_SIZE) << 16) + cm->tile_height * MI_SIZE;
+ int tile_width, tile_height;
+ av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+ *tile_size = ((tile_width * MI_SIZE) << 16) + tile_height * MI_SIZE;
return AOM_CODEC_OK;
} else {
return AOM_CODEC_ERROR;
diff --git a/libaom/av1/av1_iface_common.h b/libaom/av1/av1_iface_common.h
index 713d8c3..5568c89 100644
--- a/libaom/av1/av1_iface_common.h
+++ b/libaom/av1/av1_iface_common.h
@@ -124,7 +124,12 @@ static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
} else {
yv12->flags = 0;
}
- yv12->border = (yv12->y_stride - img->w) / 2;
+
+ // Note(yunqing): if img is allocated the same way as the frame buffer,
+ // y_stride is 32-byte aligned. Also handle the cases where img is allocated
+ // without a border or with a stride_align smaller than 32.
+ int border = (yv12->y_stride - (int)((img->w + 31) & ~31)) / 2;
+ yv12->border = (border < 0) ? 0 : border;
yv12->subsampling_x = img->x_chroma_shift;
yv12->subsampling_y = img->y_chroma_shift;
return AOM_CODEC_OK;
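A quick worked example of the new border computation: with img->w = 352 and y_stride = 416, the 32-byte-aligned width is 352, so border = (416 - 352) / 2 = 32; with img->w = 353 the aligned width rises to 384 and the border drops to (416 - 384) / 2 = 16. The final clamp guards against strides tighter than the aligned width, where the subtraction would otherwise go negative.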
diff --git a/libaom/av1/common/alloccommon.c b/libaom/av1/common/alloccommon.c
index 39b6b73..1c8528a 100644
--- a/libaom/av1/common/alloccommon.c
+++ b/libaom/av1/common/alloccommon.c
@@ -139,7 +139,7 @@ void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
// Now we need to allocate enough space to store the line buffers for the
// stripes
const int frame_w = cm->superres_upscaled_width;
- const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+ const int use_highbd = cm->seq_params.use_highbitdepth;
for (int p = 0; p < num_planes; ++p) {
const int is_uv = p > 0;
diff --git a/libaom/av1/common/arm/av1_txfm_neon.c b/libaom/av1/common/arm/av1_txfm_neon.c
index de3c547..7e3a05a 100644
--- a/libaom/av1/common/arm/av1_txfm_neon.c
+++ b/libaom/av1/common/arm/av1_txfm_neon.c
@@ -12,6 +12,8 @@
#include <arm_neon.h>
#include <assert.h>
+#include "config/av1_rtcd.h"
+
#include "aom_ports/mem.h"
#include "av1/common/arm/mem_neon.h"
diff --git a/libaom/av1/common/arm/jnt_convolve_neon.c b/libaom/av1/common/arm/jnt_convolve_neon.c
index e5674ef..379ff98 100644
--- a/libaom/av1/common/arm/jnt_convolve_neon.c
+++ b/libaom/av1/common/arm/jnt_convolve_neon.c
@@ -23,19 +23,17 @@
#include "av1/common/arm/transpose_neon.h"
#if !defined(__aarch64__)
-static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
- const uint16_t fwd_offset,
- const uint16_t bck_offset,
- const int16x4_t sub_const_vec,
- const int16_t round_bits,
- const int use_jnt_comp_avg, uint8x8_t *t0) {
+static INLINE void compute_avg_4x1(
+ uint16x4_t res0, uint16x4_t d0, const uint16_t fwd_offset,
+ const uint16_t bck_offset, const int16x4_t sub_const_vec,
+ const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
int16x4_t tmp0;
uint16x4_t tmp_u0;
uint32x4_t sum0;
int32x4_t dst0;
int16x8_t tmp4;
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_comp_avg) {
const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
sum0 = vmull_n_u16(res0, fwd_offset);
@@ -65,12 +63,10 @@ static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
}
}
-static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
- const uint16_t fwd_offset,
- const uint16_t bck_offset,
- const int16x4_t sub_const,
- const int16_t round_bits,
- const int use_jnt_comp_avg, uint8x8_t *t0) {
+static INLINE void compute_avg_8x1(
+ uint16x8_t res0, uint16x8_t d0, const uint16_t fwd_offset,
+ const uint16_t bck_offset, const int16x4_t sub_const,
+ const int16_t round_bits, const int use_dist_wtd_comp_avg, uint8x8_t *t0) {
int16x4_t tmp0, tmp2;
int16x8_t f0;
uint32x4_t sum0, sum2;
@@ -78,7 +74,7 @@ static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
uint16x8_t tmp_u0;
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_comp_avg) {
const int32x4_t sub_const_vec = vmovl_s16(sub_const);
const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
@@ -123,7 +119,7 @@ static INLINE void compute_avg_4x4(
uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
const uint16_t fwd_offset, const uint16_t bck_offset,
const int16x4_t sub_const_vec, const int16_t round_bits,
- const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+ const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
int16x4_t tmp0, tmp1, tmp2, tmp3;
uint16x4_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
uint32x4_t sum0, sum1, sum2, sum3;
@@ -132,7 +128,7 @@ static INLINE void compute_avg_4x4(
int16x8_t tmp4, tmp5;
const int16x8_t zero = vdupq_n_s16(0);
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_comp_avg) {
const int32x4_t round_bits_vec = vdupq_n_s32((int32_t)(-round_bits));
const int32x4_t const_vec = vmovl_s16(sub_const_vec);
@@ -203,8 +199,8 @@ static INLINE void compute_avg_8x4(
uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
const uint16_t fwd_offset, const uint16_t bck_offset,
const int16x4_t sub_const, const int16_t round_bits,
- const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
- uint8x8_t *t3) {
+ const int use_dist_wtd_comp_avg, uint8x8_t *t0, uint8x8_t *t1,
+ uint8x8_t *t2, uint8x8_t *t3) {
int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int16x8_t f0, f1, f2, f3;
uint32x4_t sum0, sum1, sum2, sum3;
@@ -214,7 +210,7 @@ static INLINE void compute_avg_8x4(
uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
const int16x8_t zero = vdupq_n_s16(0);
- if (use_jnt_comp_avg) {
+ if (use_dist_wtd_comp_avg) {
const int32x4_t sub_const_vec = vmovl_s16(sub_const);
const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
@@ -319,7 +315,7 @@ static INLINE void compute_avg_8x4(
}
}
-static INLINE void jnt_convolve_2d_horiz_neon(
+static INLINE void dist_wtd_convolve_2d_horiz_neon(
const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
int16_t *x_filter_tmp, const int im_h, int w, const int round_0) {
const int bd = 8;
@@ -563,7 +559,7 @@ static INLINE void jnt_convolve_2d_horiz_neon(
}
}
-static INLINE void jnt_convolve_2d_vert_neon(
+static INLINE void dist_wtd_convolve_2d_vert_neon(
int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) {
uint8_t *dst_u8_ptr, *d_u8;
@@ -587,7 +583,7 @@ static INLINE void jnt_convolve_2d_vert_neon(
const uint16_t fwd_offset = conv_params->fwd_offset;
const uint16_t bck_offset = conv_params->bck_offset;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
uint16x4_t res4, d0;
@@ -652,8 +648,8 @@ static INLINE void jnt_convolve_2d_vert_neon(
d += (dst_stride << 2);
compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset,
- bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg,
- &t0, &t1);
+ bck_offset, sub_const_vec, round_bits,
+ use_dist_wtd_comp_avg, &t0, &t1);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
@@ -691,7 +687,7 @@ static INLINE void jnt_convolve_2d_vert_neon(
d += (dst_stride);
compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec,
- round_bits, use_jnt_comp_avg, &t0);
+ round_bits, use_dist_wtd_comp_avg, &t0);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
@@ -717,12 +713,12 @@ static INLINE void jnt_convolve_2d_vert_neon(
} while (w > 0);
}
-void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
assert(!(w % 4));
assert(!(h % 4));
@@ -748,19 +744,18 @@ void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
vst1q_s16(&x_filter_tmp[0], filter_x_coef);
- jnt_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
- x_filter_tmp, im_h, w, round_0);
+ dist_wtd_convolve_2d_horiz_neon(src_ptr, src_stride, im_block, im_stride,
+ x_filter_tmp, im_h, w, round_0);
- jnt_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, conv_params,
- y_filter, h, w);
+ dist_wtd_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride,
+ conv_params, y_filter, h, w);
}
-void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
- uint8_t *dst8, int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_neon(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params) {
uint8x8_t res0_8, res1_8, res2_8, res3_8, tmp_shift0, tmp_shift1, tmp_shift2,
tmp_shift3;
uint16x8_t res_q0, res_q1, res_q2, res_q3, tmp_q0, tmp_q1, tmp_q2, tmp_q3;
@@ -811,7 +806,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
compute_avg_8x4(tmp_q0, tmp_q1, tmp_q2, tmp_q3, res_q0, res_q1,
res_q2, res_q3, conv_params->fwd_offset,
conv_params->bck_offset, sub_const_vec, bits,
- conv_params->use_jnt_comp_avg, &tmp_shift0,
+ conv_params->use_dist_wtd_comp_avg, &tmp_shift0,
&tmp_shift1, &tmp_shift2, &tmp_shift3);
vst1_u8(dst8_1 + (0 * dst8_stride), tmp_shift0);
@@ -854,7 +849,7 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
compute_avg_4x4(tmp4, tmp5, tmp6, tmp7, res4, res5, res6, res7,
conv_params->fwd_offset, conv_params->bck_offset,
- sub_const_vec, bits, conv_params->use_jnt_comp_avg,
+ sub_const_vec, bits, conv_params->use_dist_wtd_comp_avg,
&tmp_shift0, &tmp_shift1);
vst1_lane_u32((uint32_t *)(dst8_1), vreinterpret_u32_u8(tmp_shift0), 0);
@@ -881,12 +876,12 @@ void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
}
}
-void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
assert(!(w % 4));
assert(!(h % 4));
@@ -902,7 +897,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const uint16_t fwd_offset = conv_params->fwd_offset;
const uint16_t bck_offset = conv_params->bck_offset;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
(void)filter_params_y;
(void)subpel_y_q4;
@@ -1031,8 +1026,8 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
- round_offset_vec, round_bits, use_jnt_comp_avg, &t0,
- &t1);
+ round_offset_vec, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
0); // 00 01 02 03
@@ -1103,7 +1098,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
bck_offset, round_offset_vec, round_bits,
- use_jnt_comp_avg, &t0);
+ use_dist_wtd_comp_avg, &t0);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
0); // 00 01 02 03
@@ -1231,11 +1226,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
d_tmp += (dst_stride << 2);
- compute_avg_8x4(
- res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
- vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
- vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
- round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+ compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1),
+ vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1, &t2, &t3);
store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
d_u8 += (dst8_stride << 2);
@@ -1243,11 +1239,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
d_tmp += (dst_stride << 2);
- compute_avg_8x4(
- res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
- vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
- vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
- round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+ compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5),
+ vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1, &t2, &t3);
store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
d_u8 += (dst8_stride << 2);
@@ -1319,7 +1316,7 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
bck_offset, round_offset64, round_bits,
- use_jnt_comp_avg, &t0);
+ use_dist_wtd_comp_avg, &t0);
vst1_u8(d_u8, t0);
d_u8 += (dst8_stride);
@@ -1342,12 +1339,12 @@ void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
}
}
-void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_neon(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
assert(!(w % 4));
assert(!(h % 4));
@@ -1363,7 +1360,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const uint16_t fwd_offset = conv_params->fwd_offset;
const uint16_t bck_offset = conv_params->bck_offset;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int shift_value = (conv_params->round_1 - 1 - bits);
(void)filter_params_x;
@@ -1489,8 +1486,8 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
- round_offset64, round_bits, use_jnt_comp_avg, &t0,
- &t1);
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
@@ -1535,7 +1532,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
bck_offset, round_offset64, round_bits,
- use_jnt_comp_avg, &t0);
+ use_dist_wtd_comp_avg, &t0);
vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
d_u8 += dst8_stride;
@@ -1654,11 +1651,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
d_tmp += (dst_stride << 2);
- compute_avg_8x4(
- res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
- vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
- vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
- round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+ compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1),
+ vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1, &t2, &t3);
store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
d_u8 += (dst8_stride << 2);
@@ -1666,11 +1664,12 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
d_tmp += (dst_stride << 2);
- compute_avg_8x4(
- res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
- vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
- vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
- round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+ compute_avg_8x4(res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5),
+ vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_dist_wtd_comp_avg,
+ &t0, &t1, &t2, &t3);
store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
d_u8 += (dst8_stride << 2);
@@ -1718,7 +1717,7 @@ void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
bck_offset, round_offset64, round_bits,
- use_jnt_comp_avg, &t0);
+ use_dist_wtd_comp_avg, &t0);
vst1_u8(d_u8, t0);
d_u8 += (dst8_stride);
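The use_jnt_comp_avg to use_dist_wtd_comp_avg renames throughout this file track the spec's "distance weighted compound" terminology; the arithmetic is untouched. In scalar form, the blend these NEON kernels vectorize is roughly the following (a sketch; per the AV1 spec the weights come in pairs summing to 1 << DIST_PRECISION_BITS, i.e. 16):

    // Scalar reference for the distance-weighted blend (sketch).
    // p0/p1 are the two intermediate predictions; the callers still subtract
    // the rounding offset, shift by round_bits, and clamp to 8 bits.
    static int dist_wtd_blend(int p0, int p1, int fwd_offset, int bck_offset) {
      return (p0 * fwd_offset + p1 * bck_offset) >> 4;  // DIST_PRECISION_BITS
    }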
diff --git a/libaom/av1/common/arm/warp_plane_neon.c b/libaom/av1/common/arm/warp_plane_neon.c
index 7f02d42..1062cc3 100644
--- a/libaom/av1/common/arm/warp_plane_neon.c
+++ b/libaom/av1/common/arm/warp_plane_neon.c
@@ -640,7 +640,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
uint16x4_t tmp16_lo = vld1_u16(p);
int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
int16x4_t tmp16_low;
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
res_lo = vmulq_s32(res_lo, bwd);
tmp32_lo = vmulq_s32(tmp32_lo, fwd);
tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
@@ -671,7 +671,7 @@ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
uint16x4_t tmp16_hi = vld1_u16(p4);
int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
int16x4_t tmp16_high;
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
res_hi = vmulq_s32(res_hi, bwd);
tmp32_hi = vmulq_s32(tmp32_hi, fwd);
tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
diff --git a/libaom/av1/common/av1_inv_txfm2d.c b/libaom/av1/common/av1_inv_txfm2d.c
index 4f2d57b..fc9c8d2 100644
--- a/libaom/av1/common/av1_inv_txfm2d.c
+++ b/libaom/av1/common/av1_inv_txfm2d.c
@@ -228,7 +228,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
(void)real_range_row;
if (cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1) {
// the adst4 may use 1 extra bit on top of opt_range_row at stage 1
- // so opt_range_col >= real_range_col will not hold
+ // so opt_range_row >= real_range_row will not hold
stage_range_row[i] = opt_range_row;
} else {
assert(opt_range_row >= real_range_row);
@@ -241,7 +241,7 @@ void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1;
(void)real_range_col;
if (cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1) {
- // the adst4 may use 1 extra bit on top of opt_range_row at stage 1
+ // the adst4 may use 1 extra bit on top of opt_range_col at stage 1
// so opt_range_col >= real_range_col will not hold
stage_range_col[i] = opt_range_col;
} else {
diff --git a/libaom/av1/common/av1_loopfilter.c b/libaom/av1/common/av1_loopfilter.c
index c5a86fb..0aa1f9b 100644
--- a/libaom/av1/common/av1_loopfilter.c
+++ b/libaom/av1/common/av1_loopfilter.c
@@ -32,7 +32,7 @@ static const int delta_lf_id_lut[MAX_MB_PLANE][2] = {
{ 0, 1 }, { 2, 2 }, { 3, 3 }
};
-typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;
+enum { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } UENUM1BYTE(EDGE_DIR);
static const int mode_lf_lut[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES
@@ -1426,9 +1426,9 @@ static void highbd_filter_selectively_horiz(
lfi->hev_thr, lfin->mblim,
lfin->lim, lfin->hev_thr, bd);
} else {
- aom_highbd_lpf_horizontal_14_dual_c(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, lfin->mblim,
- lfin->lim, lfin->hev_thr, bd);
+ aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
}
count = 2;
} else {
diff --git a/libaom/av1/common/av1_rtcd_defs.pl b/libaom/av1/common/av1_rtcd_defs.pl
index 7049f16..aca5ec7 100755..100644
--- a/libaom/av1/common/av1_rtcd_defs.pl
+++ b/libaom/av1/common/av1_rtcd_defs.pl
@@ -81,8 +81,11 @@ specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
# directional intra predictor functions
add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
+specialize qw/av1_dr_prediction_z1 avx2/;
add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
+specialize qw/av1_dr_prediction_z2 avx2/;
add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
+specialize qw/av1_dr_prediction_z3 avx2/;
# FILTER_INTRA predictor functions
add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
@@ -108,31 +111,19 @@ specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
#inv txfm
add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
+# TODO(http://crbug.com/aomedia/2350): avx2 is disabled due to test vector
+# mismatches.
+specialize qw/av1_inv_txfm_add ssse3 neon/;
add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
+# TODO(http://crbug.com/aomedia/2350): avx2 is disabled due to test vector
+# mismatches.
+specialize qw/av1_highbd_inv_txfm_add sse4_1/;
add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_16x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_16x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_16x32 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_32x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_32x16 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_8x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_8x32 sse4_1 avx2/;
-add_proto qw/void av1_highbd_inv_txfm_add_32x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-specialize qw/av1_highbd_inv_txfm_add_32x8 sse4_1 avx2/;
+specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
add_proto qw/void av1_highbd_inv_txfm_add_4x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
specialize qw/av1_highbd_inv_txfm_add_4x8 sse4_1/;
add_proto qw/void av1_highbd_inv_txfm_add_8x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
@@ -173,7 +164,9 @@ add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *out
add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
specialize qw/av1_highbd_dr_prediction_z1 avx2/;
add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
-#specialize qw/av1_highbd_dr_prediction_z2 avx2/;
+# TODO(niva213@gmail.com): Re-enable avx2 after fixing valgrind issue
+# https://crbug.com/aomedia/2316
+# specialize qw/av1_highbd_dr_prediction_z2 avx2/;
add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
specialize qw/av1_highbd_dr_prediction_z3 avx2/;
@@ -187,6 +180,10 @@ specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
+# Helper functions.
+add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
+specialize "av1_round_shift_array", qw/sse4_1 neon/;
+
#
# Encoder functions below this point.
#
@@ -221,9 +218,9 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
specialize qw/av1_fwd_txfm2d_8x4 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_8x16 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_8x16 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_16x8 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_16x8 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
specialize qw/av1_fwd_txfm2d_16x32 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -239,14 +236,14 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_8x8 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_16x16 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_32x32 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
- specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
+ specialize qw/av1_fwd_txfm2d_64x64 sse4_1 avx2/;
add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
specialize qw/av1_fwd_txfm2d_32x64 sse4_1/;
add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
@@ -263,17 +260,18 @@ if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
- add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
- specialize qw/av1_temporal_filter_apply sse2 msa/;
+ add_proto qw/void av1_apply_temporal_filter/, "const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred, int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1, int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred, int uv_buf_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
+ specialize qw/av1_apply_temporal_filter sse4_1/;
add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
# ENCODEMB INVOKE
add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
- specialize qw/av1_highbd_block_error sse2/;
+ specialize qw/av1_highbd_block_error sse2 avx2/;
- add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+ add_proto qw/void av1_highbd_apply_temporal_filter/, "const uint8_t *yf, int y_stride, const uint8_t *yp, int y_buf_stride, const uint8_t *uf, const uint8_t *vf, int uv_stride, const uint8_t *up, const uint8_t *vp, int uv_buf_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count";
+ specialize qw/av1_highbd_apply_temporal_filter sse4_1/;
add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
@@ -347,7 +345,7 @@ specialize qw/av1_highbd_warp_affine sse4_1/;
if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
add_proto qw/double compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
- specialize qw/compute_cross_correlation sse4_1/;
+ specialize qw/compute_cross_correlation sse4_1 avx2/;
}
# LOOP_RESTORATION functions
@@ -366,18 +364,18 @@ add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint
add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
-add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_dist_wtd_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_dist_wtd_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_dist_wtd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_dist_wtd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_dist_wtd_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
@@ -387,19 +385,19 @@ add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int sr
specialize qw/av1_convolve_x_sr sse2 avx2 neon/;
specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
specialize qw/av1_convolve_2d_scale sse4_1/;
- specialize qw/av1_jnt_convolve_2d sse2 ssse3 avx2 neon/;
- specialize qw/av1_jnt_convolve_2d_copy sse2 avx2 neon/;
- specialize qw/av1_jnt_convolve_x sse2 avx2 neon/;
- specialize qw/av1_jnt_convolve_y sse2 avx2 neon/;
+ specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon/;
+ specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/;
+ specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon/;
+ specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/;
specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/;
specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/;
specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/;
specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/;
specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
- specialize qw/av1_highbd_jnt_convolve_2d sse4_1 avx2/;
- specialize qw/av1_highbd_jnt_convolve_x sse4_1 avx2/;
- specialize qw/av1_highbd_jnt_convolve_y sse4_1 avx2/;
- specialize qw/av1_highbd_jnt_convolve_2d_copy sse4_1 avx2/;
+ specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2/;
+ specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2/;
+ specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2/;
+ specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2/;
# INTRA_EDGE functions
add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
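
The rtcd changes above are part of a tree-wide rename: the compound prediction kernels formerly prefixed av1_jnt_convolve_* ("jnt" for joint) become av1_dist_wtd_convolve_* (distance-weighted), matching what they actually compute. The operation all of these kernels share is a fixed-point weighted mean of the forward and backward predictions, visible in the convolve.c hunks below. A minimal standalone sketch, assuming DIST_PRECISION_BITS is 4 as in libaom's enums.h (an assumption here, not shown in this hunk):

    #include <stdint.h>

    #define DIST_PRECISION_BITS 4 /* assumed to match libaom */

    /* Distance-weighted compound average: fwd_offset + bck_offset sums to
     * (1 << DIST_PRECISION_BITS), so this is a fixed-point blend of the
     * forward and backward predictions. */
    static int32_t dist_wtd_avg(int32_t fwd, int32_t bck, int fwd_offset,
                                int bck_offset) {
      return (fwd * fwd_offset + bck * bck_offset) >> DIST_PRECISION_BITS;
    }
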
diff --git a/libaom/av1/common/av1_txfm.c b/libaom/av1/common/av1_txfm.c
index 4fbb756..ac43402 100644
--- a/libaom/av1/common/av1_txfm.c
+++ b/libaom/av1/common/av1_txfm.c
@@ -10,6 +10,7 @@
*/
#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
#include "av1/common/av1_txfm.h"
diff --git a/libaom/av1/common/av1_txfm.h b/libaom/av1/common/av1_txfm.h
index 59d64ca..20049b6 100644
--- a/libaom/av1/common/av1_txfm.h
+++ b/libaom/av1/common/av1_txfm.h
@@ -59,7 +59,9 @@ static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
const int64_t min_value = -(1LL << (bit - 1));
if (value < min_value || value > max_value) {
fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit);
+#if !CONFIG_AV1_ENCODER
assert(0);
+#endif
}
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
#if DO_RANGE_CHECK_CLAMP
@@ -110,7 +112,7 @@ typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit,
typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd);
-typedef enum TXFM_TYPE {
+enum {
TXFM_TYPE_DCT4,
TXFM_TYPE_DCT8,
TXFM_TYPE_DCT16,
@@ -125,7 +127,7 @@ typedef enum TXFM_TYPE {
TXFM_TYPE_IDENTITY32,
TXFM_TYPES,
TXFM_TYPE_INVALID,
-} TXFM_TYPE;
+} UENUM1BYTE(TXFM_TYPE);
typedef struct TXFM_2D_FLIP_CFG {
TX_SIZE tx_size;
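
The av1_txfm.h hunk shows two idioms this patch applies throughout the headers: the range-check assert is compiled out when the encoder is configured (presumably because the encoder's search can transiently produce out-of-range coefficients), and packed enum typedefs move from the compiler-specific ATTRIBUTE_PACKED to the UENUM1BYTE/SENUM1BYTE macros. Those macros live in aom_ports/mem.h (also touched by this patch); a sketch of their likely shape, reconstructed from how they are used here rather than quoted from the header:

    #include <stdint.h>

    /* Terminates the anonymous enum that precedes it, then typedefs a
     * one-byte integer with the enum's intended type name. */
    #define UENUM1BYTE(enumvar) \
      ;                         \
      typedef uint8_t enumvar

    #define SENUM1BYTE(enumvar) \
      ;                         \
      typedef int8_t enumvar

    /* enum { TXFM_TYPE_DCT4, ... } UENUM1BYTE(TXFM_TYPE); thus expands to
     * the constants plus "typedef uint8_t TXFM_TYPE;", giving one-byte
     * storage on every compiler, MSVC included. */
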
diff --git a/libaom/av1/common/blockd.h b/libaom/av1/common/blockd.h
index d6727b8..91ef3df 100644
--- a/libaom/av1/common/blockd.h
+++ b/libaom/av1/common/blockd.h
@@ -38,19 +38,19 @@ extern "C" {
#define MAX_DIFFWTD_MASK_BITS 1
// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
-typedef enum ATTRIBUTE_PACKED {
+enum {
DIFFWTD_38 = 0,
DIFFWTD_38_INV,
DIFFWTD_MASK_TYPES,
-} DIFFWTD_MASK_TYPE;
+} UENUM1BYTE(DIFFWTD_MASK_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
KEY_FRAME = 0,
INTER_FRAME = 1,
INTRA_ONLY_FRAME = 2, // replaces intra-only
S_FRAME = 3,
FRAME_TYPES,
-} FRAME_TYPE;
+} UENUM1BYTE(FRAME_TYPE);
static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) {
return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
@@ -157,15 +157,15 @@ static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
is a single probability table. */
typedef struct {
- // Number of base colors for Y (0) and UV (1)
- uint8_t palette_size[2];
// Value of base colors for Y, U, and V
uint16_t palette_colors[3 * PALETTE_MAX_SIZE];
+ // Number of base colors for Y (0) and UV (1)
+ uint8_t palette_size[2];
} PALETTE_MODE_INFO;
typedef struct {
- uint8_t use_filter_intra;
FILTER_INTRA_MODE filter_intra_mode;
+ uint8_t use_filter_intra;
} FILTER_INTRA_MODE_INFO;
static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = {
@@ -190,11 +190,6 @@ typedef struct RD_STATS {
int64_t ref_rdcost;
int zero_rate;
uint8_t invalid_rate;
-#if CONFIG_ONE_PASS_SVM
- int eob, eob_0, eob_1, eob_2, eob_3;
- int64_t rd, rd_0, rd_1, rd_2, rd_3;
- int64_t y_sse, sse_0, sse_1, sse_2, sse_3;
-#endif
#if CONFIG_RD_DEBUG
int txb_coeff_cost[MAX_MB_PLANE];
int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
@@ -205,10 +200,10 @@ typedef struct RD_STATS {
// This struct is used to group function args that are commonly
// sent together in functions related to interinter compound modes
typedef struct {
+ uint8_t *seg_mask;
int wedge_index;
int wedge_sign;
DIFFWTD_MASK_TYPE mask_type;
- uint8_t *seg_mask;
COMPOUND_TYPE type;
} INTERINTER_COMPOUND_DATA;
@@ -216,48 +211,18 @@ typedef struct {
#define TXK_TYPE_BUF_LEN 64
// This structure now relates to 4x4 block regions.
typedef struct MB_MODE_INFO {
- // Common for both INTER and INTRA blocks
- BLOCK_SIZE sb_type;
- PREDICTION_MODE mode;
- TX_SIZE tx_size;
- uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
- int8_t skip;
- int8_t skip_mode;
- int8_t segment_id;
- int8_t seg_id_predicted; // valid only when temporal_update is enabled
-
- // Only for INTRA blocks
- UV_PREDICTION_MODE uv_mode;
-
PALETTE_MODE_INFO palette_mode_info;
- uint8_t use_intrabc;
-
+ WarpedMotionParams wm_params;
+ // interinter members
+ INTERINTER_COMPOUND_DATA interinter_comp;
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+ int_mv mv[2];
// Only for INTER blocks
InterpFilters interp_filters;
- MV_REFERENCE_FRAME ref_frame[2];
-
- TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
-
- FILTER_INTRA_MODE_INFO filter_intra_mode_info;
-
- // The actual prediction angle is the base angle + (angle_delta * step).
- int8_t angle_delta[PLANE_TYPES];
-
- // interintra members
- INTERINTRA_MODE interintra_mode;
// TODO(debargha): Consolidate these flags
- int use_wedge_interintra;
int interintra_wedge_index;
int interintra_wedge_sign;
- // interinter members
- INTERINTER_COMPOUND_DATA interinter_comp;
- MOTION_MODE motion_mode;
int overlappable_neighbors[2];
- int_mv mv[2];
- uint8_t ref_mv_idx;
- PARTITION_TYPE partition;
- /* deringing gain *per-superblock* */
- int8_t cdef_strength;
int current_qindex;
int delta_lf_from_base;
int delta_lf[FRAME_LF_COUNT];
@@ -267,15 +232,43 @@ typedef struct MB_MODE_INFO {
int mi_col;
#endif
int num_proj_ref;
- WarpedMotionParams wm_params;
// Index of the alpha Cb and alpha Cr combination
int cfl_alpha_idx;
// Joint sign of alpha Cb and alpha Cr
int cfl_alpha_signs;
- int compound_idx;
+  // Indicates whether masked compound is used (1) or not (0).
int comp_group_idx;
+  // If comp_group_idx=0, indicates whether dist_wtd_comp (0) or avg_comp (1) is used.
+ int compound_idx;
+#if CONFIG_INSPECTION
+ int16_t tx_skip[TXK_TYPE_BUF_LEN];
+#endif
+ // Common for both INTER and INTRA blocks
+ BLOCK_SIZE sb_type;
+ PREDICTION_MODE mode;
+ // Only for INTRA blocks
+ UV_PREDICTION_MODE uv_mode;
+ // interintra members
+ INTERINTRA_MODE interintra_mode;
+ MOTION_MODE motion_mode;
+ PARTITION_TYPE partition;
+ TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
+ MV_REFERENCE_FRAME ref_frame[2];
+ int8_t use_wedge_interintra;
+ int8_t skip;
+ int8_t skip_mode;
+ uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+ TX_SIZE tx_size;
+ int8_t segment_id;
+ int8_t seg_id_predicted; // valid only when temporal_update is enabled
+ uint8_t use_intrabc;
+ // The actual prediction angle is the base angle + (angle_delta * step).
+ int8_t angle_delta[PLANE_TYPES];
+ /* deringing gain *per-superblock* */
+ int8_t cdef_strength;
+ uint8_t ref_mv_idx;
} MB_MODE_INFO;
static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
@@ -375,7 +368,7 @@ static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
}
#endif
-enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
+enum { MV_PRECISION_Q3, MV_PRECISION_Q4 } UENUM1BYTE(mv_precision);
struct buf_2d {
uint8_t *buf;
@@ -431,14 +424,6 @@ typedef struct macroblockd_plane {
#define BLOCK_OFFSET(x, i) \
((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0])))
-struct RefCntBuffer;
-
-typedef struct RefBuffer {
- int map_idx; // frame map idx
- struct RefCntBuffer *buf;
- struct scale_factors sf;
-} RefBuffer;
-
typedef struct {
DECLARE_ALIGNED(16, InterpKernel, vfilter);
DECLARE_ALIGNED(16, InterpKernel, hfilter);
@@ -494,11 +479,13 @@ typedef struct cfl_ctx {
int is_chroma_reference;
} CFL_CTX;
-typedef struct jnt_comp_params {
- int use_jnt_comp_avg;
+typedef struct dist_wtd_comp_params {
+ int use_dist_wtd_comp_avg;
int fwd_offset;
int bck_offset;
-} JNT_COMP_PARAMS;
+} DIST_WTD_COMP_PARAMS;
+
+struct scale_factors;
// Most/all of the pointers are mere pointers to actual arrays allocated
// elsewhere. This is mostly for coding convenience.
@@ -526,8 +513,8 @@ typedef struct macroblockd {
int mb_to_top_edge;
int mb_to_bottom_edge;
- /* pointers to reference frames */
- const RefBuffer *block_refs[2];
+ /* pointers to reference frame scale factors */
+ const struct scale_factors *block_ref_scale_factors[2];
/* pointer to current frame */
const YV12_BUFFER_CONFIG *cur_buf;
@@ -596,7 +583,7 @@ typedef struct macroblockd {
uint8_t *mc_buf[2];
CFL_CTX cfl;
- JNT_COMP_PARAMS jcp_param;
+ DIST_WTD_COMP_PARAMS jcp_param;
uint16_t cb_offset[MAX_MB_PLANE];
uint16_t txb_offset[MAX_MB_PLANE];
@@ -606,7 +593,7 @@ typedef struct macroblockd {
uint8_t *tmp_obmc_bufs[2];
} MACROBLOCKD;
-static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
+static INLINE int is_cur_buf_hbd(const MACROBLOCKD *xd) {
return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
}
@@ -781,11 +768,13 @@ static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size,
static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
const MACROBLOCKD *xd,
- TX_SIZE tx_size) {
+ TX_SIZE tx_size,
+ int is_screen_content_type) {
const MB_MODE_INFO *const mbmi = xd->mi[0];
if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
- xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+ xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32 ||
+ is_screen_content_type)
return DCT_DCT;
return intra_mode_to_tx_type(mbmi, plane_type);
@@ -1049,7 +1038,8 @@ motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
assert(!has_second_ref(mbmi));
if (mbmi->num_proj_ref >= 1 &&
- (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) {
+ (allow_warped_motion &&
+ !av1_is_scaled(xd->block_ref_scale_factors[0]))) {
if (xd->cur_frame_force_integer_mv) {
return OBMC_CAUSAL;
}
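
Most of the blockd.h churn is MB_MODE_INFO being re-sorted by member size: pointers, embedded structs, and ints first, then the int8_t/uint8_t tail. Since this struct now exists per 4x4 block region, eliminating interior padding matters. A self-contained illustration of the effect, with hypothetical members rather than the real struct:

    #include <stdint.h>
    #include <stdio.h>

    struct interleaved { /* small and large members alternate */
      int8_t a;
      int32_t b;
      int8_t c;
      int32_t d;
    }; /* typically 16 bytes: two 3-byte padding holes */

    struct grouped { /* large members first, small ones last */
      int32_t b;
      int32_t d;
      int8_t a;
      int8_t c;
    }; /* typically 12 bytes: one 2-byte tail pad */

    int main(void) {
      printf("interleaved: %zu, grouped: %zu\n",
             sizeof(struct interleaved), sizeof(struct grouped));
      return 0;
    }
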
diff --git a/libaom/av1/common/cdef.c b/libaom/av1/common/cdef.c
index 556dede..63f9883 100644
--- a/libaom/av1/common/cdef.c
+++ b/libaom/av1/common/cdef.c
@@ -80,7 +80,6 @@ int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
if (!is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride)) {
dlist[count].by = r >> r_shift;
dlist[count].bx = c >> c_shift;
- dlist[count].skip = 0;
count++;
}
}
diff --git a/libaom/av1/common/cdef_block.c b/libaom/av1/common/cdef_block.c
index 845df37..dfd5882 100644
--- a/libaom/av1/common/cdef_block.c
+++ b/libaom/av1/common/cdef_block.c
@@ -232,8 +232,8 @@ void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
}
for (bi = 0; bi < cdef_count; bi++) {
- int t = dlist[bi].skip ? 0 : pri_strength;
- int s = dlist[bi].skip ? 0 : sec_strength;
+ int t = pri_strength;
+ int s = sec_strength;
by = dlist[bi].by;
bx = dlist[bi].bx;
if (dst8)
diff --git a/libaom/av1/common/cdef_block.h b/libaom/av1/common/cdef_block.h
index 0e921e0..8321d48 100644
--- a/libaom/av1/common/cdef_block.h
+++ b/libaom/av1/common/cdef_block.h
@@ -38,7 +38,6 @@ DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]);
typedef struct {
uint8_t by;
uint8_t bx;
- uint8_t skip;
} cdef_list;
typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16,
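
Taken together, the three cdef changes delete dead state: sb_compute_cdef_list only ever appended blocks that are not skipped, so dlist[bi].skip was always zero and the conditional strength selection in cdef_filter_fb always chose pri_strength/sec_strength. Dropping the field also shrinks each list entry:

    #include <stdint.h>

    typedef struct { uint8_t by, bx, skip; } cdef_list_old; /* 3 bytes */
    /* Every listed block is known non-skipped; coordinates suffice. */
    typedef struct { uint8_t by, bx; } cdef_list_new;       /* 2 bytes */
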
diff --git a/libaom/av1/common/cfl.c b/libaom/av1/common/cfl.c
index 99410be..65e18e8 100644
--- a/libaom/av1/common/cfl.c
+++ b/libaom/av1/common/cfl.c
@@ -37,7 +37,7 @@ void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
assert(pred_plane < CFL_PRED_PLANES);
assert(width <= CFL_BUF_LINE);
- if (get_bitdepth_data_path_index(xd)) {
+ if (is_cur_buf_hbd(xd)) {
uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input);
memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1);
return;
@@ -69,7 +69,7 @@ void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
assert(pred_plane < CFL_PRED_PLANES);
assert(width <= CFL_BUF_LINE);
assert(height <= CFL_BUF_LINE);
- if (get_bitdepth_data_path_index(xd)) {
+ if (is_cur_buf_hbd(xd)) {
uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride,
width, height);
@@ -196,7 +196,7 @@ void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);
assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
CFL_BUF_SQUARE);
- if (get_bitdepth_data_path_index(xd)) {
+ if (is_cur_buf_hbd(xd)) {
uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3,
xd->bd);
@@ -388,8 +388,7 @@ void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
assert(!((row & 1) && tx_size_high[tx_size] != 4));
sub8x8_adjust_offset(cfl, &row, &col);
}
- cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size,
- get_bitdepth_data_path_index(xd));
+ cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size, is_cur_buf_hbd(xd));
}
void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
@@ -405,5 +404,5 @@ void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {
const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
tx_size = get_tx_size(width, height);
cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
- get_bitdepth_data_path_index(xd));
+ is_cur_buf_hbd(xd));
}
diff --git a/libaom/av1/common/convolve.c b/libaom/av1/common/convolve.c
index 8ba3ed4..5a55ece 100644
--- a/libaom/av1/common/convolve.c
+++ b/libaom/av1/common/convolve.c
@@ -238,16 +238,16 @@ void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
(void)conv_params;
for (int y = 0; y < h; ++y) {
- memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
+ memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
}
}
-void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
@@ -290,7 +290,7 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
if (conv_params->do_average) {
int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -308,12 +308,12 @@ void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
}
}
-void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -341,7 +341,7 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
if (conv_params->do_average) {
int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -358,12 +358,12 @@ void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
}
}
-void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
- int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride,
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -391,7 +391,7 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
if (conv_params->do_average) {
int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -408,12 +408,11 @@ void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
}
}
-void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
- uint8_t *dst8, int dst8_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_c(
+ const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bits =
@@ -434,7 +433,7 @@ void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,
if (conv_params->do_average) {
int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -511,7 +510,7 @@ void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
if (conv_params->is_compound) {
if (conv_params->do_average) {
int32_t tmp = dst16[y * dst16_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -632,7 +631,7 @@ void av1_highbd_convolve_2d_copy_sr_c(
(void)bd;
for (int y = 0; y < h; ++y) {
- memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
+ memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
}
}
@@ -748,13 +747,11 @@ void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
}
}
-void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
- uint16_t *dst16, int dst16_stride, int w,
- int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_2d_c(
+ const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
+ int w, int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
int x, y, k;
int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
CONV_BUF_TYPE *dst = conv_params->dst;
@@ -799,7 +796,7 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
if (conv_params->do_average) {
int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -817,13 +814,11 @@ void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,
}
}
-void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
- uint16_t *dst16, int dst16_stride, int w,
- int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_x_c(
+ const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
+ int w, int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -851,7 +846,7 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
if (conv_params->do_average) {
int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -868,13 +863,11 @@ void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,
}
}
-void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
- uint16_t *dst16, int dst16_stride, int w,
- int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
+void av1_highbd_dist_wtd_convolve_y_c(
+ const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
+ int w, int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int fo_vert = filter_params_y->taps / 2 - 1;
@@ -902,7 +895,7 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
if (conv_params->do_average) {
int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -919,7 +912,7 @@ void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,
}
}
-void av1_highbd_jnt_convolve_2d_copy_c(
+void av1_highbd_dist_wtd_convolve_2d_copy_c(
const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
int w, int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -943,7 +936,7 @@ void av1_highbd_jnt_convolve_2d_copy_c(
res += round_offset;
if (conv_params->do_average) {
int32_t tmp = dst[y * dst_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -1019,7 +1012,7 @@ void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
if (conv_params->is_compound) {
if (conv_params->do_average) {
int32_t tmp = dst16[y * dst16_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
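
Besides the rename, note the quiet memcpy-to-memmove switch in both copy kernels, presumably because source and destination rows can overlap for some callers. Overlapping ranges are undefined behaviour for memcpy but well-defined for memmove, which copies as if through a temporary buffer. A tiny demonstration:

    #include <stdio.h>
    #include <string.h>

    int main(void) {
      char buf[] = "abcdef";
      /* memcpy(buf, buf + 1, 5) would be undefined here; memmove is
       * guaranteed to behave as if it staged the bytes first. */
      memmove(buf, buf + 1, 5);
      printf("%s\n", buf); /* prints "bcdeff" */
      return 0;
    }
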
diff --git a/libaom/av1/common/convolve.h b/libaom/av1/common/convolve.h
index d0972db..e5479e6 100644
--- a/libaom/av1/common/convolve.h
+++ b/libaom/av1/common/convolve.h
@@ -26,7 +26,7 @@ typedef struct ConvolveParams {
int round_1;
int plane;
int is_compound;
- int use_jnt_comp_avg;
+ int use_dist_wtd_comp_avg;
int fwd_offset;
int bck_offset;
} ConvolveParams;
diff --git a/libaom/av1/common/debugmodes.c b/libaom/av1/common/debugmodes.c
index 5242f19..b26c7dd 100644
--- a/libaom/av1/common/debugmodes.c
+++ b/libaom/av1/common/debugmodes.c
@@ -40,7 +40,7 @@ static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
mi++;
}
fprintf(file, "\n");
- mi += MAX_MIB_SIZE;
+ mi += cm->mi_stride - cols;
}
fprintf(file, "\n");
}
@@ -68,7 +68,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
mi++;
}
fprintf(mvs, "\n");
- mi += MAX_MIB_SIZE;
+ mi += cm->mi_stride - cols;
}
fprintf(mvs, "\n");
@@ -82,7 +82,7 @@ void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
mi++;
}
fprintf(mvs, "\n");
- mi += MAX_MIB_SIZE;
+ mi += cm->mi_stride - cols;
}
fprintf(mvs, "\n");
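
All three debugmodes loops had the same bug: after printing cols entries in a row they advanced mi by the constant MAX_MIB_SIZE instead of to the start of the next grid row, so the dump walked the wrong rows whenever the mi grid's stride differed from that constant. The fix advances by (stride - cols). The general pattern, with a hypothetical buffer rather than libaom's mi grid:

    #include <stdio.h>

    static void print_grid(const int *grid, int rows, int cols, int stride) {
      const int *p = grid;
      for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
          printf("%d ", *p);
          p++;
        }
        printf("\n");
        p += stride - cols; /* skip the padding at the end of the row */
      }
    }

    int main(void) {
      const int grid[2 * 4] = { 1, 2, 3, 0, 4, 5, 6, 0 }; /* stride 4 */
      print_grid(grid, 2, 3, 4);
      return 0;
    }
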
diff --git a/libaom/av1/common/entropy.c b/libaom/av1/common/entropy.c
index 4f95ef6..f63ac98 100644
--- a/libaom/av1/common/entropy.c
+++ b/libaom/av1/common/entropy.c
@@ -101,7 +101,7 @@ void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) {
RESET_CDF_COUNTER(fc->refmv_cdf, 2);
RESET_CDF_COUNTER(fc->drl_cdf, 2);
RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
- RESET_CDF_COUNTER(fc->compound_type_cdf, COMPOUND_TYPES - 1);
+ RESET_CDF_COUNTER(fc->compound_type_cdf, MASKED_COMPOUND_TYPES);
RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16);
RESET_CDF_COUNTER(fc->interintra_cdf, 2);
RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2);
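
compound_type_cdf changes in name only: COMPOUND_DISTWTD joins the COMPOUND_TYPE enum (see the enums.h hunk below), so the symbol coded by this CDF is re-expressed as the count of masked compound types. Both the old and new sizes evaluate to 2, which is why the default tables in entropymode.c keep the same shape:

    /* Values as laid out in the enums.h hunk in this patch. */
    enum {
      COMPOUND_AVERAGE,          /* 0 */
      COMPOUND_DISTWTD,          /* 1, newly distinguished */
      COMPOUND_WEDGE,            /* 2 */
      COMPOUND_DIFFWTD,          /* 3 */
      COMPOUND_TYPES,            /* 4 (was 3 before this patch) */
      MASKED_COMPOUND_TYPES = 2, /* wedge + diffwtd, the coded symbols */
    };
    /* Old CDF size: COMPOUND_TYPES - 1 == 2 (with the pre-patch enum);
     * new CDF size: MASKED_COMPOUND_TYPES == 2. */
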
diff --git a/libaom/av1/common/entropy.h b/libaom/av1/common/entropy.h
index 991692c..41218d3 100644
--- a/libaom/av1/common/entropy.h
+++ b/libaom/av1/common/entropy.h
@@ -54,12 +54,12 @@ extern "C" {
#define BASE_CONTEXT_POSITION_NUM 12
-typedef enum TX_CLASS {
+enum {
TX_CLASS_2D = 0,
TX_CLASS_HORIZ = 1,
TX_CLASS_VERT = 2,
TX_CLASSES = 3,
-} TX_CLASS;
+} UENUM1BYTE(TX_CLASS);
#define DCT_MAX_VALUE 16384
#define DCT_MAX_VALUE_HIGH10 65536
diff --git a/libaom/av1/common/entropymode.c b/libaom/av1/common/entropymode.c
index 51bbea7..90702ac 100644
--- a/libaom/av1/common/entropymode.c
+++ b/libaom/av1/common/entropymode.c
@@ -488,17 +488,17 @@ static const aom_cdf_prob
{ AOM_CDF2(16384) }
};
-static const aom_cdf_prob
- default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)] = {
- { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
- { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) },
- { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) },
- { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
- { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
- { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
- { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) },
- { AOM_CDF2(16384) }
- };
+static const aom_cdf_prob default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
+ MASKED_COMPOUND_TYPES)] = {
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) },
+ { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) },
+ { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }
+};
static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)] =
{ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
@@ -1072,9 +1072,9 @@ void av1_setup_frame_contexts(AV1_COMMON *cm) {
// TODO(jack.haughton@argondesign.com): don't think this should be necessary,
// but could do with fuller testing
if (cm->large_scale_tile) {
- for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- if (cm->current_frame.frame_refs[i].buf != NULL)
- cm->current_frame.frame_refs[i].buf->frame_context = *cm->fc;
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ RefCntBuffer *const buf = get_ref_frame_buf(cm, i);
+ if (buf != NULL) buf->frame_context = *cm->fc;
}
for (int i = 0; i < FRAME_BUFFERS; ++i)
cm->buffer_pool->frame_bufs[i].frame_context = *cm->fc;
@@ -1086,10 +1086,8 @@ void av1_setup_past_independence(AV1_COMMON *cm) {
// Features disabled, 0, with delta coding (Default state).
av1_clearall_segfeatures(&cm->seg);
- cm->current_frame_seg_map = cm->cur_frame->seg_map;
-
- if (cm->current_frame_seg_map)
- memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+ if (cm->cur_frame->seg_map)
+ memset(cm->cur_frame->seg_map, 0, (cm->mi_rows * cm->mi_cols));
// reset mode ref deltas
av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
@@ -1099,7 +1097,6 @@ void av1_setup_past_independence(AV1_COMMON *cm) {
av1_default_coef_probs(cm);
init_mode_probs(cm->fc);
av1_init_mv_probs(cm);
- av1_init_lv_map(cm);
cm->fc->initialized = 1;
av1_setup_frame_contexts(cm);
diff --git a/libaom/av1/common/entropymode.h b/libaom/av1/common/entropymode.h
index 7047f34..69b5218 100644
--- a/libaom/av1/common/entropymode.h
+++ b/libaom/av1/common/entropymode.h
@@ -92,7 +92,8 @@ typedef struct frame_contexts {
aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS]
[CDF_SIZE(INTER_COMPOUND_MODES)];
- aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)];
+ aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL]
+ [CDF_SIZE(MASKED_COMPOUND_TYPES)];
aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)];
aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)];
aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)];
diff --git a/libaom/av1/common/entropymv.h b/libaom/av1/common/entropymv.h
index fa818a2..cddc807 100644
--- a/libaom/av1/common/entropymv.h
+++ b/libaom/av1/common/entropymv.h
@@ -30,12 +30,12 @@ void av1_init_mv_probs(struct AV1Common *cm);
/* Symbols for coding which components are zero jointly */
#define MV_JOINTS 4
-typedef enum {
+enum {
MV_JOINT_ZERO = 0, /* Zero vector */
MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */
MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */
MV_JOINT_HNZVNZ = 3, /* Both components nonzero */
-} MV_JOINT_TYPE;
+} UENUM1BYTE(MV_JOINT_TYPE);
static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) {
return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ;
@@ -47,7 +47,7 @@ static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
/* Symbols for coding magnitude class of nonzero components */
#define MV_CLASSES 11
-typedef enum {
+enum {
MV_CLASS_0 = 0, /* (0, 2] integer pel */
MV_CLASS_1 = 1, /* (2, 4] integer pel */
MV_CLASS_2 = 2, /* (4, 8] integer pel */
@@ -59,7 +59,7 @@ typedef enum {
MV_CLASS_8 = 8, /* (256, 512] integer pel */
MV_CLASS_9 = 9, /* (512, 1024] integer pel */
MV_CLASS_10 = 10, /* (1024,2048] integer pel */
-} MV_CLASS_TYPE;
+} UENUM1BYTE(MV_CLASS_TYPE);
#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
#define CLASS0_SIZE (1 << CLASS0_BITS)
@@ -91,11 +91,11 @@ typedef struct {
nmv_component comps[2];
} nmv_context;
-typedef enum {
+enum {
MV_SUBPEL_NONE = -1,
MV_SUBPEL_LOW_PRECISION = 0,
MV_SUBPEL_HIGH_PRECISION,
-} MvSubpelPrecision;
+} SENUM1BYTE(MvSubpelPrecision);
#ifdef __cplusplus
} // extern "C"
diff --git a/libaom/av1/common/enums.h b/libaom/av1/common/enums.h
index eb17c58..fbacc89 100644
--- a/libaom/av1/common/enums.h
+++ b/libaom/av1/common/enums.h
@@ -16,6 +16,7 @@
#include "aom/aom_codec.h"
#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
#ifdef __cplusplus
extern "C" {
@@ -84,21 +85,12 @@ extern "C" {
// Profile 2. 8-bit and 10-bit 4:2:2
// 12-bit 4:0:0, 4:2:2 and 4:4:4
// Since we have three bits for the profiles, it can be extended later.
-typedef enum BITSTREAM_PROFILE {
+enum {
PROFILE_0,
PROFILE_1,
PROFILE_2,
MAX_PROFILES,
-} BITSTREAM_PROFILE;
-
-#define LEVEL_MAJOR_BITS 3
-#define LEVEL_MINOR_BITS 2
-#define LEVEL_BITS (LEVEL_MAJOR_BITS + LEVEL_MINOR_BITS)
-
-#define LEVEL_MAJOR_MIN 2
-#define LEVEL_MAJOR_MAX ((1 << LEVEL_MAJOR_BITS) - 1 + LEVEL_MAJOR_MIN)
-#define LEVEL_MINOR_MIN 0
-#define LEVEL_MINOR_MAX ((1 << LEVEL_MINOR_BITS) - 1)
+} SENUM1BYTE(BITSTREAM_PROFILE);
#define OP_POINTS_CNT_MINUS_1_BITS 5
#define OP_POINTS_IDC_BITS 12
@@ -138,7 +130,7 @@ typedef enum ATTRIBUTE_PACKED {
// 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
#define SQR_BLOCK_SIZES 6
-typedef enum ATTRIBUTE_PACKED {
+enum {
PARTITION_NONE,
PARTITION_HORZ,
PARTITION_VERT,
@@ -152,7 +144,7 @@ typedef enum ATTRIBUTE_PACKED {
EXT_PARTITION_TYPES,
PARTITION_TYPES = PARTITION_SPLIT + 1,
PARTITION_INVALID = 255
-} PARTITION_TYPE;
+} UENUM1BYTE(PARTITION_TYPE);
typedef char PARTITION_CONTEXT;
#define PARTITION_PLOFFSET 4 // number of probability models per block size
@@ -160,12 +152,7 @@ typedef char PARTITION_CONTEXT;
#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
// block transform size
-#if defined(_MSC_VER)
-typedef uint8_t TX_SIZE;
-enum ATTRIBUTE_PACKED {
-#else
-typedef enum ATTRIBUTE_PACKED {
-#endif
+enum {
TX_4X4, // 4x4 transform
TX_8X8, // 8x8 transform
TX_16X16, // 16x16 transform
@@ -189,11 +176,7 @@ typedef enum ATTRIBUTE_PACKED {
TX_SIZES = TX_4X8, // Does NOT include rectangular transforms
TX_SIZES_LARGEST = TX_64X64,
TX_INVALID = 255 // Invalid transform size
-#if defined(_MSC_VER)
-};
-#else
-} TX_SIZE;
-#endif
+} UENUM1BYTE(TX_SIZE);
#define TX_SIZE_LUMA_MIN (TX_4X4)
/* We don't need to code a transform size unless the allowed size is at least
@@ -215,7 +198,7 @@ typedef enum ATTRIBUTE_PACKED {
#define TX_PAD_HOR 4
-// Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability
+// Pad 4 extra rows (0 on top and 4 on bottom) to remove vertical availability
// check.
-#define TX_PAD_TOP 2
+#define TX_PAD_TOP 0
#define TX_PAD_BOTTOM 4
#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
// Pad 16 extra bytes to avoid reading overflow in SIMD optimization.
@@ -227,23 +210,23 @@ typedef enum ATTRIBUTE_PACKED {
#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
// frame transform mode
-typedef enum ATTRIBUTE_PACKED {
+enum {
ONLY_4X4, // use only 4x4 transform
TX_MODE_LARGEST, // transform size is the largest possible for pu size
TX_MODE_SELECT, // transform specified for each block
TX_MODES,
-} TX_MODE;
+} UENUM1BYTE(TX_MODE);
// 1D tx types
-typedef enum ATTRIBUTE_PACKED {
+enum {
DCT_1D,
ADST_1D,
FLIPADST_1D,
IDTX_1D,
TX_TYPES_1D,
-} TX_TYPE_1D;
+} UENUM1BYTE(TX_TYPE_1D);
-typedef enum ATTRIBUTE_PACKED {
+enum {
DCT_DCT, // DCT in both horizontal and vertical
ADST_DCT, // ADST in vertical, DCT in horizontal
DCT_ADST, // DCT in vertical, ADST in horizontal
@@ -261,9 +244,9 @@ typedef enum ATTRIBUTE_PACKED {
V_FLIPADST, // FLIPADST in vertical, identity in horizontal
H_FLIPADST, // Identity in vertical, FLIPADST in horizontal
TX_TYPES,
-} TX_TYPE;
+} UENUM1BYTE(TX_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
REG_REG,
REG_SMOOTH,
REG_SHARP,
@@ -273,9 +256,9 @@ typedef enum ATTRIBUTE_PACKED {
SHARP_REG,
SHARP_SMOOTH,
SHARP_SHARP,
-} DUAL_FILTER_TYPE;
+} UENUM1BYTE(DUAL_FILTER_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
// DCT only
EXT_TX_SET_DCTONLY,
// DCT + Identity only
@@ -289,7 +272,7 @@ typedef enum ATTRIBUTE_PACKED {
// Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
EXT_TX_SET_ALL16,
EXT_TX_SET_TYPES
-} TxSetType;
+} UENUM1BYTE(TxSetType);
#define IS_2D_TRANSFORM(tx_type) (tx_type < IDTX)
@@ -297,7 +280,7 @@ typedef enum ATTRIBUTE_PACKED {
#define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER
#define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA
-typedef enum ATTRIBUTE_PACKED {
+enum {
AOM_LAST_FLAG = 1 << 0,
AOM_LAST2_FLAG = 1 << 1,
AOM_LAST3_FLAG = 1 << 2,
@@ -306,19 +289,15 @@ typedef enum ATTRIBUTE_PACKED {
AOM_ALT2_FLAG = 1 << 5,
AOM_ALT_FLAG = 1 << 6,
AOM_REFFRAME_ALL = (1 << 7) - 1
-} AOM_REFFRAME;
+} UENUM1BYTE(AOM_REFFRAME);
-typedef enum ATTRIBUTE_PACKED {
+enum {
UNIDIR_COMP_REFERENCE,
BIDIR_COMP_REFERENCE,
COMP_REFERENCE_TYPES,
-} COMP_REFERENCE_TYPE;
+} UENUM1BYTE(COMP_REFERENCE_TYPE);
-typedef enum ATTRIBUTE_PACKED {
- PLANE_TYPE_Y,
- PLANE_TYPE_UV,
- PLANE_TYPES
-} PLANE_TYPE;
+enum { PLANE_TYPE_Y, PLANE_TYPE_UV, PLANE_TYPES } UENUM1BYTE(PLANE_TYPE);
#define CFL_ALPHABET_SIZE_LOG2 4
#define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
@@ -326,24 +305,20 @@ typedef enum ATTRIBUTE_PACKED {
#define CFL_IDX_U(idx) (idx >> CFL_ALPHABET_SIZE_LOG2)
#define CFL_IDX_V(idx) (idx & (CFL_ALPHABET_SIZE - 1))
-typedef enum ATTRIBUTE_PACKED {
- CFL_PRED_U,
- CFL_PRED_V,
- CFL_PRED_PLANES
-} CFL_PRED_TYPE;
+enum { CFL_PRED_U, CFL_PRED_V, CFL_PRED_PLANES } UENUM1BYTE(CFL_PRED_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
CFL_SIGN_ZERO,
CFL_SIGN_NEG,
CFL_SIGN_POS,
CFL_SIGNS
-} CFL_SIGN_TYPE;
+} UENUM1BYTE(CFL_SIGN_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
CFL_DISALLOWED,
CFL_ALLOWED,
CFL_ALLOWED_TYPES
-} CFL_ALLOWED_TYPE;
+} UENUM1BYTE(CFL_ALLOWED_TYPE);
// CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid
#define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1)
@@ -360,12 +335,12 @@ typedef enum ATTRIBUTE_PACKED {
#define CFL_CONTEXT_V(js) \
(CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
-typedef enum ATTRIBUTE_PACKED {
+enum {
PALETTE_MAP,
COLOR_MAP_TYPES,
-} COLOR_MAP_TYPE;
+} UENUM1BYTE(COLOR_MAP_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
TWO_COLORS,
THREE_COLORS,
FOUR_COLORS,
@@ -374,9 +349,9 @@ typedef enum ATTRIBUTE_PACKED {
SEVEN_COLORS,
EIGHT_COLORS,
PALETTE_SIZES
-} PALETTE_SIZE;
+} UENUM1BYTE(PALETTE_SIZE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
PALETTE_COLOR_ONE,
PALETTE_COLOR_TWO,
PALETTE_COLOR_THREE,
@@ -386,11 +361,11 @@ typedef enum ATTRIBUTE_PACKED {
PALETTE_COLOR_SEVEN,
PALETTE_COLOR_EIGHT,
PALETTE_COLORS
-} PALETTE_COLOR;
+} UENUM1BYTE(PALETTE_COLOR);
// Note: All directional predictors must be between V_PRED and D67_PRED (both
// inclusive).
-typedef enum ATTRIBUTE_PACKED {
+enum {
DC_PRED, // Average of above and left pixels
V_PRED, // Vertical
H_PRED, // Horizontal
@@ -431,11 +406,11 @@ typedef enum ATTRIBUTE_PACKED {
INTER_MODE_END = MB_MODE_COUNT,
INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode.
INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks
-} PREDICTION_MODE;
+} UENUM1BYTE(PREDICTION_MODE);
// TODO(ltrudeau) Do we really want to pack this?
// TODO(ltrudeau) Do we match with PREDICTION_MODE?
-typedef enum ATTRIBUTE_PACKED {
+enum {
UV_DC_PRED, // Average of above and left pixels
UV_V_PRED, // Vertical
UV_H_PRED, // Horizontal
@@ -452,38 +427,71 @@ typedef enum ATTRIBUTE_PACKED {
UV_CFL_PRED, // Chroma-from-Luma
UV_INTRA_MODES,
UV_MODE_INVALID, // For uv_mode in inter blocks
-} UV_PREDICTION_MODE;
+} UENUM1BYTE(UV_PREDICTION_MODE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
SIMPLE_TRANSLATION,
OBMC_CAUSAL, // 2-sided OBMC
WARPED_CAUSAL, // 2-sided WARPED
MOTION_MODES
-} MOTION_MODE;
+} UENUM1BYTE(MOTION_MODE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
II_DC_PRED,
II_V_PRED,
II_H_PRED,
II_SMOOTH_PRED,
INTERINTRA_MODES
-} INTERINTRA_MODE;
+} UENUM1BYTE(INTERINTRA_MODE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
COMPOUND_AVERAGE,
+ COMPOUND_DISTWTD,
COMPOUND_WEDGE,
COMPOUND_DIFFWTD,
COMPOUND_TYPES,
-} COMPOUND_TYPE;
+ MASKED_COMPOUND_TYPES = 2,
+} UENUM1BYTE(COMPOUND_TYPE);
-typedef enum ATTRIBUTE_PACKED {
+enum {
FILTER_DC_PRED,
FILTER_V_PRED,
FILTER_H_PRED,
FILTER_D157_PRED,
FILTER_PAETH_PRED,
FILTER_INTRA_MODES,
-} FILTER_INTRA_MODE;
+} UENUM1BYTE(FILTER_INTRA_MODE);
+
+enum {
+ SEQ_LEVEL_2_0,
+ SEQ_LEVEL_2_1,
+ SEQ_LEVEL_2_2,
+ SEQ_LEVEL_2_3,
+ SEQ_LEVEL_3_0,
+ SEQ_LEVEL_3_1,
+ SEQ_LEVEL_3_2,
+ SEQ_LEVEL_3_3,
+ SEQ_LEVEL_4_0,
+ SEQ_LEVEL_4_1,
+ SEQ_LEVEL_4_2,
+ SEQ_LEVEL_4_3,
+ SEQ_LEVEL_5_0,
+ SEQ_LEVEL_5_1,
+ SEQ_LEVEL_5_2,
+ SEQ_LEVEL_5_3,
+ SEQ_LEVEL_6_0,
+ SEQ_LEVEL_6_1,
+ SEQ_LEVEL_6_2,
+ SEQ_LEVEL_6_3,
+ SEQ_LEVEL_7_0,
+ SEQ_LEVEL_7_1,
+ SEQ_LEVEL_7_2,
+ SEQ_LEVEL_7_3,
+ SEQ_LEVELS,
+ SEQ_LEVEL_MAX = 31
+} UENUM1BYTE(AV1_LEVEL);
+
+#define LEVEL_BITS 5
#define DIRECTIONAL_MODES 8
#define MAX_ANGLE_DELTA 3
@@ -540,7 +548,7 @@ typedef enum ATTRIBUTE_PACKED {
typedef uint8_t TXFM_CONTEXT;
// An enum for single reference types (and some derived values).
-enum ATTRIBUTE_PACKED {
+enum {
NONE_FRAME = -1,
INTRA_FRAME,
LAST_FRAME,
@@ -572,14 +580,14 @@ enum ATTRIBUTE_PACKED {
#define REF_FRAMES_LOG2 3
// REF_FRAMES for the cm->ref_frame_map array, 1 scratch frame for the new
-// frame in cm->new_fb_idx, INTER_REFS_PER_FRAME for scaled references on the
-// encoder in the cpi->scaled_ref_idx array.
+// frame in cm->cur_frame, INTER_REFS_PER_FRAME for scaled references on the
+// encoder in the cpi->scaled_ref_buf array.
#define FRAME_BUFFERS (REF_FRAMES + 1 + INTER_REFS_PER_FRAME)
#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
-typedef enum ATTRIBUTE_PACKED {
+enum {
LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME }
LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME }
LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME }
@@ -593,7 +601,7 @@ typedef enum ATTRIBUTE_PACKED {
// NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs
// that are explicitly signaled.
UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1,
-} UNIDIR_COMP_REF;
+} UENUM1BYTE(UNIDIR_COMP_REF);
#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS)
@@ -608,14 +616,14 @@ typedef enum ATTRIBUTE_PACKED {
// NONE_FRAME to (MODE_CTX_REF_FRAMES - 1). Hence, it is not defined as an enum.
typedef int8_t MV_REFERENCE_FRAME;
-typedef enum ATTRIBUTE_PACKED {
+enum {
RESTORE_NONE,
RESTORE_WIENER,
RESTORE_SGRPROJ,
RESTORE_SWITCHABLE,
RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
RESTORE_TYPES = 4,
-} RestorationType;
+} UENUM1BYTE(RestorationType);
#define SUPERRES_SCALE_BITS 3
#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
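
enums.h also replaces the old LEVEL_MAJOR/LEVEL_MINOR bit macros with an explicit AV1_LEVEL table. Each 5-bit level value (LEVEL_BITS) encodes major.minor as (major - 2) * 4 + minor, so SEQ_LEVEL_2_0 is 0 and SEQ_LEVEL_7_3 is 23, with 31 reserved as SEQ_LEVEL_MAX. A small helper built on that encoding — an illustration, not part of the patch:

    /* Maps a major.minor pair onto the AV1_LEVEL index defined above,
     * assuming the (major - 2) * 4 + minor layout of the enum. */
    static int seq_level_idx(int major, int minor) {
      return (major - 2) * 4 + minor; /* e.g. 4.1 -> SEQ_LEVEL_4_1 == 9 */
    }
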
diff --git a/libaom/av1/common/filter.h b/libaom/av1/common/filter.h
index d7ef5c9..184f5b2 100644
--- a/libaom/av1/common/filter.h
+++ b/libaom/av1/common/filter.h
@@ -37,12 +37,12 @@ typedef enum ATTRIBUTE_PACKED {
EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
} InterpFilter;
-typedef enum {
+enum {
USE_2_TAPS_ORIG = 0, // This is used in temporal filtering.
USE_2_TAPS,
USE_4_TAPS,
USE_8_TAPS,
-} SUBPEL_SEARCH_TYPE;
+} UENUM1BYTE(SUBPEL_SEARCH_TYPE);
// Pack two InterpFilter's into a uint32_t: since there are at most 10 filters,
// we can use 16 bits for each and have more than enough space. This reduces
diff --git a/libaom/av1/common/idct.c b/libaom/av1/common/idct.c
index 55925a5..bff438f 100644
--- a/libaom/av1/common/idct.c
+++ b/libaom/av1/common/idct.c
@@ -204,7 +204,7 @@ static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
txfm_param->eob = eob;
txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id];
txfm_param->bd = xd->bd;
- txfm_param->is_hbd = get_bitdepth_data_path_index(xd);
+ txfm_param->is_hbd = is_cur_buf_hbd(xd);
txfm_param->tx_set_type = av1_get_ext_tx_set_type(
txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
}
diff --git a/libaom/av1/common/mv.h b/libaom/av1/common/mv.h
index 5b02251..d097f9e 100644
--- a/libaom/av1/common/mv.h
+++ b/libaom/av1/common/mv.h
@@ -56,13 +56,13 @@ typedef struct mv32 {
#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
/* clang-format off */
-typedef enum ATTRIBUTE_PACKED {
+enum {
IDENTITY = 0, // identity transformation, 0-parameter
TRANSLATION = 1, // translational motion 2-parameter
ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter
AFFINE = 3, // affine, 6-parameter
TRANS_TYPES,
-} TransformationType;
+} UENUM1BYTE(TransformationType);
/* clang-format on */
// Number of types used for global motion (must be >= 3 and <= TRANS_TYPES)
@@ -87,18 +87,18 @@ static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
// z . y' = m4 m5 m1 * y
// 1] m6 m7 1) 1]
typedef struct {
- TransformationType wmtype;
int32_t wmmat[8];
int16_t alpha, beta, gamma, delta;
+ TransformationType wmtype;
int8_t invalid;
} WarpedMotionParams;
/* clang-format off */
static const WarpedMotionParams default_warp_params = {
- IDENTITY,
{ 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0,
0 },
0, 0, 0, 0,
+ IDENTITY,
0,
};
/* clang-format on */
@@ -263,7 +263,7 @@ static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
return res;
}
-static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) {
+static INLINE TransformationType get_wmtype(const WarpedMotionParams *gm) {
if (gm->wmmat[5] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[4] &&
gm->wmmat[2] == (1 << WARPEDMODEL_PREC_BITS) && !gm->wmmat[3]) {
return ((!gm->wmmat[1] && !gm->wmmat[0]) ? IDENTITY : TRANSLATION);
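
In mv.h the WarpedMotionParams members are reordered (again for packing), which forces the positional initializer of default_warp_params to be reordered in lockstep; the get_gmtype -> get_wmtype rename presumably reflects that the helper classifies any warped-motion model, not only global motion. Positional initializers are exactly the hazard that C99 designated initializers avoid; a generic sketch with hypothetical names:

    struct params { int wide; short mid; signed char kind; };

    /* Positional: entries must track the member order exactly. */
    static const struct params defaults_positional = { 1024, 0, 0 };

    /* Designated: immune to member reordering. */
    static const struct params defaults_designated = {
      .kind = 0, .wide = 1024, .mid = 0,
    };
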
diff --git a/libaom/av1/common/mvref_common.c b/libaom/av1/common/mvref_common.c
index b3d9c2f..e38891f 100644
--- a/libaom/av1/common/mvref_common.c
+++ b/libaom/av1/common/mvref_common.c
@@ -347,8 +347,7 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
if (rf[1] == NONE_FRAME) {
int cur_frame_index = cm->cur_frame->order_hint;
- const RefCntBuffer *const buf_0 =
- cm->current_frame.frame_refs[FWD_RF_OFFSET(rf[0])].buf;
+ const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
int frame0_index = buf_0->order_hint;
int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info,
cur_frame_index, frame0_index);
@@ -383,14 +382,12 @@ static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
} else {
// Process compound inter mode
int cur_frame_index = cm->cur_frame->order_hint;
- const RefCntBuffer *const buf_0 =
- cm->current_frame.frame_refs[FWD_RF_OFFSET(rf[0])].buf;
+ const RefCntBuffer *const buf_0 = get_ref_frame_buf(cm, rf[0]);
int frame0_index = buf_0->order_hint;
int cur_offset_0 = get_relative_dist(&cm->seq_params.order_hint_info,
cur_frame_index, frame0_index);
- const RefCntBuffer *const buf_1 =
- cm->current_frame.frame_refs[FWD_RF_OFFSET(rf[1])].buf;
+ const RefCntBuffer *const buf_1 = get_ref_frame_buf(cm, rf[1]);
int frame1_index = buf_1->order_hint;
int cur_offset_1 = get_relative_dist(&cm->seq_params.order_hint_info,
cur_frame_index, frame1_index);
@@ -824,7 +821,7 @@ void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
MV_REFERENCE_FRAME rf[2];
av1_set_ref_frame(rf, ref_frame);
- if (ref_frame < REF_FRAMES) {
+ if (global_mvs != NULL && ref_frame < REF_FRAMES) {
if (ref_frame != INTRA_FRAME) {
global_mvs[ref_frame] = gm_get_motion_vector(
&cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize,
@@ -871,8 +868,7 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
MV_REFERENCE_FRAME ref_frame;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- const RefCntBuffer *const buf =
- cm->current_frame.frame_refs[ref_frame - LAST_FRAME].buf;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
if (buf != NULL)
cm->cur_frame->ref_order_hints[ref_frame - LAST_FRAME] = buf->order_hint;
}
@@ -881,8 +877,7 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
void av1_setup_frame_sign_bias(AV1_COMMON *cm) {
MV_REFERENCE_FRAME ref_frame;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- const RefCntBuffer *const buf =
- cm->current_frame.frame_refs[ref_frame - LAST_FRAME].buf;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
if (cm->seq_params.order_hint_info.enable_order_hint && buf != NULL) {
const int ref_order_hint = buf->order_hint;
cm->ref_frame_sign_bias[ref_frame] =
@@ -942,13 +937,13 @@ static int motion_field_projection(AV1_COMMON *cm,
TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
int ref_offset[REF_FRAMES] = { 0 };
- (void)dir;
-
const RefCntBuffer *const start_frame_buf =
- cm->current_frame.frame_refs[FWD_RF_OFFSET(start_frame)].buf;
+ get_ref_frame_buf(cm, start_frame);
if (start_frame_buf == NULL) return 0;
- if (start_frame_buf->intra_only) return 0;
+ if (start_frame_buf->frame_type == KEY_FRAME ||
+ start_frame_buf->frame_type == INTRA_ONLY_FRAME)
+ return 0;
if (start_frame_buf->mi_rows != cm->mi_rows ||
start_frame_buf->mi_cols != cm->mi_cols)
@@ -1029,7 +1024,7 @@ void av1_setup_motion_field(AV1_COMMON *cm) {
for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
const int ref_idx = ref_frame - LAST_FRAME;
- const RefCntBuffer *const buf = cm->current_frame.frame_refs[ref_idx].buf;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
int order_hint = 0;
if (buf != NULL) order_hint = buf->order_hint;
@@ -1074,8 +1069,7 @@ void av1_setup_motion_field(AV1_COMMON *cm) {
ref_stamp >= 0)
if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp;
- if (ref_stamp >= 0 && ref_buf[LAST2_FRAME - LAST_FRAME] != NULL)
- if (motion_field_projection(cm, LAST2_FRAME, 2)) --ref_stamp;
+ if (ref_stamp >= 0) motion_field_projection(cm, LAST2_FRAME, 2);
}
static INLINE void record_samples(MB_MODE_INFO *mbmi, int *pts, int *pts_inref,
@@ -1293,7 +1287,7 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
// Identify the nearest forward and backward references.
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- const RefCntBuffer *const buf = cm->current_frame.frame_refs[i].buf;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
if (buf == NULL) continue;
const int ref_order_hint = buf->order_hint;
@@ -1328,7 +1322,7 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
// Identify the second nearest forward reference.
ref_order_hints[1] = -1;
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- const RefCntBuffer *const buf = cm->current_frame.frame_refs[i].buf;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, LAST_FRAME + i);
if (buf == NULL) continue;
const int ref_order_hint = buf->order_hint;
@@ -1352,38 +1346,31 @@ void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
}
typedef struct {
- int map_idx; // frame map index
- int buf_idx; // frame buffer index
- int sort_idx; // index based on the offset to be used for sorting
+ int map_idx; // frame map index
+ RefCntBuffer *buf; // frame buffer
+ int sort_idx; // index based on the offset to be used for sorting
} REF_FRAME_INFO;
+// Compares the sort_idx fields. If they are equal, then compares the map_idx
+// fields to break the tie. This ensures a stable sort.
static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a;
const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b;
- if (info_a->sort_idx < info_b->sort_idx) return -1;
- if (info_a->sort_idx > info_b->sort_idx) return 1;
- return (info_a->map_idx < info_b->map_idx)
- ? -1
- : ((info_a->map_idx > info_b->map_idx) ? 1 : 0);
+ const int sort_idx_diff = info_a->sort_idx - info_b->sort_idx;
+ if (sort_idx_diff != 0) return sort_idx_diff;
+ return info_a->map_idx - info_b->map_idx;
}
-static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx,
+static void set_ref_frame_info(int *remapped_ref_idx, int frame_idx,
REF_FRAME_INFO *ref_info) {
assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
- const int buf_idx = ref_info->buf_idx;
-
- cm->current_frame.frame_refs[frame_idx].buf =
- &cm->buffer_pool->frame_bufs[buf_idx];
- cm->current_frame.frame_refs[frame_idx].map_idx = ref_info->map_idx;
+ remapped_ref_idx[frame_idx] = ref_info->map_idx;
}
-void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
- int gld_map_idx) {
- BufferPool *const pool = cm->buffer_pool;
- RefCntBuffer *const frame_bufs = pool->frame_bufs;
-
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+ int lst_map_idx, int gld_map_idx) {
int lst_frame_sort_idx = -1;
int gld_frame_sort_idx = -1;
@@ -1402,15 +1389,14 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
ref_frame_info[i].map_idx = map_idx;
ref_frame_info[i].sort_idx = -1;
- const int buf_idx = cm->ref_frame_map[map_idx];
- ref_frame_info[i].buf_idx = buf_idx;
+ RefCntBuffer *const buf = cm->ref_frame_map[map_idx];
+ ref_frame_info[i].buf = buf;
- assert(buf_idx < FRAME_BUFFERS);
- if (buf_idx < 0) continue;
- // TODO(zoeliu@google.com): To verify the checking on ref_count.
- if (frame_bufs[buf_idx].ref_count <= 0) continue;
+ if (buf == NULL) continue;
+ // If this assertion fails, there is a reference leak.
+ assert(buf->ref_count > 0);
- const int offset = (int)frame_bufs[buf_idx].order_hint;
+ const int offset = (int)buf->order_hint;
ref_frame_info[i].sort_idx =
(offset == -1) ? -1
: cur_frame_sort_idx +
@@ -1461,7 +1447,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
// == ALTREF_FRAME ==
if (bwd_start_idx <= bwd_end_idx) {
- set_ref_frame_info(cm, ALTREF_FRAME - LAST_FRAME,
+ set_ref_frame_info(remapped_ref_idx, ALTREF_FRAME - LAST_FRAME,
&ref_frame_info[bwd_end_idx]);
ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1;
bwd_end_idx--;
@@ -1469,7 +1455,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
// == BWDREF_FRAME ==
if (bwd_start_idx <= bwd_end_idx) {
- set_ref_frame_info(cm, BWDREF_FRAME - LAST_FRAME,
+ set_ref_frame_info(remapped_ref_idx, BWDREF_FRAME - LAST_FRAME,
&ref_frame_info[bwd_start_idx]);
ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1;
bwd_start_idx++;
@@ -1477,7 +1463,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
// == ALTREF2_FRAME ==
if (bwd_start_idx <= bwd_end_idx) {
- set_ref_frame_info(cm, ALTREF2_FRAME - LAST_FRAME,
+ set_ref_frame_info(remapped_ref_idx, ALTREF2_FRAME - LAST_FRAME,
&ref_frame_info[bwd_start_idx]);
ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1;
}
@@ -1487,13 +1473,15 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) {
// == LAST_FRAME ==
if (ref_frame_info[i].map_idx == lst_map_idx) {
- set_ref_frame_info(cm, LAST_FRAME - LAST_FRAME, &ref_frame_info[i]);
+ set_ref_frame_info(remapped_ref_idx, LAST_FRAME - LAST_FRAME,
+ &ref_frame_info[i]);
ref_flag_list[LAST_FRAME - LAST_FRAME] = 1;
}
// == GOLDEN_FRAME ==
if (ref_frame_info[i].map_idx == gld_map_idx) {
- set_ref_frame_info(cm, GOLDEN_FRAME - LAST_FRAME, &ref_frame_info[i]);
+ set_ref_frame_info(remapped_ref_idx, GOLDEN_FRAME - LAST_FRAME,
+ &ref_frame_info[i]);
ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1;
}
}
@@ -1525,7 +1513,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
}
if (fwd_start_idx > fwd_end_idx) break;
- set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+ set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
&ref_frame_info[fwd_end_idx]);
ref_flag_list[ref_frame - LAST_FRAME] = 1;
@@ -1536,7 +1524,7 @@ void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
- set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+ set_ref_frame_info(remapped_ref_idx, ref_frame - LAST_FRAME,
&ref_frame_info[fwd_start_idx]);
ref_flag_list[ref_frame - LAST_FRAME] = 1;
}
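
The rewritten compare_ref_frame_info is both simpler and documented: subtraction replaces the three-way comparisons (safe here, since sort_idx and map_idx hold small non-negative values), and the map_idx tiebreak makes the ordering total, so qsort — which is not a stable sort — still produces a deterministic result. The same shape in isolation:

    #include <stdlib.h>

    typedef struct {
      int map_idx;  /* unique per entry */
      int sort_idx; /* primary key; small, non-negative */
    } item;

    static int cmp_item(const void *a, const void *b) {
      const item *ia = (const item *)a;
      const item *ib = (const item *)b;
      const int d = ia->sort_idx - ib->sort_idx; /* no overflow: small keys */
      if (d != 0) return d;
      return ia->map_idx - ib->map_idx; /* tiebreak => deterministic order */
    }

    /* usage: qsort(items, n, sizeof(item), cmp_item); */
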
diff --git a/libaom/av1/common/mvref_common.h b/libaom/av1/common/mvref_common.h
index 2dbd12c..0aa9d38 100644
--- a/libaom/av1/common/mvref_common.h
+++ b/libaom/av1/common/mvref_common.h
@@ -70,18 +70,6 @@ static INLINE int_mv get_sub_block_pred_mv(const MB_MODE_INFO *candidate,
return candidate->mv[which_mv];
}
-// Performs mv sign inversion if indicated by the reference frame combination.
-static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
- const MV_REFERENCE_FRAME this_ref_frame,
- const int *ref_sign_bias) {
- int_mv mv = mbmi->mv[ref];
- if (ref_sign_bias[mbmi->ref_frame[ref]] != ref_sign_bias[this_ref_frame]) {
- mv.as_mv.row *= -1;
- mv.as_mv.col *= -1;
- }
- return mv;
-}
-
// Checks that the given mi_row, mi_col and search point
// are inside the borders of the tile.
static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
@@ -222,7 +210,8 @@ void av1_setup_frame_buf_refs(AV1_COMMON *cm);
void av1_setup_frame_sign_bias(AV1_COMMON *cm);
void av1_setup_skip_mode_allowed(AV1_COMMON *cm);
void av1_setup_motion_field(AV1_COMMON *cm);
-void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, int gld_map_idx);
+void av1_set_frame_refs(AV1_COMMON *const cm, int *remapped_ref_idx,
+ int lst_map_idx, int gld_map_idx);
static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
av1_zero(xd->neighbors_ref_counts);
@@ -255,6 +244,9 @@ void av1_copy_frame_mvs(const AV1_COMMON *const cm,
const MB_MODE_INFO *const mi, int mi_row, int mi_col,
int x_mis, int y_mis);
+// The global_mvs output parameter points to an array of REF_FRAMES elements.
+// The caller may pass a null global_mvs if it does not need the global_mvs
+// output.
void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
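Per the comment above, a caller that does not need the global motion vectors may simply pass NULL for the global_mvs argument; the rest of the av1_find_mv_refs() parameter list is unchanged by this patch.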
diff --git a/libaom/av1/common/onyxc_int.h b/libaom/av1/common/onyxc_int.h
index 117afb6..8117dfc 100644
--- a/libaom/av1/common/onyxc_int.h
+++ b/libaom/av1/common/onyxc_int.h
@@ -79,14 +79,14 @@ extern "C" {
#define TXCOEFF_TIMER 0
#define TXCOEFF_COST_TIMER 0
-typedef enum {
+enum {
SINGLE_REFERENCE = 0,
COMPOUND_REFERENCE = 1,
REFERENCE_MODE_SELECT = 2,
REFERENCE_MODES = 3,
-} REFERENCE_MODE;
+} UENUM1BYTE(REFERENCE_MODE);
-typedef enum {
+enum {
/**
* Frame context updates are disabled
*/
@@ -96,7 +96,7 @@ typedef enum {
* updates based on entropy/counts in the decoded frame
*/
REFRESH_FRAME_CONTEXT_BACKWARD,
-} REFRESH_FRAME_CONTEXT_MODE;
+} UENUM1BYTE(REFRESH_FRAME_CONTEXT_MODE);
#define MFMV_STACK_SIZE 3
typedef struct {
@@ -109,24 +109,12 @@ typedef struct {
MV_REFERENCE_FRAME ref_frame;
} MV_REF;
-// FIXME(jack.haughton@argondesign.com): This enum was originally in
-// encoder/ratectrl.h, and is encoder specific. When we move to C++, this
-// should go back there and BufferPool should be templatized.
-typedef enum {
- INTER_NORMAL = 0,
- INTER_LOW = 1,
- INTER_HIGH = 2,
- GF_ARF_LOW = 3,
- GF_ARF_STD = 4,
- KF_STD = 5,
- RATE_FACTOR_LEVELS = 6
-} RATE_FACTOR_LEVEL;
typedef struct RefCntBuffer {
// For a RefCntBuffer, the following are reference-holding variables:
// - cm->ref_frame_map[]
- // - cm->new_fb_idx
- // - cm->scaled_ref_idx[] (encoder only)
+ // - cm->cur_frame
+ // - cm->scaled_ref_buf[] (encoder only)
// - cm->next_ref_frame_map[] (decoder only)
// - pbi->output_frame_index[] (decoder only)
// With that definition, 'ref_count' is the number of reference-holding
@@ -136,8 +124,6 @@ typedef struct RefCntBuffer {
// - Total 'n' of the variables / array elements above have value 'k' (that
// is, they are pointing to buffer at index 'k').
// Then, pool->frame_bufs[k].ref_count = n.
- // TODO(david.turner@argondesign.com) Check whether this helpful comment is
- // still correct after we finish restructuring
int ref_count;
unsigned int order_hint;
@@ -154,14 +140,17 @@ typedef struct RefCntBuffer {
int height;
WarpedMotionParams global_motion[REF_FRAMES];
int showable_frame; // frame can be used as show existing frame in future
- int film_grain_params_present;
+ uint8_t film_grain_params_present;
aom_film_grain_t film_grain_params;
aom_codec_frame_buffer_t raw_frame_buffer;
YV12_BUFFER_CONFIG buf;
hash_table hash_table;
- uint8_t intra_only;
FRAME_TYPE frame_type;
+ // This is only used in the encoder but needs to be indexed per reference
+ // frame, so it is convenient to keep it here.
+ int interp_filter_selected[SWITCHABLE];
+
// Inter frame reference frame delta for loop filter
int8_t ref_deltas[REF_FRAMES];
@@ -169,7 +158,6 @@ typedef struct RefCntBuffer {
int8_t mode_deltas[MAX_MODE_LF_DELTAS];
FRAME_CONTEXT frame_context;
- RATE_FACTOR_LEVEL frame_rf_level;
} RefCntBuffer;
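As a worked instance of the ref_count invariant described in the struct comment above: if cm->cur_frame and two slots of cm->ref_frame_map[] all point at pool->frame_bufs[k], and no other reference-holding variable does, the invariant requires pool->frame_bufs[k].ref_count == 3.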
typedef struct BufferPool {
@@ -195,18 +183,6 @@ typedef struct BufferPool {
} BufferPool;
typedef struct {
- int base_ctx_table[2 /*row*/][2 /*col*/][3 /*sig_map*/]
- [BASE_CONTEXT_POSITION_NUM + 1];
-} LV_MAP_CTX_TABLE;
-typedef int BASE_CTX_TABLE[2 /*col*/][3 /*sig_map*/]
- [BASE_CONTEXT_POSITION_NUM + 1];
-
-typedef struct BitstreamLevel {
- uint8_t major;
- uint8_t minor;
-} BitstreamLevel;
-
-typedef struct {
int cdef_pri_damping;
int cdef_sec_damping;
int nb_cdef_strengths;
@@ -230,11 +206,11 @@ typedef struct {
typedef struct {
int enable_order_hint; // 0 - disable order hint, and related tools
- int order_hint_bits_minus_1;
- // jnt_comp, ref_frame_mvs, frame_sign_bias
- // if 0, enable_jnt_comp and
- // enable_ref_frame_mvs must be set zs 0.
- int enable_jnt_comp; // 0 - disable joint compound modes
+ int order_hint_bits_minus_1; // dist_wtd_comp, ref_frame_mvs,
+ // frame_sign_bias
+ // if 0, enable_dist_wtd_comp and
+ // enable_ref_frame_mvs must be set to 0.
+ int enable_dist_wtd_comp; // 0 - disable dist-wtd compound modes
// 1 - enable it
int enable_ref_frame_mvs; // 0 - disable ref frame mvs
// 1 - enable it
@@ -249,7 +225,7 @@ typedef struct SequenceHeader {
int num_bits_height;
int max_frame_width;
int max_frame_height;
- int frame_id_numbers_present_flag;
+ uint8_t frame_id_numbers_present_flag;
int frame_id_length;
int delta_frame_id_length;
BLOCK_SIZE sb_size; // Size of the superblock used for this frame
@@ -258,45 +234,44 @@ typedef struct SequenceHeader {
OrderHintInfo order_hint_info;
- int force_screen_content_tools; // 0 - force off
- // 1 - force on
- // 2 - adaptive
- int force_integer_mv; // 0 - Not to force. MV can be in 1/4 or 1/8
- // 1 - force to integer
- // 2 - adaptive
- int still_picture; // Video is a single frame still picture
- int reduced_still_picture_hdr; // Use reduced header for still picture
- int enable_filter_intra; // enables/disables filterintra
- int enable_intra_edge_filter; // enables/disables corner/edge/upsampling
- int enable_interintra_compound; // enables/disables interintra_compound
- int enable_masked_compound; // enables/disables masked compound
- int enable_dual_filter; // 0 - disable dual interpolation filter
- // 1 - enable vert/horiz filter selection
- int enable_warped_motion; // 0 - disable warped motion for sequence
- // 1 - enable it for the sequence
- int enable_superres; // 0 - Disable superres for the sequence, and disable
- // transmitting per-frame superres enabled flag.
- // 1 - Enable superres for the sequence, and also
- // enable per-frame flag to denote if superres is
- // enabled for that frame.
- int enable_cdef; // To turn on/off CDEF
- int enable_restoration; // To turn on/off loop restoration
+ uint8_t force_screen_content_tools; // 0 - force off
+ // 1 - force on
+ // 2 - adaptive
+ uint8_t still_picture; // Video is a single frame still picture
+ uint8_t reduced_still_picture_hdr; // Use reduced header for still picture
+ uint8_t force_integer_mv; // 0 - Don't force. MV can use subpel
+ // 1 - force to integer
+ // 2 - adaptive
+ uint8_t enable_filter_intra; // enables/disables filterintra
+ uint8_t enable_intra_edge_filter; // enables/disables edge upsampling
+ uint8_t enable_interintra_compound; // enables/disables interintra_compound
+ uint8_t enable_masked_compound; // enables/disables masked compound
+ uint8_t enable_dual_filter; // 0 - disable dual interpolation filter
+ // 1 - enable vert/horz filter selection
+ uint8_t enable_warped_motion; // 0 - disable warp for the sequence
+ // 1 - enable warp for the sequence
+ uint8_t enable_superres; // 0 - Disable superres for the sequence
+ // and no frame level superres flag
+ // 1 - Enable superres for the sequence
+ // enable per-frame superres flag
+ uint8_t enable_cdef; // To turn on/off CDEF
+ uint8_t enable_restoration; // To turn on/off loop restoration
BITSTREAM_PROFILE profile;
// Operating point info.
int operating_points_cnt_minus_1;
int operating_point_idc[MAX_NUM_OPERATING_POINTS];
- int display_model_info_present_flag;
- int decoder_model_info_present_flag;
- BitstreamLevel level[MAX_NUM_OPERATING_POINTS];
+ uint8_t display_model_info_present_flag;
+ uint8_t decoder_model_info_present_flag;
+ AV1_LEVEL seq_level_idx[MAX_NUM_OPERATING_POINTS];
uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in the spec. One bit: 0
// or 1.
// Color config.
aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1,
// AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
- int use_highbitdepth; // If true, we need to use 16bit frame buffers.
- int monochrome; // Monochorme video
+ uint8_t use_highbitdepth; // If true, we need to use 16bit frame buffers.
+ uint8_t monochrome; // Monochrome video
aom_color_primaries_t color_primaries;
aom_transfer_characteristics_t transfer_characteristics;
aom_matrix_coefficients_t matrix_coefficients;
@@ -304,9 +279,8 @@ typedef struct SequenceHeader {
int subsampling_x; // Chroma subsampling for x
int subsampling_y; // Chroma subsampling for y
aom_chroma_sample_position_t chroma_sample_position;
- int separate_uv_delta_q;
-
- int film_grain_params_present;
+ uint8_t separate_uv_delta_q;
+ uint8_t film_grain_params_present;
} SequenceHeader;
typedef struct {
@@ -318,16 +292,13 @@ typedef struct {
typedef struct {
FRAME_TYPE frame_type;
- // Flag signaling that the frame is encoded using only INTRA modes.
- uint8_t intra_only;
REFERENCE_MODE reference_mode;
unsigned int order_hint;
unsigned int frame_number;
SkipModeInfo skip_mode_info;
- // Each Inter frame can reference INTER_REFS_PER_FRAME buffers. This maps each
- // (inter) reference frame type to the corresponding reference buffer.
- RefBuffer frame_refs[INTER_REFS_PER_FRAME];
+ int refresh_frame_flags; // Which ref frames are overwritten by this frame
+ int frame_refs_short_signaling;
} CurrentFrame;
typedef struct AV1Common {
@@ -337,8 +308,6 @@ typedef struct AV1Common {
int height;
int render_width;
int render_height;
- int last_width;
- int last_height;
int timing_info_present;
aom_timing_info_t timing_info;
int buffer_removal_time_present;
@@ -347,49 +316,59 @@ typedef struct AV1Common {
aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
uint32_t frame_presentation_time;
- int largest_tile_id;
- size_t largest_tile_size;
int context_update_tile_id;
// Scale of the current frame with respect to itself.
struct scale_factors sf_identity;
- YV12_BUFFER_CONFIG *frame_to_show;
RefCntBuffer *prev_frame;
// TODO(hkuang): Combine this with cur_buf in macroblockd.
RefCntBuffer *cur_frame;
- // For decoder, ref_frame_map[i] maps reference type 'i' to actual index of
- // the buffer in the buffer pool ‘cm->buffer_pool.frame_bufs’.
+ // For encoder, we have a two-level mapping from reference frame type to the
+ // corresponding buffer in the buffer pool:
+ // * 'remapped_ref_idx[i - 1]' maps reference type 'i' (range: LAST_FRAME ...
+ // EXTREF_FRAME) to a remapped index 'j' (in range: 0 ... REF_FRAMES - 1)
+ // * Later, 'cm->ref_frame_map[j]' maps the remapped index 'j' to a pointer to
+ // the reference counted buffer structure RefCntBuffer, taken from the buffer
+ // pool cm->buffer_pool->frame_bufs.
+ //
+ // LAST_FRAME, ..., EXTREF_FRAME
+ // | |
+ // v v
+ // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1]
+ // | |
+ // v v
+ // ref_frame_map[], ..., ref_frame_map[]
+ //
+ // Note: INTRA_FRAME always refers to the current frame, so there is no
+ // need to have a remapped index for it.
+ int remapped_ref_idx[REF_FRAMES];
+
+ struct scale_factors ref_scale_factors[REF_FRAMES];
+
+ // For decoder, ref_frame_map[i] maps reference type 'i' to a pointer to
+ // the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
// For encoder, ref_frame_map[j] (where j = remapped_ref_idx[i]) maps
// remapped reference index 'j' (that is, original reference type 'i') to
- // actual index of the buffer in the buffer pool ‘cm->buffer_pool.frame_bufs’.
- int ref_frame_map[REF_FRAMES];
+ // a pointer to the buffer in the buffer pool 'cm->buffer_pool.frame_bufs'.
+ RefCntBuffer *ref_frame_map[REF_FRAMES];
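To make the two-level encoder-side mapping above concrete, here is a minimal C lookup sketch. It only restates the comment; the INVALID_IDX guard is assumed to behave like the get_ref_frame_map_idx() helper added later in this header.

    // Sketch: resolve an encoder-side reference frame type to its buffer.
    // Assumes LAST_FRAME <= ref_frame <= EXTREF_FRAME, per the comment above.
    static INLINE RefCntBuffer *lookup_ref_buf_sketch(
        const AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame) {
      // Level 1: reference frame type -> remapped index in [0, REF_FRAMES).
      const int j = cm->remapped_ref_idx[ref_frame - LAST_FRAME];
      // Level 2: remapped index -> ref-counted buffer from the buffer pool.
      return (j != INVALID_IDX) ? cm->ref_frame_map[j] : NULL;
    }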
// Prepare ref_frame_map for the next frame.
// Only used in frame parallel decode.
- int next_ref_frame_map[REF_FRAMES];
-
- // Index to the 'new' frame (i.e. the frame currently being encoded or
- // decoded) in the buffer pool 'cm->buffer_pool'.
- int new_fb_idx;
-
+ RefCntBuffer *next_ref_frame_map[REF_FRAMES];
FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
int show_frame;
int showable_frame; // frame can be used as show existing frame in future
int show_existing_frame;
- // Flag for a frame used as a reference - not written to the bitstream
- int is_reference_frame;
- int reset_decoder_state;
- uint8_t last_intra_only;
uint8_t disable_cdf_update;
int allow_high_precision_mv;
- int cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer
+ uint8_t cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer
- int allow_screen_content_tools;
+ uint8_t allow_screen_content_tools;
int allow_intrabc;
int allow_warped_motion;
@@ -437,6 +416,7 @@ typedef struct AV1Common {
int qm_v;
int min_qmlevel;
int max_qmlevel;
+ int use_quant_b_adapt;
/* We allocate a MB_MODE_INFO struct for each macroblock, together with
an extra row on top and column on the left to simplify prediction. */
@@ -465,8 +445,6 @@ typedef struct AV1Common {
int allow_ref_frame_mvs;
uint8_t *last_frame_seg_map;
- uint8_t *current_frame_seg_map;
- int seg_map_alloc_size;
InterpFilter interp_filter;
@@ -505,17 +483,11 @@ typedef struct AV1Common {
FRAME_CONTEXT *fc; /* this frame entropy */
FRAME_CONTEXT *default_frame_context;
- unsigned int frame_context_idx; /* Context to use/update */
- int fb_of_context_type[REF_FRAMES];
int primary_ref_frame;
- aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer
-
int error_resilient_mode;
- int force_primary_ref_none;
int tile_cols, tile_rows;
- int last_tile_cols, last_tile_rows;
int max_tile_width_sb;
int min_log2_tile_cols;
@@ -530,6 +502,7 @@ typedef struct AV1Common {
int tile_col_start_sb[MAX_TILE_COLS + 1]; // valid for 0 <= i <= tile_cols
int tile_row_start_sb[MAX_TILE_ROWS + 1]; // valid for 0 <= i <= tile_rows
int tile_width, tile_height; // In MI units
+ int min_inner_tile_width; // min width of non-rightmost tile
unsigned int large_scale_tile;
unsigned int single_tile_decoding;
@@ -555,8 +528,6 @@ typedef struct AV1Common {
int current_frame_id;
int ref_frame_id[REF_FRAMES];
int valid_for_referencing[REF_FRAMES];
- int invalid_delta_frame_id_minus_1;
- LV_MAP_CTX_TABLE coeff_ctx_table;
TPL_MV_REF *tpl_mvs;
int tpl_mvs_mem_size;
// TODO(jingning): This can be combined with sign_bias later.
@@ -564,7 +535,6 @@ typedef struct AV1Common {
int is_annexb;
- int frame_refs_short_signaling;
int temporal_layer_id;
int spatial_layer_id;
unsigned int number_temporal_layers;
@@ -608,9 +578,8 @@ static void unlock_buffer_pool(BufferPool *const pool) {
static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) {
if (index < 0 || index >= REF_FRAMES) return NULL;
- if (cm->ref_frame_map[index] < 0) return NULL;
- assert(cm->ref_frame_map[index] < FRAME_BUFFERS);
- return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
+ if (cm->ref_frame_map[index] == NULL) return NULL;
+ return &cm->ref_frame_map[index]->buf;
}
static INLINE int get_free_fb(AV1_COMMON *cm) {
@@ -646,38 +615,83 @@ static INLINE int get_free_fb(AV1_COMMON *cm) {
return i;
}
-// Modify 'idx_ptr' to reference the buffer at 'new_idx', and update the ref
+static INLINE RefCntBuffer *assign_cur_frame_new_fb(AV1_COMMON *const cm) {
+ // Release the previously-used frame-buffer
+ if (cm->cur_frame != NULL) {
+ --cm->cur_frame->ref_count;
+ cm->cur_frame = NULL;
+ }
+
+ // Assign a new framebuffer
+ const int new_fb_idx = get_free_fb(cm);
+ if (new_fb_idx == INVALID_IDX) return NULL;
+
+ cm->cur_frame = &cm->buffer_pool->frame_bufs[new_fb_idx];
+ cm->cur_frame->buf.buf_8bit_valid = 0;
+ av1_zero(cm->cur_frame->interp_filter_selected);
+ return cm->cur_frame;
+}
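A hedged usage sketch for assign_cur_frame_new_fb(); the error message is illustrative, but the NULL check matches the function's no-free-buffer return path above.

    // Acquire a fresh frame buffer for the frame about to be coded.
    RefCntBuffer *const new_fb = assign_cur_frame_new_fb(cm);
    if (new_fb == NULL) {
      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
                         "Unable to find a free frame buffer");
    }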
+
+// Modify 'lhs_ptr' to reference the buffer at 'rhs_ptr', and update the ref
// counts accordingly.
-static INLINE void assign_frame_buffer(RefCntBuffer *bufs, int *idx_ptr,
- int new_idx) {
- const int old_idx = *idx_ptr;
- if (old_idx >= 0) {
- assert(bufs[old_idx].ref_count > 0);
- // One less reference to the buffer at 'old_idx', so decrease ref count.
- --bufs[old_idx].ref_count;
+static INLINE void assign_frame_buffer_p(RefCntBuffer **lhs_ptr,
+ RefCntBuffer *rhs_ptr) {
+ RefCntBuffer *const old_ptr = *lhs_ptr;
+ if (old_ptr != NULL) {
+ assert(old_ptr->ref_count > 0);
+ // One less reference to the buffer at 'old_ptr', so decrease ref count.
+ --old_ptr->ref_count;
}
- *idx_ptr = new_idx;
- // One more reference to the buffer at 'new_idx', so increase ref count.
- ++bufs[new_idx].ref_count;
+ *lhs_ptr = rhs_ptr;
+ // One more reference to the buffer at 'rhs_ptr', so increase ref count.
+ ++rhs_ptr->ref_count;
}
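A short usage sketch for assign_frame_buffer_p(); the slot index j is hypothetical and stands for whichever remapped slot the caller wants to update.

    // Make ref_frame_map[j] reference the frame just coded: the old entry's
    // ref count is decremented (if non-NULL) and cur_frame's is incremented.
    assign_frame_buffer_p(&cm->ref_frame_map[j], cm->cur_frame);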
static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
return cm->current_frame.frame_type == KEY_FRAME ||
- cm->current_frame.intra_only;
+ cm->current_frame.frame_type == INTRA_ONLY_FRAME;
}
static INLINE int frame_is_sframe(const AV1_COMMON *cm) {
return cm->current_frame.frame_type == S_FRAME;
}
-static INLINE RefCntBuffer *get_prev_frame(const AV1_COMMON *const cm) {
- if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
- cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) {
- return NULL;
- } else {
- return cm->current_frame.frame_refs[cm->primary_ref_frame].buf;
- }
+// These functions take a reference frame label between LAST_FRAME and
+// EXTREF_FRAME inclusive. Note that this is different to the indexing
+// previously used by the frame_refs[] array.
+static INLINE int get_ref_frame_map_idx(const AV1_COMMON *const cm,
+ const MV_REFERENCE_FRAME ref_frame) {
+ return (ref_frame >= LAST_FRAME && ref_frame <= EXTREF_FRAME)
+ ? cm->remapped_ref_idx[ref_frame - LAST_FRAME]
+ : INVALID_IDX;
+}
+
+static INLINE RefCntBuffer *get_ref_frame_buf(
+ const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+ const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
+ return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
+}
+
+// Both const and non-const versions of this function are provided so that it
+// can be used with a const AV1_COMMON if needed.
+static INLINE const struct scale_factors *get_ref_scale_factors_const(
+ const AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+ const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
+ return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
+}
+
+static INLINE struct scale_factors *get_ref_scale_factors(
+ AV1_COMMON *const cm, const MV_REFERENCE_FRAME ref_frame) {
+ const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
+ return (map_idx != INVALID_IDX) ? &cm->ref_scale_factors[map_idx] : NULL;
+}
+
+static INLINE RefCntBuffer *get_primary_ref_frame_buf(
+ const AV1_COMMON *const cm) {
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE) return NULL;
+ const int map_idx = get_ref_frame_map_idx(cm, cm->primary_ref_frame + 1);
+ return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
}
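The accessors above replace direct frame_refs[] indexing. A typical call pattern, mirroring the reconinter.c changes later in this diff (validity checks remain the caller's responsibility):

    // Fetch the buffer and scale factors for one reference of a block.
    const RefCntBuffer *const buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
    const struct scale_factors *const sf =
        get_ref_scale_factors_const(cm, mbmi->ref_frame[0]);
    if (buf == NULL || !av1_is_valid_scale(sf)) {
      // Reference unavailable or has invalid dimensions; bail out here.
    }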
// Returns 1 if this frame might allow mvs from some reference frame.
@@ -1233,8 +1247,8 @@ static INLINE TX_SIZE get_tx_size(int width, int height) {
return TX_4X4;
}
-static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
- TXFM_CONTEXT *left_ctx,
+static INLINE int txfm_partition_context(const TXFM_CONTEXT *const above_ctx,
+ const TXFM_CONTEXT *const left_ctx,
BLOCK_SIZE bsize, TX_SIZE tx_size) {
const uint8_t txw = tx_size_wide[tx_size];
const uint8_t txh = tx_size_high[tx_size];
@@ -1358,17 +1372,8 @@ static INLINE int is_coded_lossless(const AV1_COMMON *cm,
return coded_lossless;
}
-static INLINE int is_valid_seq_level_idx(uint8_t seq_level_idx) {
- return seq_level_idx < 24 || seq_level_idx == 31;
-}
-
-static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) {
- assert(bl.major >= LEVEL_MAJOR_MIN && bl.major <= LEVEL_MAJOR_MAX);
- // Since bl.minor is unsigned a comparison will return a warning:
- // comparison is always true due to limited range of data type
- assert(LEVEL_MINOR_MIN == 0);
- assert(bl.minor <= LEVEL_MINOR_MAX);
- return ((bl.major - LEVEL_MAJOR_MIN) << LEVEL_MINOR_BITS) + bl.minor;
+static INLINE int is_valid_seq_level_idx(AV1_LEVEL seq_level_idx) {
+ return seq_level_idx < SEQ_LEVELS || seq_level_idx == SEQ_LEVEL_MAX;
}
#ifdef __cplusplus
diff --git a/libaom/av1/common/pred_common.h b/libaom/av1/common/pred_common.h
index f667057..d9b30a9 100644
--- a/libaom/av1/common/pred_common.h
+++ b/libaom/av1/common/pred_common.h
@@ -48,20 +48,24 @@ static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
int prev_l = -1; // left segment_id
int prev_u = -1; // top segment_id
if ((xd->up_available) && (xd->left_available)) {
- prev_ul = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
- mi_row - 1, mi_col - 1);
+ prev_ul = get_segment_id(cm, cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 1,
+ mi_col - 1);
}
if (xd->up_available) {
- prev_u = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
- mi_row - 1, mi_col - 0);
+ prev_u = get_segment_id(cm, cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 1,
+ mi_col - 0);
}
if (xd->left_available) {
- prev_l = get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
- mi_row - 0, mi_col - 1);
+ prev_l = get_segment_id(cm, cm->cur_frame->seg_map, BLOCK_4X4, mi_row - 0,
+ mi_col - 1);
}
+ // This property follows from the fact that get_segment_id() returns a
+ // nonnegative value. This allows us to test for all edge cases with a simple
+ // prev_ul < 0 check.
+ assert(IMPLIES(prev_ul >= 0, prev_u >= 0 && prev_l >= 0));
// Pick CDF index based on number of matching/out-of-bounds segment IDs.
- if (prev_ul < 0 || prev_u < 0 || prev_l < 0) /* Edge case */
+ if (prev_ul < 0) /* Edge cases */
*cdf_index = 0;
else if ((prev_ul == prev_u) && (prev_ul == prev_l))
*cdf_index = 2;
@@ -90,10 +94,8 @@ static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
static INLINE int get_comp_index_context(const AV1_COMMON *cm,
const MACROBLOCKD *xd) {
MB_MODE_INFO *mbmi = xd->mi[0];
- const RefCntBuffer *const bck_buf =
- cm->current_frame.frame_refs[mbmi->ref_frame[0] - LAST_FRAME].buf;
- const RefCntBuffer *const fwd_buf =
- cm->current_frame.frame_refs[mbmi->ref_frame[1] - LAST_FRAME].buf;
+ const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
+ const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
int bck_frame_index = 0, fwd_frame_index = 0;
int cur_frame_index = cm->cur_frame->order_hint;
diff --git a/libaom/av1/common/reconinter.c b/libaom/av1/common/reconinter.c
index f338e1b..ea351cf 100644
--- a/libaom/av1/common/reconinter.c
+++ b/libaom/av1/common/reconinter.c
@@ -84,12 +84,11 @@ void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
if (do_warp && xd->cur_frame_force_integer_mv == 0) {
const struct macroblockd_plane *const pd = &xd->plane[plane];
const struct buf_2d *const pre_buf = &pd->pre[ref];
- av1_warp_plane(&final_warp_params,
- xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+ av1_warp_plane(&final_warp_params, is_cur_buf_hbd(xd), xd->bd,
pre_buf->buf0, pre_buf->width, pre_buf->height,
pre_buf->stride, dst, p_col, p_row, w, h, dst_stride,
pd->subsampling_x, pd->subsampling_y, conv_params);
- } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ } else if (is_cur_buf_hbd(xd)) {
highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
w, h, conv_params, interp_filters, is_intrabc,
xd->bd);
@@ -568,14 +567,15 @@ static void build_masked_compound_no_round(
const int subh = (2 << mi_size_high_log2[sb_type]) == h;
const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
const uint8_t *mask = av1_get_compound_type_mask(comp_data, sb_type);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ if (is_cur_buf_hbd(xd)) {
aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, block_size_wide[sb_type],
w, h, subw, subh, conv_params, xd->bd);
- else
+ } else {
aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
src1_stride, mask, block_size_wide[sb_type], w,
h, subw, subh, conv_params);
+ }
}
void av1_make_masked_inter_predictor(
@@ -626,20 +626,20 @@ void av1_make_masked_inter_predictor(
mi->sb_type, h, w, conv_params, xd);
}
-void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
- int order_idx, int *fwd_offset, int *bck_offset,
- int *use_jnt_comp_avg, int is_compound) {
+void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
+ const MB_MODE_INFO *mbmi, int order_idx,
+ int *fwd_offset, int *bck_offset,
+ int *use_dist_wtd_comp_avg,
+ int is_compound) {
assert(fwd_offset != NULL && bck_offset != NULL);
if (!is_compound || mbmi->compound_idx) {
- *use_jnt_comp_avg = 0;
+ *use_dist_wtd_comp_avg = 0;
return;
}
- *use_jnt_comp_avg = 1;
- const RefCntBuffer *const bck_buf =
- cm->current_frame.frame_refs[mbmi->ref_frame[0] - LAST_FRAME].buf;
- const RefCntBuffer *const fwd_buf =
- cm->current_frame.frame_refs[mbmi->ref_frame[1] - LAST_FRAME].buf;
+ *use_dist_wtd_comp_avg = 1;
+ const RefCntBuffer *const bck_buf = get_ref_frame_buf(cm, mbmi->ref_frame[0]);
+ const RefCntBuffer *const fwd_buf = get_ref_frame_buf(cm, mbmi->ref_frame[1]);
const int cur_frame_index = cm->cur_frame->order_hint;
int bck_frame_index = 0, fwd_frame_index = 0;
@@ -800,53 +800,6 @@ void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
return;
}
-struct obmc_check_mv_field_ctxt {
- MB_MODE_INFO *current_mi;
- int mv_field_check_result;
-};
-
-static INLINE void obmc_check_identical_mv(MACROBLOCKD *xd, int rel_mi_col,
- uint8_t nb_mi_width,
- MB_MODE_INFO *nb_mi, void *fun_ctxt,
- const int num_planes) {
- (void)xd;
- (void)rel_mi_col;
- (void)nb_mi_width;
- (void)num_planes;
- struct obmc_check_mv_field_ctxt *ctxt =
- (struct obmc_check_mv_field_ctxt *)fun_ctxt;
- const MB_MODE_INFO *current_mi = ctxt->current_mi;
-
- if (ctxt->mv_field_check_result == 0) return;
-
- if (nb_mi->ref_frame[0] != current_mi->ref_frame[0] ||
- nb_mi->mv[0].as_int != current_mi->mv[0].as_int ||
- nb_mi->interp_filters != current_mi->interp_filters) {
- ctxt->mv_field_check_result = 0;
- }
- return;
-}
-
-// Check if the neighbors' motions used by obmc have same parameters as for
-// the current block. If all the parameters are identical, obmc will produce
-// the same prediction as from regular bmc, therefore we can skip the
-// overlapping operations for less complexity. The parameters checked include
-// reference frame, motion vector, and interpolation filter.
-int av1_check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col) {
- const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
- struct obmc_check_mv_field_ctxt mv_field_check_ctxt = { xd->mi[0], 1 };
-
- foreach_overlappable_nb_above(cm, xd, mi_col,
- max_neighbor_obmc[mi_size_wide_log2[bsize]],
- obmc_check_identical_mv, &mv_field_check_ctxt);
- foreach_overlappable_nb_left(cm, xd, mi_row,
- max_neighbor_obmc[mi_size_high_log2[bsize]],
- obmc_check_identical_mv, &mv_field_check_ctxt);
-
- return mv_field_check_ctxt.mv_field_check_result;
-}
-
struct obmc_inter_pred_ctxt {
uint8_t **adjacent;
int *adjacent_stride;
@@ -860,7 +813,7 @@ static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col,
(void)above_mi;
struct obmc_inter_pred_ctxt *ctxt = (struct obmc_inter_pred_ctxt *)fun_ctxt;
const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
- const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ const int is_hbd = is_cur_buf_hbd(xd);
const int overlap =
AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
@@ -897,7 +850,7 @@ static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
const int overlap =
AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
- const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ const int is_hbd = is_cur_buf_hbd(xd);
for (int plane = 0; plane < num_planes; ++plane) {
const struct macroblockd_plane *pd = &xd->plane[plane];
@@ -968,15 +921,15 @@ void av1_setup_build_prediction_by_above_pred(
for (int ref = 0; ref < num_refs; ++ref) {
const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
- const RefBuffer *const ref_buf =
- &ctxt->cm->current_frame.frame_refs[frame - LAST_FRAME];
-
- xd->block_refs[ref] = ref_buf;
- if ((!av1_is_valid_scale(&ref_buf->sf)))
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(ctxt->cm, frame);
+ xd->block_ref_scale_factors[ref] = sf;
+ if ((!av1_is_valid_scale(sf)))
aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
"Reference frame has invalid dimensions");
- av1_setup_pre_planes(xd, ref, &ref_buf->buf->buf, ctxt->mi_row,
- above_mi_col, &ref_buf->sf, num_planes);
+ av1_setup_pre_planes(xd, ref, &ref_buf->buf, ctxt->mi_row, above_mi_col, sf,
+ num_planes);
}
xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
@@ -1006,15 +959,16 @@ void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
for (int ref = 0; ref < num_refs; ++ref) {
const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
- const RefBuffer *const ref_buf =
- &ctxt->cm->current_frame.frame_refs[frame - LAST_FRAME];
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(ctxt->cm, frame);
+ const struct scale_factors *const ref_scale_factors =
+ get_ref_scale_factors_const(ctxt->cm, frame);
- xd->block_refs[ref] = ref_buf;
- if ((!av1_is_valid_scale(&ref_buf->sf)))
+ xd->block_ref_scale_factors[ref] = ref_scale_factors;
+ if ((!av1_is_valid_scale(ref_scale_factors)))
aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
"Reference frame has invalid dimensions");
- av1_setup_pre_planes(xd, ref, &ref_buf->buf->buf, left_mi_row, ctxt->mi_col,
- &ref_buf->sf, num_planes);
+ av1_setup_pre_planes(xd, ref, &ref_buf->buf, left_mi_row, ctxt->mi_col,
+ ref_scale_factors, num_planes);
}
xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
@@ -1081,12 +1035,13 @@ static void build_smooth_interintra_mask(uint8_t *mask, int stride,
}
}
-static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
- int wedge_index, int wedge_sign,
- BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
- uint8_t *comppred, int compstride,
- const uint8_t *interpred, int interstride,
- const uint8_t *intrapred, int intrastride) {
+static void combine_interintra(INTERINTRA_MODE mode,
+ int8_t use_wedge_interintra, int wedge_index,
+ int wedge_sign, BLOCK_SIZE bsize,
+ BLOCK_SIZE plane_bsize, uint8_t *comppred,
+ int compstride, const uint8_t *interpred,
+ int interstride, const uint8_t *intrapred,
+ int intrastride) {
const int bw = block_size_wide[plane_bsize];
const int bh = block_size_high[plane_bsize];
@@ -1110,7 +1065,7 @@ static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
}
static void combine_interintra_highbd(
- INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index,
+ INTERINTRA_MODE mode, int8_t use_wedge_interintra, int wedge_index,
int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
uint8_t *comppred8, int compstride, const uint8_t *interpred8,
int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
@@ -1140,8 +1095,8 @@ static void combine_interintra_highbd(
void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
MACROBLOCKD *xd,
BLOCK_SIZE bsize, int plane,
- BUFFER_SET *ctx, uint8_t *dst,
- int dst_stride) {
+ const BUFFER_SET *ctx,
+ uint8_t *dst, int dst_stride) {
struct macroblockd_plane *const pd = &xd->plane[plane];
const int ssx = xd->plane[plane].subsampling_x;
const int ssy = xd->plane[plane].subsampling_y;
@@ -1164,7 +1119,7 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
const int ssx = xd->plane[plane].subsampling_x;
const int ssy = xd->plane[plane].subsampling_y;
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
combine_interintra_highbd(
xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
@@ -1183,9 +1138,9 @@ void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
// build interintra_predictors for one plane
void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
uint8_t *pred, int stride,
- BUFFER_SET *ctx, int plane,
+ const BUFFER_SET *ctx, int plane,
BLOCK_SIZE bsize) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
av1_build_intra_predictors_for_interintra(
cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
@@ -1204,7 +1159,8 @@ void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
uint8_t *upred, uint8_t *vpred,
int ustride, int vstride,
- BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+ const BUFFER_SET *ctx,
+ BLOCK_SIZE bsize) {
av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
}
diff --git a/libaom/av1/common/reconinter.h b/libaom/av1/common/reconinter.h
index b773679..9d562f9 100644
--- a/libaom/av1/common/reconinter.h
+++ b/libaom/av1/common/reconinter.h
@@ -47,7 +47,7 @@ extern "C" {
#define WEDGE_NONE -1
// Angles are with respect to horizontal anti-clockwise
-typedef enum {
+enum {
WEDGE_HORIZONTAL = 0,
WEDGE_VERTICAL = 1,
WEDGE_OBLIQUE27 = 2,
@@ -55,7 +55,7 @@ typedef enum {
WEDGE_OBLIQUE117 = 4,
WEDGE_OBLIQUE153 = 5,
WEDGE_DIRECTIONS
-} WedgeDirectionType;
+} UENUM1BYTE(WedgeDirectionType);
// 3-tuple: {direction, x_offset, y_offset}
typedef struct {
@@ -161,14 +161,13 @@ static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi);
int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
const struct macroblockd_plane *pd, int dir);
-int av1_check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col);
static INLINE int is_interinter_compound_used(COMPOUND_TYPE type,
BLOCK_SIZE sb_type) {
const int comp_allowed = is_comp_ref_allowed(sb_type);
switch (type) {
case COMPOUND_AVERAGE:
+ case COMPOUND_DISTWTD:
case COMPOUND_DIFFWTD: return comp_allowed;
case COMPOUND_WEDGE:
return comp_allowed && wedge_params_lookup[sb_type].bits > 0;
@@ -247,13 +246,14 @@ static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
return clamped_mv;
}
-static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
- const struct scale_factors *sf) {
+static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset,
+ int stride,
+ const struct scale_factors *sf) {
const int x =
sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS : x_offset;
const int y =
sf ? sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS : y_offset;
- return y * stride + x;
+ return (int64_t)y * stride + x;
}
static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize,
@@ -335,25 +335,28 @@ const uint8_t *av1_get_compound_type_mask(
// build interintra_predictors for one plane
void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
uint8_t *pred, int stride,
- BUFFER_SET *ctx, int plane,
+ const BUFFER_SET *ctx, int plane,
BLOCK_SIZE bsize);
void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
uint8_t *upred, uint8_t *vpred,
int ustride, int vstride,
- BUFFER_SET *ctx, BLOCK_SIZE bsize);
+ const BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
void av1_build_intra_predictors_for_interintra(
const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
- BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
+ const BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
const uint8_t *inter_pred, int inter_stride,
const uint8_t *intra_pred, int intra_stride);
-void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
- int order_idx, int *fwd_offset, int *bck_offset,
- int *use_jnt_comp_avg, int is_compound);
+void av1_dist_wtd_comp_weight_assign(const AV1_COMMON *cm,
+ const MB_MODE_INFO *mbmi, int order_idx,
+ int *fwd_offset, int *bck_offset,
+ int *use_dist_wtd_comp_avg,
+ int is_compound);
int av1_allow_warp(const MB_MODE_INFO *const mbmi,
const WarpTypesAllowed *const warp_types,
const WarpedMotionParams *const gm_params,
diff --git a/libaom/av1/common/reconintra.c b/libaom/av1/common/reconintra.c
index df69d6b..559e499 100644
--- a/libaom/av1/common/reconintra.c
+++ b/libaom/av1/common/reconintra.c
@@ -1510,7 +1510,7 @@ void av1_predict_intra_block(
xd->color_index_map_offset[plane != 0];
const uint16_t *const palette =
mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
for (r = 0; r < txhpx; ++r) {
for (c = 0; c < txwpx; ++c) {
@@ -1569,7 +1569,7 @@ void av1_predict_intra_block(
tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y);
const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
build_intra_predictors_high(
xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
filter_intra_mode, tx_size, disable_edge_filter,
diff --git a/libaom/av1/common/restoration.c b/libaom/av1/common/restoration.c
index c62862b..9e472b8 100644
--- a/libaom/av1/common/restoration.c
+++ b/libaom/av1/common/restoration.c
@@ -1099,7 +1099,7 @@ void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
const int frame_height = frame->crop_heights[0];
if (aom_realloc_frame_buffer(
lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
- seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS,
+ seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER,
cm->byte_alignment, NULL, NULL, NULL) < 0)
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate restoration dst buffer");
diff --git a/libaom/av1/common/restoration.h b/libaom/av1/common/restoration.h
index d834f92..6d6ba37 100644
--- a/libaom/av1/common/restoration.h
+++ b/libaom/av1/common/restoration.h
@@ -22,6 +22,8 @@
extern "C" {
#endif
+// Border for the loop restoration buffer
+#define AOM_RESTORATION_FRAME_BORDER 32
#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
diff --git a/libaom/av1/common/scale.c b/libaom/av1/common/scale.c
index c525fe2..bac7bd9 100644
--- a/libaom/av1/common/scale.c
+++ b/libaom/av1/common/scale.c
@@ -97,13 +97,13 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
// subpel_x_q4 != 0 && subpel_y_q4 != 0
sf->convolve[1][1][0] = av1_convolve_2d_sr;
// subpel_x_q4 == 0 && subpel_y_q4 == 0
- sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
+ sf->convolve[0][0][1] = av1_dist_wtd_convolve_2d_copy;
// subpel_x_q4 == 0
- sf->convolve[0][1][1] = av1_jnt_convolve_y;
+ sf->convolve[0][1][1] = av1_dist_wtd_convolve_y;
// subpel_y_q4 == 0
- sf->convolve[1][0][1] = av1_jnt_convolve_x;
+ sf->convolve[1][0][1] = av1_dist_wtd_convolve_x;
// subpel_x_q4 != 0 && subpel_y_q4 != 0
- sf->convolve[1][1][1] = av1_jnt_convolve_2d;
+ sf->convolve[1][1][1] = av1_dist_wtd_convolve_2d;
// AV1 High BD convolve functions
// Special case convolve functions should produce the same result as
// av1_highbd_convolve_2d.
@@ -116,11 +116,11 @@ void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
// subpel_x_q4 != 0 && subpel_y_q4 != 0
sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr;
// subpel_x_q4 == 0 && subpel_y_q4 == 0
- sf->highbd_convolve[0][0][1] = av1_highbd_jnt_convolve_2d_copy;
+ sf->highbd_convolve[0][0][1] = av1_highbd_dist_wtd_convolve_2d_copy;
// subpel_x_q4 == 0
- sf->highbd_convolve[0][1][1] = av1_highbd_jnt_convolve_y;
+ sf->highbd_convolve[0][1][1] = av1_highbd_dist_wtd_convolve_y;
// subpel_y_q4 == 0
- sf->highbd_convolve[1][0][1] = av1_highbd_jnt_convolve_x;
+ sf->highbd_convolve[1][0][1] = av1_highbd_dist_wtd_convolve_x;
// subpel_x_q4 != 0 && subpel_y_q4 != 0
- sf->highbd_convolve[1][1][1] = av1_highbd_jnt_convolve_2d;
+ sf->highbd_convolve[1][1][1] = av1_highbd_dist_wtd_convolve_2d;
}
diff --git a/libaom/av1/common/scan.h b/libaom/av1/common/scan.h
index 233dc0e..f9c3392 100644
--- a/libaom/av1/common/scan.h
+++ b/libaom/av1/common/scan.h
@@ -25,14 +25,14 @@ extern "C" {
#define MAX_NEIGHBORS 2
-typedef enum SCAN_MODE {
+enum {
SCAN_MODE_ZIG_ZAG,
SCAN_MODE_COL_DIAG,
SCAN_MODE_ROW_DIAG,
SCAN_MODE_COL_1D,
SCAN_MODE_ROW_1D,
SCAN_MODES
-} SCAN_MODE;
+} UENUM1BYTE(SCAN_MODE);
extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
diff --git a/libaom/av1/common/seg_common.h b/libaom/av1/common/seg_common.h
index 8c35bba..fa7894c 100644
--- a/libaom/av1/common/seg_common.h
+++ b/libaom/av1/common/seg_common.h
@@ -24,7 +24,7 @@ extern "C" {
#define SEG_TEMPORAL_PRED_CTXS 3
#define SPATIAL_PREDICTION_PROBS 3
-typedef enum {
+enum {
SEG_LVL_ALT_Q, // Use alternate Quantizer ....
SEG_LVL_ALT_LF_Y_V, // Use alternate loop filter value on y plane vertical
SEG_LVL_ALT_LF_Y_H, // Use alternate loop filter value on y plane horizontal
@@ -34,7 +34,7 @@ typedef enum {
SEG_LVL_SKIP, // Optional Segment (0,0) + skip mode
SEG_LVL_GLOBALMV,
SEG_LVL_MAX
-} SEG_LVL_FEATURES;
+} UENUM1BYTE(SEG_LVL_FEATURES);
struct segmentation {
uint8_t enabled;
diff --git a/libaom/av1/common/tile_common.c b/libaom/av1/common/tile_common.c
index 1b41348..02f50f5 100644
--- a/libaom/av1/common/tile_common.c
+++ b/libaom/av1/common/tile_common.c
@@ -51,6 +51,10 @@ void av1_calculate_tile_cols(AV1_COMMON *const cm) {
int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
int i;
+ // This will be overridden if there are at least two columns of tiles
+ // (otherwise there is no inner tile width).
+ cm->min_inner_tile_width = -1;
+
if (cm->uniform_tile_spacing_flag) {
int start_sb;
int size_sb = ALIGN_POWER_OF_TWO(sb_cols, cm->log2_tile_cols);
@@ -67,18 +71,29 @@ void av1_calculate_tile_cols(AV1_COMMON *const cm) {
cm->tile_width = size_sb << cm->seq_params.mib_size_log2;
cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
+ if (cm->tile_cols > 1) {
+ cm->min_inner_tile_width = cm->tile_width;
+ }
} else {
int max_tile_area_sb = (sb_rows * sb_cols);
int widest_tile_sb = 1;
+ int narrowest_inner_tile_sb = 65536;
cm->log2_tile_cols = tile_log2(1, cm->tile_cols);
for (i = 0; i < cm->tile_cols; i++) {
int size_sb = cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
widest_tile_sb = AOMMAX(widest_tile_sb, size_sb);
+ // Ignore the rightmost tile in the frame when determining the narrowest.
+ if (i < cm->tile_cols - 1)
+ narrowest_inner_tile_sb = AOMMIN(narrowest_inner_tile_sb, size_sb);
}
if (cm->min_log2_tiles) {
max_tile_area_sb >>= (cm->min_log2_tiles + 1);
}
cm->max_tile_height_sb = AOMMAX(max_tile_area_sb / widest_tile_sb, 1);
+ if (cm->tile_cols > 1) {
+ cm->min_inner_tile_width = narrowest_inner_tile_sb
+ << cm->seq_params.mib_size_log2;
+ }
}
}
@@ -143,30 +158,6 @@ int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) {
return sb_cols;
}
-int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) {
- // Round the frame up to a whole number of max superblocks
- mi_frame_size = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2);
-
- // Divide by the signalled number of tiles, rounding up to the multiple of
- // the max superblock size. To do this, shift right (and round up) to get the
- // tile size in max super-blocks and then shift left again to convert it to
- // mi units.
- const int shift = log2_tile_num + MAX_MIB_SIZE_LOG2;
- const int max_sb_tile_size =
- ALIGN_POWER_OF_TWO(mi_frame_size, shift) >> shift;
- const int mi_tile_size = max_sb_tile_size << MAX_MIB_SIZE_LOG2;
-
- // The actual number of tiles is the ceiling of the frame size in mi units
- // divided by mi_size. This is at most 1 << log2_tile_num but might be
- // strictly less if max_sb_tile_size got rounded up significantly.
- if (ntiles) {
- *ntiles = (mi_frame_size + mi_tile_size - 1) / mi_tile_size;
- assert(*ntiles <= (1 << log2_tile_num));
- }
-
- return mi_tile_size;
-}
-
AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
int is_uv) {
AV1PixelRect r;
@@ -205,3 +196,34 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
return r;
}
+
+void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) {
+ if (cm->uniform_tile_spacing_flag) {
+ *w = cm->tile_width;
+ *h = cm->tile_height;
+ } else {
+ for (int i = 0; i < cm->tile_cols; ++i) {
+ const int tile_width_sb =
+ cm->tile_col_start_sb[i + 1] - cm->tile_col_start_sb[i];
+ const int tile_w = tile_width_sb * cm->seq_params.mib_size;
+ assert(i == 0 || tile_w == *w); // ensure all tile columns have the same width
+ *w = tile_w;
+ }
+
+ for (int i = 0; i < cm->tile_rows; ++i) {
+ const int tile_height_sb =
+ cm->tile_row_start_sb[i + 1] - cm->tile_row_start_sb[i];
+ const int tile_h = tile_height_sb * cm->seq_params.mib_size;
+ assert(i == 0 || tile_h == *h); // ensure all tile rows have the same height
+ *h = tile_h;
+ }
+ }
+}
+
+int is_min_tile_width_satisfied(const AV1_COMMON *cm) {
+ // Disable check if there is a single tile col in the frame
+ if (cm->tile_cols == 1) return 1;
+
+ return ((cm->min_inner_tile_width << MI_SIZE_LOG2) >=
+ (64 << av1_superres_scaled(cm)));
+}
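Worked example for the check above: min_inner_tile_width is in MI units, so shifting by MI_SIZE_LOG2 converts it to pixels. With superres inactive, the narrowest non-rightmost tile must be at least 64 pixels wide; when superres scaling is active, the bound doubles to 128 pixels (64 << 1).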
diff --git a/libaom/av1/common/tile_common.h b/libaom/av1/common/tile_common.h
index c03553d..a235f2d 100644
--- a/libaom/av1/common/tile_common.h
+++ b/libaom/av1/common/tile_common.h
@@ -25,7 +25,6 @@ struct AV1Common;
typedef struct TileInfo {
int mi_row_start, mi_row_end;
int mi_col_start, mi_col_end;
- int tg_horz_boundary;
int tile_row;
int tile_col;
} TileInfo;
@@ -37,12 +36,6 @@ void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
-void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
- int *max_log2_tile_cols);
-
-// Calculate the correct tile size (width or height) for (1 << log2_tile_num)
-// tiles horizontally or vertically in the frame.
-int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles);
int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
@@ -61,10 +54,14 @@ AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info,
#define MAX_TILE_WIDTH (4096) // Max Tile width in pixels
#define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels
+void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h);
void av1_get_tile_limits(struct AV1Common *const cm);
void av1_calculate_tile_cols(struct AV1Common *const cm);
void av1_calculate_tile_rows(struct AV1Common *const cm);
+// Checks if the minimum tile_width requirement is satisfied
+int is_min_tile_width_satisfied(const struct AV1Common *cm);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/libaom/av1/common/txb_common.c b/libaom/av1/common/txb_common.c
index c96d37c..cb92bd8 100644
--- a/libaom/av1/common/txb_common.c
+++ b/libaom/av1/common/txb_common.c
@@ -453,23 +453,6 @@ const int8_t *av1_nz_map_ctx_offset[19] = {
av1_nz_map_ctx_offset_64x32, // TX_64x16
};
-void av1_init_lv_map(AV1_COMMON *cm) {
- LV_MAP_CTX_TABLE *coeff_ctx_table = &cm->coeff_ctx_table;
- for (int row = 0; row < 2; ++row) {
- for (int col = 0; col < 2; ++col) {
- for (int sig_mag = 0; sig_mag < 3; ++sig_mag) {
- for (int count = 0; count < BASE_CONTEXT_POSITION_NUM + 1; ++count) {
- if (row == 0 && col == 0 && count > 5) continue;
- if ((row == 0 || col == 0) && count > 8) continue;
-
- coeff_ctx_table->base_ctx_table[row][col][sig_mag][count] =
- get_base_ctx_from_count_mag(row, col, count, sig_mag);
- }
- }
- }
- }
-}
-
const int16_t k_eob_group_start[12] = { 0, 1, 2, 3, 5, 9,
17, 33, 65, 129, 257, 513 };
const int16_t k_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
diff --git a/libaom/av1/common/txb_common.h b/libaom/av1/common/txb_common.h
index 698e95b..8a3932d 100644
--- a/libaom/av1/common/txb_common.h
+++ b/libaom/av1/common/txb_common.h
@@ -159,6 +159,19 @@ static INLINE int get_br_ctx_2d(const uint8_t *const levels,
return mag + 14;
}
+static AOM_FORCE_INLINE int get_br_ctx_eob(const int c, // raster order
+ const int bwl,
+ const TX_CLASS tx_class) {
+ const int row = c >> bwl;
+ const int col = c - (row << bwl);
+ if (c == 0) return 0;
+ if ((tx_class == TX_CLASS_2D && row < 2 && col < 2) ||
+ (tx_class == TX_CLASS_HORIZ && col == 0) ||
+ (tx_class == TX_CLASS_VERT && row == 0))
+ return 7;
+ return 14;
+}
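Worked example for get_br_ctx_eob(): with bwl = 2 (a block 4 coefficients wide), c = 5 gives row = 1 and col = 1, which lies inside the 2x2 corner for TX_CLASS_2D, so the context is 7; c = 0 always returns 0, and any position outside the special cases returns 14.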
+
static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels,
const int c, // raster order
const int bwl, const TX_CLASS tx_class) {
@@ -272,12 +285,10 @@ static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats(
const int row = coeff_idx >> bwl;
const int col = coeff_idx - (row << bwl);
return ctx + nz_map_ctx_offset_1d[col];
- break;
}
case TX_CLASS_VERT: {
const int row = coeff_idx >> bwl;
return ctx + nz_map_ctx_offset_1d[row];
- break;
}
default: break;
}
@@ -421,6 +432,4 @@ static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
#undef MAX_TX_SIZE_UNIT
}
-void av1_init_lv_map(AV1_COMMON *cm);
-
#endif // AOM_AV1_COMMON_TXB_COMMON_H_
diff --git a/libaom/av1/common/warped_motion.c b/libaom/av1/common/warped_motion.c
index 4144c43..e232e10 100644
--- a/libaom/av1/common/warped_motion.c
+++ b/libaom/av1/common/warped_motion.c
@@ -485,7 +485,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
uint16_t *dst16 =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
int32_t tmp32 = *p;
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp32 = tmp32 * conv_params->fwd_offset +
sum * conv_params->bck_offset;
tmp32 = tmp32 >> DIST_PRECISION_BITS;
@@ -563,7 +563,7 @@ static int64_t highbd_warp_error(
uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
ConvolveParams conv_params = get_conv_params(0, 0, bd);
- conv_params.use_jnt_comp_avg = 0;
+ conv_params.use_dist_wtd_comp_avg = 0;
for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
// avoid warping extra 8x8 blocks in the padded region of the frame
@@ -773,7 +773,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
uint8_t *dst8 =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
int32_t tmp32 = *p;
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp32 = tmp32 * conv_params->fwd_offset +
sum * conv_params->bck_offset;
tmp32 = tmp32 >> DIST_PRECISION_BITS;
@@ -846,7 +846,7 @@ static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
ConvolveParams conv_params = get_conv_params(0, 0, 8);
- conv_params.use_jnt_comp_avg = 0;
+ conv_params.use_dist_wtd_comp_avg = 0;
for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
diff --git a/libaom/av1/common/x86/av1_convolve_scale_sse4.c b/libaom/av1/common/x86/av1_convolve_scale_sse4.c
index d9fb537..8f44238 100644
--- a/libaom/av1/common/x86/av1_convolve_scale_sse4.c
+++ b/libaom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -175,7 +175,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
if (conv_params->is_compound) {
if (conv_params->do_average) {
const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
const __m128i shifted_32 =
@@ -207,7 +207,7 @@ static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
if (conv_params->is_compound) {
if (conv_params->do_average) {
int32_t tmp = dst16[y * dst16_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
@@ -408,7 +408,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
__m128i p_32 =
_mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
_mm_mullo_epi32(shifted, wt1));
shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
@@ -443,7 +443,7 @@ static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
if (conv_params->is_compound) {
if (conv_params->do_average) {
int32_t tmp = dst16[y * dst16_stride + x];
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
tmp = tmp >> DIST_PRECISION_BITS;
} else {
diff --git a/libaom/av1/common/x86/av1_inv_txfm_ssse3.c b/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
index 9841bf3..de0a561 100644
--- a/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/libaom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -2920,8 +2920,18 @@ void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
if (!txfm_param->lossless) {
- av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
- txfm_param->tx_size, txfm_param->eob);
+ switch (txfm_param->tx_size) {
+ case TX_4X16:
+ case TX_16X4:
+ // TODO(http://crbug.com/aomedia/2350): the ssse3 versions cause test
+ // vector mismatches.
+ av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+ break;
+ default:
+ av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
+ txfm_param->tx_size, txfm_param->eob);
+ break;
+ }
} else {
av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
}
diff --git a/libaom/av1/common/x86/av1_inv_txfm_ssse3.h b/libaom/av1/common/x86/av1_inv_txfm_ssse3.h
index 66bd339..7d5055d 100644
--- a/libaom/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/libaom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -72,13 +72,13 @@ static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
}
// 1D itx types
-typedef enum ATTRIBUTE_PACKED {
+enum {
IDCT_1D,
IADST_1D,
IFLIPADST_1D = IADST_1D,
IIDENTITY_1D,
ITX_TYPES_1D,
-} ITX_TYPE_1D;
+} UENUM1BYTE(ITX_TYPE_1D);
static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
diff --git a/libaom/av1/common/x86/av1_txfm_sse4.c b/libaom/av1/common/x86/av1_txfm_sse4.c
index 90b9879..65ccd19 100644
--- a/libaom/av1/common/x86/av1_txfm_sse4.c
+++ b/libaom/av1/common/x86/av1_txfm_sse4.c
@@ -9,7 +9,7 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/x86/av1_txfm_sse4.h"
diff --git a/libaom/av1/common/x86/convolve_2d_avx2.c b/libaom/av1/common/x86/convolve_2d_avx2.c
index 0acafd0..ae12a60 100644
--- a/libaom/av1/common/x86/convolve_2d_avx2.c
+++ b/libaom/av1/common/x86/convolve_2d_avx2.c
@@ -27,31 +27,15 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
const int bd = 8;
-
- DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
- int im_h = h + filter_params_y->taps - 1;
int im_stride = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
-
+ int i, is_horiz_4tap = 0, is_vert_4tap = 0;
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
const int bits =
FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- __m256i filt[4], coeffs_h[4], coeffs_v[4];
-
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
-
const __m256i round_const_h = _mm256_set1_epi16(
((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
@@ -65,58 +49,96 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
((1 << (offset_bits - conv_params->round_1)) >> 1));
const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
- for (j = 0; j < w; j += 8) {
- for (i = 0; i < im_h; i += 2) {
- __m256i data = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+ __m256i filt[4], coeffs_h[4], coeffs_v[4];
+
+ filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
- // Load the next line
- if (i + 1 < im_h)
+  // Check whether the horizontal filter reduces to 4 taps (outer taps zero).
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_h[0], coeffs_h[3]), 0)))
+ is_horiz_4tap = 1;
+
+  // Check whether the vertical filter reduces to 4 taps (outer taps zero).
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_v[0], coeffs_v[3]), 0)))
+ is_vert_4tap = 1;
+
+  // 4-tap horizontal filter combined with an 8-tap vertical filter.
+ if (is_horiz_4tap) {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+    // Horizontal (4-tap) filter.
+ for (int j = 0; j < w; j += 8) {
+ for (i = 0; i < (im_h - 2); i += 2) {
+ __m256i data = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+ // Load the next line
data = _mm256_inserti128_si256(
data,
_mm_loadu_si128(
(__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
1);
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt);
- __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
+ round_shift_h);
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ __m256i data_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+ __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt);
res =
_mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
-
_mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
- }
- /* Vertical filter */
- {
+      // Vertical (8-tap) filter.
+ CONVOLVE_SR_VERTICAL_FILTER_8TAP;
+ }
+ } else if (is_vert_4tap) {
+ int im_h = h + 3;
+ const int fo_vert = 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+      // Horizontal (8-tap) filter.
+ CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
+      // Vertical (4-tap) filter.
+ __m256i s[6];
__m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
__m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
__m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
__m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
- __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
- __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
- __m256i s[8];
s[0] = _mm256_unpacklo_epi16(src_0, src_1);
s[1] = _mm256_unpacklo_epi16(src_2, src_3);
- s[2] = _mm256_unpacklo_epi16(src_4, src_5);
-
- s[4] = _mm256_unpackhi_epi16(src_0, src_1);
- s[5] = _mm256_unpackhi_epi16(src_2, src_3);
- s[6] = _mm256_unpackhi_epi16(src_4, src_5);
+ s[3] = _mm256_unpackhi_epi16(src_0, src_1);
+ s[4] = _mm256_unpackhi_epi16(src_2, src_3);
for (i = 0; i < h; i += 2) {
const int16_t *data = &im_block[i * im_stride];
- const __m256i s6 =
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
- const __m256i s7 =
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
- s[3] = _mm256_unpacklo_epi16(s6, s7);
- s[7] = _mm256_unpackhi_epi16(s6, s7);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ s[5] = _mm256_unpackhi_epi16(s4, s5);
- __m256i res_a = convolve(s, coeffs_v);
- __m256i res_b = convolve(s + 4, coeffs_v);
+ __m256i res_a = convolve_4tap(s, coeffs_v + 1);
+ __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1);
// Combine V round and 2F-H-V round into a single rounding
res_a =
@@ -154,13 +176,25 @@ void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
s[0] = s[1];
s[1] = s[2];
- s[2] = s[3];
-
+ s[3] = s[4];
s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
}
}
+ } else {
+ int j;
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (j = 0; j < w; j += 8) {
+ CONVOLVE_SR_HORIZONTAL_FILTER_8TAP;
+
+ CONVOLVE_SR_VERTICAL_FILTER_8TAP;
+ }
}
}
@@ -195,20 +229,20 @@ void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
if (w == 2) {
do {
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
} while (h);
} else if (w == 4) {
do {
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
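
prepare_coeffs_lowbd packs the eight filter taps in adjacent pairs, so coeffs_h[0] holds taps 0-1 and coeffs_h[3] holds taps 6-7 (layout inferred from the usage above). If the low 32 bits of their OR are zero, all four outer taps vanish and the cheaper 4-tap path is safe. A scalar sketch of the same test:

    #include <stdint.h>

    // Hedged scalar equivalent of the SIMD 4-tap check above: an 8-tap kernel
    // whose two outermost taps on each side are zero degenerates to 4 taps.
    static int filter_is_4tap(const int16_t taps[8]) {
      return (taps[0] | taps[1] | taps[6] | taps[7]) == 0;
    }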
diff --git a/libaom/av1/common/x86/convolve_2d_sse2.c b/libaom/av1/common/x86/convolve_2d_sse2.c
index b1a62a4..369922b 100644
--- a/libaom/av1/common/x86/convolve_2d_sse2.c
+++ b/libaom/av1/common/x86/convolve_2d_sse2.c
@@ -255,20 +255,20 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
if (w == 2) {
do {
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
} while (h);
} else if (w == 4) {
do {
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 4 * sizeof(*src));
+ memmove(dst, src, 4 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
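
memcpy has undefined behavior when source and destination overlap; memmove copies as if through a temporary buffer, so these narrow row copies stay defined even if a caller passes overlapping regions. A minimal illustration:

    #include <stdio.h>
    #include <string.h>

    int main(void) {
      char buf[8] = "abcdefg";
      // Overlapping copy: well-defined with memmove, UB with memcpy.
      memmove(buf + 1, buf, 4);
      printf("%s\n", buf);  // prints "aabcdfg"
      return 0;
    }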
@@ -354,12 +354,11 @@ void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
}
}
-void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_sse2(
+ const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -371,7 +370,7 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
const int bits =
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const __m128i zero = _mm_setzero_si128();
const __m128i left_shift = _mm_cvtsi32_si128(bits);
int i, j;
@@ -411,14 +410,14 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
const __m128i data_ref_0_hi =
_mm_loadu_si128((__m128i *)(&dst[j + 8]));
- const __m128i comp_avg_res_lo =
- comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt, use_dist_wtd_comp_avg);
const __m128i round_result_lo = convolve_rounding(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
- const __m128i comp_avg_res_hi =
- comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt, use_dist_wtd_comp_avg);
const __m128i round_result_hi = convolve_rounding(
&comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
@@ -449,7 +448,7 @@ void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
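
The jnt_ to dist_wtd_ rename tracks the spec's terminology: "joint" compound prediction is really distance-weighted compound prediction. Below is a hedged scalar model of what comp_avg computes; the constant name and the invariant that the two weights sum to 1 << DIST_PRECISION_BITS are assumptions based on libaom's convolve code:

    #include <stdint.h>

    #define DIST_PRECISION_BITS 4  // assumed value

    // Blend two compound predictions: distance-weighted when enabled,
    // otherwise a plain average of the two.
    static int32_t comp_avg_scalar(int32_t ref, int32_t res, int32_t w0,
                                   int32_t w1, int use_dist_wtd_comp_avg) {
      if (use_dist_wtd_comp_avg)
        return (ref * w0 + res * w1) >> DIST_PRECISION_BITS;
      return (ref + res) >> 1;
    }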
diff --git a/libaom/av1/common/x86/convolve_avx2.c b/libaom/av1/common/x86/convolve_avx2.c
index 0e91ea9..21b9fe4 100644
--- a/libaom/av1/common/x86/convolve_avx2.c
+++ b/libaom/av1/common/x86/convolve_avx2.c
@@ -23,153 +23,239 @@ void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride;
-
+ int i, j, is_vert_4tap = 0;
// right shift is F-1 because we are already dividing
  // filter coefficients by 2
const int right_shift_bits = (FILTER_BITS - 1);
const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
const __m256i right_shift_const =
_mm256_set1_epi16((1 << right_shift_bits) >> 1);
- __m256i coeffs[4], s[8];
assert(conv_params->round_0 <= FILTER_BITS);
assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
- prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
-
(void)filter_params_x;
(void)subpel_x_q4;
(void)conv_params;
+ __m256i coeffs[4], s[8];
+ __m128i d[6];
- for (j = 0; j < w; j += 16) {
- const uint8_t *data = &src_ptr[j];
- __m256i src6;
-
- // Load lines a and b. Line a to lower 128, line b to upper 128
- const __m256i src_01a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- 0x20);
-
- const __m256i src_12a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- 0x20);
-
- const __m256i src_23a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- 0x20);
-
- const __m256i src_34a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- 0x20);
-
- const __m256i src_45a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
- const __m256i src_56a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- src6, 0x20);
-
- s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
- s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
- s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
- s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
- s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
- const __m256i src_67a = _mm256_permute2x128_si256(
- src6,
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- 0x20);
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
- const __m256i src_78a = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- src6, 0x20);
+  // Check whether the vertical filter reduces to 4 taps (outer taps zero).
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_vert_4tap = 1;
+
+  // 4-tap vertical filter path.
+ if (is_vert_4tap) {
+ const int fo_vert = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
- s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
- s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
- const __m256i res_lo = convolve_lowbd(s, coeffs);
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
- /* rounding code */
- // shift by F - 1
- const __m256i res_16b_lo = _mm256_sra_epi16(
- _mm256_add_epi16(res_lo, right_shift_const), right_shift);
- // 8 bit conversion and saturation to uint8
- __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
- if (w - j > 8) {
- const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+ s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
/* rounding code */
// shift by F - 1
- const __m256i res_16b_hi = _mm256_sra_epi16(
- _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
// 8 bit conversion and saturation to uint8
- __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
-
- __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
-
- const __m128i res_0 = _mm256_castsi256_si128(res_a);
- const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
-
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
- res_1);
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
- const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
- if (w - j > 4) {
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
res_1);
- } else if (w - j > 2) {
- xx_storel_32(&dst[i * dst_stride + j], res_0);
- xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
} else {
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
- __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
- *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
- *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
}
+ s[0] = s[1];
+ s[1] = s[2];
+
+ s[3] = s[4];
+ s[4] = s[5];
}
+ }
+ } else {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+
+ d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a =
+ _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+ s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ const __m256i res_lo = convolve_lowbd(s, coeffs);
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 =
+ (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
}
}
}
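
Both branches implement the same vertical convolution; the 4-tap branch simply keeps a shorter sliding window of unpacked row pairs (s[0..1]/s[3..4] instead of s[0..2]/s[4..6]) and rotates it by one pair per two output rows. A scalar reference for a single column, a sketch under the assumption that the taps were pre-halved as the F - 1 comment above notes:

    #include <stdint.h>

    #define FILTER_BITS 7  // libaom's interpolation-filter precision

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

    // Hedged scalar model of av1_convolve_y_sr for one output column x.
    static void convolve_y_column(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int x, int h,
                                  const int16_t *halved_taps, int taps) {
      const uint8_t *s = src - (taps / 2 - 1) * src_stride;  // center kernel
      const int shift = FILTER_BITS - 1;  // taps were divided by 2 up front
      for (int y = 0; y < h; ++y) {
        int32_t acc = 0;
        for (int k = 0; k < taps; ++k)
          acc += halved_taps[k] * s[(y + k) * src_stride + x];
        dst[y * dst_stride + x] =
            clip_u8((acc + ((1 << shift) >> 1)) >> shift);
      }
    }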
@@ -180,26 +266,14 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
const InterpFilterParams *filter_params_y,
const int subpel_x_q4, const int subpel_y_q4,
ConvolveParams *conv_params) {
- int i, j;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_horiz;
const int bits = FILTER_BITS - conv_params->round_0;
- __m256i filt[4], coeffs[4];
-
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
-
const __m256i round_0_const =
_mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
const __m128i round_shift = _mm_cvtsi32_si128(bits);
-
+ int i, is_horiz_4tap = 0;
(void)filter_params_y;
(void)subpel_y_q4;
@@ -208,51 +282,101 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
assert(conv_params->round_0 > 0);
- if (w <= 8) {
- for (i = 0; i < h; i += 2) {
- const __m256i data = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
- _mm256_castsi128_si256(_mm_loadu_si128(
- (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
- 0x20);
-
- __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
-
- res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
- round_0_shift);
-
- res_16b =
- _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift);
-
- /* rounding code */
- // 8 bit conversion and saturation to uint8
- __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
-
- const __m128i res_0 = _mm256_castsi256_si128(res_8b);
- const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
- if (w > 4) {
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
- } else if (w > 2) {
- xx_storel_32(&dst[i * dst_stride], res_0);
- xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
- } else {
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
- __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
- *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
- *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ __m256i coeffs[4], filt[4];
+ filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+
+  // Check whether the horizontal filter reduces to 4 taps (outer taps zero).
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_horiz_4tap = 1;
+
+  // 4-tap horizontal filter path.
+ if (is_horiz_4tap) {
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
+
+ __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
}
}
} else {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18
- // 19 20 21 22 23
- const __m256i data = _mm256_inserti128_si256(
- _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
- 1);
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ if (w <= 8) {
+ for (i = 0; i < h; i += 2) {
+ const __m256i data = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+ _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+ 0x20);
__m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
@@ -266,11 +390,49 @@ void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
// 8 bit conversion and saturation to uint8
__m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
- // Store values into the destination buffer
- // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
- res_8b = _mm256_permute4x64_epi64(res_8b, 216);
- __m128i res = _mm256_castsi256_si128(res_8b);
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+ } else if (w > 2) {
+ xx_storel_32(&dst[i * dst_stride], res_0);
+ xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17
+ // 18 19 20 21 22 23
+ const __m256i data = _mm256_inserti128_si256(
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]),
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+ 1);
+
+ __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+ round_0_shift);
+
+ res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+ round_shift);
+
+ /* rounding code */
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+ // Store values into the destination buffer
+ // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+ __m128i res = _mm256_castsi256_si128(res_8b);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ }
}
}
}
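
The constant 216 passed to _mm256_permute4x64_epi64 above is 0xD8, i.e. the qword order (0, 2, 1, 3): after packing, each 128-bit lane holds its eight output pixels in its low half, and the permute gathers both halves into the low 128 bits. A small self-contained demo (compile with AVX2 enabled):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void) {
      // Qwords, low to high: 0 1 2 3.
      __m256i v = _mm256_set_epi64x(3, 2, 1, 0);
      // imm 216 == 0b11011000 selects qwords 0, 2, 1, 3.
      __m256i r = _mm256_permute4x64_epi64(v, 216);
      long long out[4];
      _mm256_storeu_si256((__m256i *)out, r);
      printf("%lld %lld %lld %lld\n", out[0], out[1], out[2], out[3]);  // 0 2 1 3
      return 0;
    }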
diff --git a/libaom/av1/common/x86/highbd_convolve_2d_avx2.c b/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
index ae68f0b..357df12 100644
--- a/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
+++ b/libaom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -238,10 +238,10 @@ void av1_highbd_convolve_2d_copy_sr_avx2(
if (w == 2) {
do {
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
- memcpy(dst, src, 2 * sizeof(*src));
+ memmove(dst, src, 2 * sizeof(*src));
src += src_stride;
dst += dst_stride;
h -= 2;
diff --git a/libaom/av1/common/x86/highbd_convolve_2d_sse4.c b/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
index 3f8dafb..3c1d5d1 100644
--- a/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
+++ b/libaom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -21,7 +21,7 @@
#include "aom_dsp/x86/convolve_sse4_1.h"
#include "av1/common/convolve.h"
-void av1_highbd_jnt_convolve_2d_copy_sse4_1(
+void av1_highbd_dist_wtd_convolve_2d_copy_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -37,7 +37,7 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m128i wt0 = _mm_set1_epi32(w0);
@@ -75,15 +75,17 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
const __m128i res_unsigned_lo =
_mm_add_epi32(res_32b_lo, offset_const);
- const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
const __m128i res_unsigned_hi =
_mm_add_epi32(res_32b_hi, offset_const);
- const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo = highbd_convolve_rounding_sse2(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -132,9 +134,9 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
_mm_add_epi32(res_32b_hi, offset_const);
const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo = highbd_convolve_rounding_sse2(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -166,7 +168,7 @@ void av1_highbd_jnt_convolve_2d_copy_sse4_1(
}
}
-void av1_highbd_jnt_convolve_2d_sse4_1(
+void av1_highbd_dist_wtd_convolve_2d_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -179,7 +181,7 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
int im_stride = MAX_SB_SIZE;
int i, j;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
@@ -359,8 +361,9 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);
- const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res =
+ highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result = highbd_convolve_rounding_sse2(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -391,10 +394,12 @@ void av1_highbd_jnt_convolve_2d_sse4_1(
const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);
- const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo =
highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
diff --git a/libaom/av1/common/x86/highbd_inv_txfm_avx2.c b/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
index 5418057..fe22465 100644
--- a/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/libaom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -4309,213 +4309,17 @@ void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
stride, tx_type, tx_size, eob, bd);
break;
- default: assert(0); break;
- }
-}
-
-void av1_highbd_inv_txfm_add_16x16_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- default:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- const int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- // Assembly version doesn't support IDTX, so use C version for it.
- case IDTX:
- av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
-
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_16x32_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_32x16_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-void av1_highbd_inv_txfm_add_8x8_avx2(const tran_low_t *input, uint8_t *dest,
- int stride, const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
case IDTX:
- av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-void av1_highbd_inv_txfm_add_8x32_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_32x8_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-void av1_highbd_inv_txfm_add_16x8_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
case H_DCT:
- case V_ADST:
case H_ADST:
- case V_FLIPADST:
case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_8x16_avx2(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
case V_DCT:
- case H_DCT:
case V_ADST:
- case H_ADST:
case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default:
- av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, output, stride, tx_type,
+ tx_size, eob, bd);
break;
+ default: assert(0); break;
}
}
void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
@@ -4523,33 +4327,12 @@ void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
- case TX_32X32:
- av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param);
- break;
- case TX_16X16:
- av1_highbd_inv_txfm_add_16x16_avx2(input, dest, stride, txfm_param);
- break;
- case TX_8X8:
- av1_highbd_inv_txfm_add_8x8_avx2(input, dest, stride, txfm_param);
- break;
case TX_4X8:
av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
break;
case TX_8X4:
av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
break;
- case TX_8X16:
- av1_highbd_inv_txfm_add_8x16_avx2(input, dest, stride, txfm_param);
- break;
- case TX_16X8:
- av1_highbd_inv_txfm_add_16x8_avx2(input, dest, stride, txfm_param);
- break;
- case TX_16X32:
- av1_highbd_inv_txfm_add_16x32_avx2(input, dest, stride, txfm_param);
- break;
- case TX_32X16:
- av1_highbd_inv_txfm_add_32x16_avx2(input, dest, stride, txfm_param);
- break;
case TX_4X4:
av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
break;
@@ -4559,21 +4342,10 @@ void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
case TX_4X16:
av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
break;
- case TX_8X32:
- av1_highbd_inv_txfm_add_8x32_avx2(input, dest, stride, txfm_param);
- break;
- case TX_32X8:
- av1_highbd_inv_txfm_add_32x8_avx2(input, dest, stride, txfm_param);
- break;
- case TX_64X64:
- case TX_32X64:
- case TX_64X32:
- case TX_16X64:
- case TX_64X16:
+ default:
av1_highbd_inv_txfm2d_add_universe_avx2(
input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
txfm_param->eob, txfm_param->bd);
break;
- default: assert(0 && "Invalid transform size"); break;
}
}
diff --git a/libaom/av1/common/x86/highbd_inv_txfm_sse4.c b/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
index 12c6350..8a8641d 100644
--- a/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/libaom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -583,7 +583,66 @@ static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
_mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
_mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
}
+static void highbd_clamp_epi32_sse4_1(const __m128i *in, __m128i *out,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi, int size) {
+ __m128i a0, a1;
+ for (int i = 0; i < size; i += 4) {
+ a0 = _mm_max_epi32(in[i], *clamp_lo);
+ out[i] = _mm_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm_max_epi32(in[i + 1], *clamp_lo);
+ out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
+
+ a0 = _mm_max_epi32(in[i + 2], *clamp_lo);
+ out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
+
+ a1 = _mm_max_epi32(in[i + 3], *clamp_lo);
+ out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
+ }
+}
+static void iidentity4_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ (void)out_shift;
+ __m128i v[4];
+ __m128i fact = _mm_set1_epi32(NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a0, a1;
+
+ a0 = _mm_mullo_epi32(in[0], fact);
+ a1 = _mm_mullo_epi32(in[1], fact);
+ a0 = _mm_add_epi32(a0, offset);
+ a1 = _mm_add_epi32(a1, offset);
+ out[0] = _mm_srai_epi32(a0, NewSqrt2Bits);
+ out[1] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+ a0 = _mm_mullo_epi32(in[2], fact);
+ a1 = _mm_mullo_epi32(in[3], fact);
+ a0 = _mm_add_epi32(a0, offset);
+ a1 = _mm_add_epi32(a1, offset);
+ out[2] = _mm_srai_epi32(a0, NewSqrt2Bits);
+ out[3] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+ if (!do_cols) {
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ highbd_clamp_epi32_sse4_1(out, out, &clamp_lo, &clamp_hi, 4);
+ }
+
+ // Transpose for 4x4
+ v[0] = _mm_unpacklo_epi32(out[0], out[1]);
+ v[1] = _mm_unpackhi_epi32(out[0], out[1]);
+ v[2] = _mm_unpacklo_epi32(out[2], out[3]);
+ v[3] = _mm_unpackhi_epi32(out[2], out[3]);
+ out[0] = _mm_unpacklo_epi64(v[0], v[2]);
+ out[1] = _mm_unpackhi_epi64(v[0], v[2]);
+ out[2] = _mm_unpacklo_epi64(v[1], v[3]);
+ out[3] = _mm_unpackhi_epi64(v[1], v[3]);
+}
void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
__m128i in[4];
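
The 4-point inverse identity transform scales each coefficient by sqrt(2), carried out in fixed point with NewSqrt2 and NewSqrt2Bits (5793 and 12 in libaom's av1_txfm.h). A scalar sketch of one coefficient:

    #include <stdint.h>

    // Hedged scalar model of iidentity4_sse4_1: x * sqrt(2) in Q12 arithmetic.
    static int32_t identity4_scalar(int32_t x) {
      const int32_t kNewSqrt2 = 5793, kNewSqrt2Bits = 12;  // libaom constants
      return (int32_t)(((int64_t)x * kNewSqrt2 + (1 << (kNewSqrt2Bits - 1))) >>
                       kNewSqrt2Bits);
    }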
@@ -646,6 +705,48 @@ void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
break;
+ case IDTX:
+ load_buffer_4x4(coeff, in);
+ iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_DCT:
+ load_buffer_4x4(coeff, in);
+ iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ idct4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case H_DCT:
+ load_buffer_4x4(coeff, in);
+ idct4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_ADST:
+ load_buffer_4x4(coeff, in);
+ iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case H_ADST:
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 0, -shift[1], bd);
+ break;
+ case V_FLIPADST:
+ load_buffer_4x4(coeff, in);
+ iidentity4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iadst4x4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 0, 1, -shift[1], bd);
+ break;
+ case H_FLIPADST:
+ load_buffer_4x4(coeff, in);
+ iadst4x4_sse4_1(in, in, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, 0);
+ iidentity4_sse4_1(in, in, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+ write_buffer_4x4(in, output, stride, 1, 0, -shift[1], bd);
+ break;
default: assert(0);
}
}
@@ -1116,6 +1217,61 @@ static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
&clamp_hi_out, out_shift);
}
}
+static void shift_sse4_1(const __m128i *in, __m128i *out,
+ const __m128i *clamp_lo, const __m128i *clamp_hi,
+ int shift, int size) {
+ __m128i offset = _mm_set1_epi32((1 << shift) >> 1);
+ __m128i shift_vec = _mm_cvtsi32_si128(shift);
+ __m128i a0, a1;
+ for (int i = 0; i < size; i += 4) {
+ a0 = _mm_add_epi32(in[i], offset);
+ a1 = _mm_add_epi32(in[i + 1], offset);
+ a0 = _mm_sra_epi32(a0, shift_vec);
+ a1 = _mm_sra_epi32(a1, shift_vec);
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ out[i] = _mm_min_epi32(a0, *clamp_hi);
+ out[i + 1] = _mm_min_epi32(a1, *clamp_hi);
+
+ a0 = _mm_add_epi32(in[i + 2], offset);
+ a1 = _mm_add_epi32(in[i + 3], offset);
+ a0 = _mm_sra_epi32(a0, shift_vec);
+ a1 = _mm_sra_epi32(a1, shift_vec);
+ a0 = _mm_max_epi32(a0, *clamp_lo);
+ a1 = _mm_max_epi32(a1, *clamp_lo);
+ out[i + 2] = _mm_min_epi32(a0, *clamp_hi);
+ out[i + 3] = _mm_min_epi32(a1, *clamp_hi);
+ }
+}
+
+static void iidentity8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i v[8];
+ v[0] = _mm_add_epi32(in[0], in[0]);
+ v[1] = _mm_add_epi32(in[1], in[1]);
+ v[2] = _mm_add_epi32(in[2], in[2]);
+ v[3] = _mm_add_epi32(in[3], in[3]);
+ v[4] = _mm_add_epi32(in[4], in[4]);
+ v[5] = _mm_add_epi32(in[5], in[5]);
+ v[6] = _mm_add_epi32(in[6], in[6]);
+ v[7] = _mm_add_epi32(in[7], in[7]);
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 8);
+ } else {
+ highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 8);
+ }
+}
static void round_shift_8x8(__m128i *in, int shift) {
round_shift_4x4(&in[0], shift);
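
shift_sse4_1 fuses the rounding right-shift with the clamp that keeps intermediates inside the bit-depth-dependent range, four lanes at a time. Its per-value scalar equivalent:

    #include <stdint.h>

    // Hedged scalar equivalent of shift_sse4_1: round-shift, then clamp.
    static int32_t round_shift_clamp(int32_t x, int shift, int32_t lo,
                                     int32_t hi) {
      const int32_t y = (x + ((1 << shift) >> 1)) >> shift;
      return y < lo ? lo : (y > hi ? hi : y);
    }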
@@ -3000,7 +3156,59 @@ static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
}
}
}
+static void iidentity16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i v[16];
+ __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
+ __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m128i a0, a1, a2, a3;
+
+ for (int i = 0; i < 16; i += 8) {
+ a0 = _mm_mullo_epi32(in[i], fact);
+ a1 = _mm_mullo_epi32(in[i + 1], fact);
+ a0 = _mm_add_epi32(a0, offset);
+ a1 = _mm_add_epi32(a1, offset);
+ v[i] = _mm_srai_epi32(a0, NewSqrt2Bits);
+ v[i + 1] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+ a2 = _mm_mullo_epi32(in[i + 2], fact);
+ a3 = _mm_mullo_epi32(in[i + 3], fact);
+ a2 = _mm_add_epi32(a2, offset);
+ a3 = _mm_add_epi32(a3, offset);
+ v[i + 2] = _mm_srai_epi32(a2, NewSqrt2Bits);
+ v[i + 3] = _mm_srai_epi32(a3, NewSqrt2Bits);
+
+ a0 = _mm_mullo_epi32(in[i + 4], fact);
+ a1 = _mm_mullo_epi32(in[i + 5], fact);
+ a0 = _mm_add_epi32(a0, offset);
+ a1 = _mm_add_epi32(a1, offset);
+ v[i + 4] = _mm_srai_epi32(a0, NewSqrt2Bits);
+ v[i + 5] = _mm_srai_epi32(a1, NewSqrt2Bits);
+
+ a2 = _mm_mullo_epi32(in[i + 6], fact);
+ a3 = _mm_mullo_epi32(in[i + 7], fact);
+ a2 = _mm_add_epi32(a2, offset);
+ a3 = _mm_add_epi32(a3, offset);
+ v[i + 6] = _mm_srai_epi32(a2, NewSqrt2Bits);
+ v[i + 7] = _mm_srai_epi32(a3, NewSqrt2Bits);
+ }
+
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 16);
+ } else {
+ highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 16);
+ }
+}
static INLINE void idct64_stage8_sse4_1(
__m128i *u, const __m128i *cospim32, const __m128i *cospi32,
const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
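
Taken together, the iidentityN kernels implement AV1's size-dependent identity scaling: sqrt(2)*x for N = 4, 2x for N = 8, 2*sqrt(2)*x for N = 16, and 4x for N = 32. A compact scalar summary (constants per libaom's av1_txfm.h):

    #include <stdint.h>

    static int32_t identity_scale(int32_t x, int n) {
      const int32_t kNewSqrt2 = 5793, kRound = 1 << 11;  // Q12 fixed point
      switch (n) {
        case 4: return (int32_t)(((int64_t)x * kNewSqrt2 + kRound) >> 12);
        case 8: return 2 * x;                        // iidentity8: in + in
        case 16: return (int32_t)(((int64_t)x * 2 * kNewSqrt2 + kRound) >> 12);
        case 32: return 4 * x;                       // iidentity32: << 2
        default: return x;
      }
    }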
@@ -5020,207 +5228,23 @@ void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
const TX_TYPE tx_type = txfm_param->tx_type;
const int32_t *src = cast_to_int32(input);
switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
case IDTX:
- av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
case H_DCT:
- case V_ADST:
case H_ADST:
- case V_FLIPADST:
case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
case V_DCT:
- case H_DCT:
case V_ADST:
- case H_ADST:
case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default:
av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
txfm_param->tx_size,
txfm_param->eob, bd);
break;
- }
-}
-
-void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
- uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
default:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- }
-}
-
-void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
- uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- // Assembly version doesn't support IDTX, so use C version for it.
- case IDTX:
- av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_16x32_sse4_1(const tran_low_t *input,
- uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_16x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_32x16_sse4_1(const tran_low_t *input,
- uint8_t *dest, int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_32x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_8x32_sse4_1(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_8x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
- break;
- default: assert(0);
- }
-}
-
-void av1_highbd_inv_txfm_add_32x8_sse4_1(const tran_low_t *input, uint8_t *dest,
- int stride,
- const TxfmParam *txfm_param) {
- int bd = txfm_param->bd;
- const TX_TYPE tx_type = txfm_param->tx_type;
- const int32_t *src = cast_to_int32(input);
- switch (tx_type) {
- case DCT_DCT:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
- txfm_param->tx_size,
- txfm_param->eob, bd);
- break;
- case IDTX:
- av1_inv_txfm2d_add_32x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
- txfm_param->tx_type, txfm_param->bd);
+ av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
break;
- default: assert(0);
}
}
-
void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
int stride,
const TxfmParam *txfm_param) {
@@ -5235,53 +5259,271 @@ void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
return;
}
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, bd);
- break;
- }
+ av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
}
+static void iidentity32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ (void)bit;
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i v[32];
+ for (int i = 0; i < 32; i += 16) {
+ v[i] = _mm_slli_epi32(in[i], 2);
+ v[i + 1] = _mm_slli_epi32(in[i + 1], 2);
+ v[i + 2] = _mm_slli_epi32(in[i + 2], 2);
+ v[i + 3] = _mm_slli_epi32(in[i + 3], 2);
+ v[i + 4] = _mm_slli_epi32(in[i + 4], 2);
+ v[i + 5] = _mm_slli_epi32(in[i + 5], 2);
+ v[i + 6] = _mm_slli_epi32(in[i + 6], 2);
+ v[i + 7] = _mm_slli_epi32(in[i + 7], 2);
+ v[i + 8] = _mm_slli_epi32(in[i + 8], 2);
+ v[i + 9] = _mm_slli_epi32(in[i + 9], 2);
+ v[i + 10] = _mm_slli_epi32(in[i + 10], 2);
+ v[i + 11] = _mm_slli_epi32(in[i + 11], 2);
+ v[i + 12] = _mm_slli_epi32(in[i + 12], 2);
+ v[i + 13] = _mm_slli_epi32(in[i + 13], 2);
+ v[i + 14] = _mm_slli_epi32(in[i + 14], 2);
+ v[i + 15] = _mm_slli_epi32(in[i + 15], 2);
+ }
+ if (!do_cols) {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ shift_sse4_1(v, out, &clamp_lo_out, &clamp_hi_out, out_shift, 32);
+ } else {
+ highbd_clamp_epi32_sse4_1(v, out, &clamp_lo, &clamp_hi, 32);
+ }
+}
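For reference, a scalar sketch of what iidentity32_sse4_1 computes per coefficient; the helper name is hypothetical, and the multiply by 4 mirrors the _mm_slli_epi32(in, 2) above:

static void iidentity32_scalar_sketch(const int32_t *in, int32_t *out, int n,
                                      int log_range) {
  // IDTX on a 32-point dimension scales each coefficient by 4, then
  // clamps to the signed working range used by the SIMD kernel.
  const int32_t lo = -(1 << (log_range - 1));
  const int32_t hi = (1 << (log_range - 1)) - 1;
  for (int i = 0; i < n; ++i) {
    const int32_t v = in[i] * 4;
    out[i] = v < lo ? lo : (v > hi ? hi : v);
  }
}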
static const transform_1d_sse4_1
highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
{
{ idct4x4_sse4_1, NULL, NULL, NULL },
{ iadst4x4_sse4_1, NULL, NULL, NULL },
- { NULL, NULL, NULL, NULL },
+ { iidentity4_sse4_1, iidentity4_sse4_1, iidentity4_sse4_1, NULL },
},
{ { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
{ iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
- { NULL, NULL, NULL, NULL } },
+ { iidentity8_sse4_1, iidentity8_sse4_1, NULL, NULL } },
{
{ idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
NULL },
{ iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
NULL },
- { NULL, NULL, NULL, NULL },
+ { iidentity16_sse4_1, NULL, iidentity16_sse4_1, NULL },
},
{ { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
idct32x32_sse4_1 },
{ NULL, NULL, NULL, NULL },
- { NULL, NULL, NULL, NULL } },
+ { iidentity32_sse4_1, NULL, NULL, NULL } },
{ { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
idct64x64_sse4_1 },
{ NULL, NULL, NULL, NULL },
{ NULL, NULL, NULL, NULL } }
};
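The table above is indexed three ways: by transform size, by 1-D transform family (DCT / ADST / identity), and by an eob-derived bucket selecting a reduced-input kernel (NULL where no SIMD version exists). A minimal sketch of the row-kernel lookup as the functions below perform it; the wrapper name is hypothetical:

static transform_1d_sse4_1 row_kernel_sketch(TX_SIZE tx_size, TX_TYPE tx_type,
                                             int eob_fun_idx) {
  // hitx_1d_tab maps the 2-D tx_type to its horizontal 1-D family, and
  // eob_fun_idx picks the low1/low8/low16/full variant.
  return highbd_txfm_all_1d_zeros_w8_arr[get_txw_idx(tx_size)]
                                        [hitx_1d_tab[tx_type]][eob_fun_idx];
}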
+static void highbd_inv_txfm2d_add_h_identity_ssse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int buf_size_w_div4 = input_stride >> 2;
+ const int buf_size_h_div8 = (eoby + 8) >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < (buf_size_h_div8 << 1); ++i) {
+ __m128i buf0[16];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
+ NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+
+ for (int j = 0; j < buf_size_w_div4; ++j) {
+ _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+ _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+ _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+ _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+ }
+ }
+ for (int i = 0; i < buf_size_w_div4; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2, output + 8 * i,
+ stride, ud_flip, txfm_size_row, bd);
+ }
+}
+static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int buf_size_w_div8 = input_stride >> 2;
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < (row_max >> 2); ++i) {
+ __m128i buf0[16];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(
+ buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
+static void highbd_inv_txfm2d_add_idtx_ssse41(const int32_t *input,
+ uint16_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {
+ (void)eob;
+ __m128i buf1[64 * 4];
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int row_max = AOMMIN(32, txfm_size_row);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][0];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+
+ for (int i = 0; i < (row_max >> 2); ++i) {
+ __m128i buf0[32];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < (input_stride >> 2); ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ av1_round_shift_rect_array_32_sse4_1(buf0, buf0, input_stride, 0,
+ NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ for (int j = 0; j < (input_stride >> 2); ++j) {
+ _buf1[j * txfm_size_row + 0] = buf0[j * 4 + 0];
+ _buf1[j * txfm_size_row + 1] = buf0[j * 4 + 1];
+ _buf1[j * txfm_size_row + 2] = buf0[j * 4 + 2];
+ _buf1[j * txfm_size_row + 3] = buf0[j * 4 + 3];
+ }
+ }
+ for (int i = 0; i < (input_stride >> 2); i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, 0, txfm_size_row,
+ bd);
+ }
+ }
+}
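Rectangular blocks (rect_type == 1 or -1) are pre-scaled by 1/sqrt(2) before the row transform so the 2-D gain matches the square sizes. A scalar sketch of av1_round_shift_rect_array_32_sse4_1 for the bit == 0 case used here, assuming libaom's usual constants NewInvSqrt2 == 2896 and NewSqrt2Bits == 12:

static void round_shift_rect_sketch(const int32_t *in, int32_t *out, int n) {
  const int32_t kNewInvSqrt2 = 2896;  // round(2^12 / sqrt(2)) -- assumed
  const int kNewSqrt2Bits = 12;       // assumed to match av1_txfm.h
  for (int i = 0; i < n; ++i) {
    const int64_t v = (int64_t)in[i] * kNewInvSqrt2;
    out[i] = (int32_t)((v + (1 << (kNewSqrt2Bits - 1))) >> kNewSqrt2Bits);
  }
}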
static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
uint16_t *output,
int stride, TX_TYPE tx_type,
@@ -5613,6 +5855,24 @@ void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
bd);
break;
+ case V_DCT:
+ case V_ADST:
+ case V_FLIPADST:
+ highbd_inv_txfm2d_add_h_identity_ssse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ case H_DCT:
+ case H_ADST:
+ case H_FLIPADST:
+ highbd_inv_txfm2d_add_v_identity_ssse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ case IDTX:
+ highbd_inv_txfm2d_add_idtx_ssse41(input, CONVERT_TO_SHORTPTR(output),
+ stride, tx_type, tx_size, eob, bd);
+ break;
default: assert(0); break;
}
}
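In these new cases, V_* denotes the named 1-D transform applied vertically with identity horizontally (hence the h_identity handler), H_* the reverse, and IDTX is identity in both dimensions. A hedged sketch of the same dispatch as a selector; the function itself is illustrative only:

typedef void (*hibd_itx_handler_sketch)(const int32_t *, uint16_t *, int,
                                        TX_TYPE, TX_SIZE, int, const int);
static hibd_itx_handler_sketch pick_identity_handler_sketch(TX_TYPE t) {
  switch (t) {
    case V_DCT: case V_ADST: case V_FLIPADST:
      return highbd_inv_txfm2d_add_h_identity_ssse41;  // identity rows
    case H_DCT: case H_ADST: case H_FLIPADST:
      return highbd_inv_txfm2d_add_v_identity_ssse41;  // identity cols
    case IDTX:
      return highbd_inv_txfm2d_add_idtx_ssse41;        // identity both ways
    default:
      return NULL;
  }
}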
@@ -5623,26 +5883,9 @@ void av1_highbd_inv_txfm_add_4x8_sse4_1(const tran_low_t *input, uint8_t *dest,
int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const TX_SIZE tx_size = txfm_param->tx_size;
- const int32_t *src = cast_to_int32(input);
int eob = txfm_param->eob;
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_4x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, tx_size, eob, bd);
- break;
- }
+ highbd_inv_txfm2d_add_4x8_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
}
void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5651,26 +5894,9 @@ void av1_highbd_inv_txfm_add_8x4_sse4_1(const tran_low_t *input, uint8_t *dest,
int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const TX_SIZE tx_size = txfm_param->tx_size;
- const int32_t *src = cast_to_int32(input);
int eob = txfm_param->eob;
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_8x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
- tx_type, tx_size, eob, bd);
- break;
- }
+ highbd_inv_txfm2d_add_8x4_sse41(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
}
void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5679,26 +5905,9 @@ void av1_highbd_inv_txfm_add_4x16_sse4_1(const tran_low_t *input, uint8_t *dest,
int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const TX_SIZE tx_size = txfm_param->tx_size;
- const int32_t *src = cast_to_int32(input);
int eob = txfm_param->eob;
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_4x16_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest),
- stride, tx_type, tx_size, eob, bd);
- break;
- }
+ highbd_inv_txfm2d_add_4x16_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
}
void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5707,26 +5916,9 @@ void av1_highbd_inv_txfm_add_16x4_sse4_1(const tran_low_t *input, uint8_t *dest,
int bd = txfm_param->bd;
const TX_TYPE tx_type = txfm_param->tx_type;
const TX_SIZE tx_size = txfm_param->tx_size;
- const int32_t *src = cast_to_int32(input);
int eob = txfm_param->eob;
- switch (tx_type) {
- // Assembly version doesn't support some transform types, so use C version
- // for those.
- case V_DCT:
- case H_DCT:
- case V_ADST:
- case H_ADST:
- case V_FLIPADST:
- case H_FLIPADST:
- case IDTX:
- av1_inv_txfm2d_add_16x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
- bd);
- break;
- default:
- highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest),
- stride, tx_type, tx_size, eob, bd);
- break;
- }
+ highbd_inv_txfm2d_add_16x4_sse4_1(input, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, tx_size, eob, bd);
}
void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
@@ -5734,57 +5926,16 @@ void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
- case TX_32X32:
- av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X16:
- av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_8X8:
- av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
- break;
case TX_4X8:
av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
break;
case TX_8X4:
av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
break;
- case TX_8X16:
- av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X8:
- av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X32:
- av1_highbd_inv_txfm_add_16x32_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_32X16:
- av1_highbd_inv_txfm_add_32x16_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_4X4:
- av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_16X4:
- av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_4X16:
- av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_8X32:
- av1_highbd_inv_txfm_add_8x32_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_32X8:
- av1_highbd_inv_txfm_add_32x8_sse4_1(input, dest, stride, txfm_param);
- break;
- case TX_64X64:
- case TX_32X64:
- case TX_64X32:
- case TX_16X64:
- case TX_64X16:
- av1_highbd_inv_txfm2d_add_universe_sse4_1(
- input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
- txfm_param->eob, txfm_param->bd);
+ default:
+ // TODO(http://crbug.com/aomedia/2350): the remaining sse4_1 versions
+ // cause test vector mismatches.
+ av1_highbd_inv_txfm_add_c(input, dest, stride, txfm_param);
break;
- default: assert(0 && "Invalid transform size"); break;
}
}
diff --git a/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c b/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
index e298cf6..c5040c4 100644
--- a/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/libaom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -22,7 +22,7 @@
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"
-void av1_highbd_jnt_convolve_2d_copy_avx2(
+void av1_highbd_dist_wtd_convolve_2d_copy_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -38,7 +38,7 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m256i wt0 = _mm256_set1_epi32(w0);
@@ -78,15 +78,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
const __m256i res_unsigned_lo =
_mm256_add_epi32(res_32b_lo, offset_const);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
const __m256i res_unsigned_hi =
_mm256_add_epi32(res_32b_hi, offset_const);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo = highbd_convolve_rounding(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -135,8 +137,9 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
const __m256i res_unsigned_lo =
_mm256_add_epi32(res_32b, offset_const);
- const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -179,15 +182,17 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
const __m256i res_unsigned_lo =
_mm256_add_epi32(res_32b_lo, offset_const);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
const __m256i res_unsigned_hi =
_mm256_add_epi32(res_32b_hi, offset_const);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo =
highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
@@ -223,7 +228,7 @@ void av1_highbd_jnt_convolve_2d_copy_avx2(
}
}
-void av1_highbd_jnt_convolve_2d_avx2(
+void av1_highbd_dist_wtd_convolve_2d_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -244,7 +249,7 @@ void av1_highbd_jnt_convolve_2d_avx2(
__m256i s[8], coeffs_y[4], coeffs_x[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
@@ -364,8 +369,9 @@ void av1_highbd_jnt_convolve_2d_avx2(
const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
- const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -409,10 +415,12 @@ void av1_highbd_jnt_convolve_2d_avx2(
const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo =
highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
@@ -456,7 +464,7 @@ void av1_highbd_jnt_convolve_2d_avx2(
}
}
-void av1_highbd_jnt_convolve_x_avx2(
+void av1_highbd_dist_wtd_convolve_x_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -473,7 +481,7 @@ void av1_highbd_jnt_convolve_x_avx2(
__m256i s[4], coeffs_x[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m256i wt0 = _mm256_set1_epi32(w0);
@@ -548,7 +556,7 @@ void av1_highbd_jnt_convolve_x_avx2(
const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -588,10 +596,12 @@ void av1_highbd_jnt_convolve_x_avx2(
const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo = highbd_convolve_rounding(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
@@ -623,7 +633,7 @@ void av1_highbd_jnt_convolve_x_avx2(
}
}
-void av1_highbd_jnt_convolve_y_avx2(
+void av1_highbd_dist_wtd_convolve_y_avx2(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -640,7 +650,7 @@ void av1_highbd_jnt_convolve_y_avx2(
int i, j;
__m256i s[8], coeffs_y[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
@@ -753,8 +763,9 @@ void av1_highbd_jnt_convolve_y_avx2(
const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
- const __m256i comp_avg_res = highbd_comp_avg(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res =
+ highbd_comp_avg(&data_ref_0, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result = highbd_convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -799,10 +810,12 @@ void av1_highbd_jnt_convolve_y_avx2(
const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
- const __m256i comp_avg_res_lo = highbd_comp_avg(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m256i comp_avg_res_hi = highbd_comp_avg(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_lo =
+ highbd_comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
+ const __m256i comp_avg_res_hi =
+ highbd_comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1,
+ use_dist_wtd_comp_avg);
const __m256i round_result_lo =
highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
diff --git a/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c b/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
index 1a29985..7fea36a 100644
--- a/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
+++ b/libaom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -17,7 +17,7 @@
#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_sse4_1.h"
-void av1_highbd_jnt_convolve_y_sse4_1(
+void av1_highbd_dist_wtd_convolve_y_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -33,7 +33,7 @@ void av1_highbd_jnt_convolve_y_sse4_1(
assert(bits >= 0);
int i, j;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
@@ -121,10 +121,12 @@ void av1_highbd_jnt_convolve_y_sse4_1(
const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
- const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg);
- const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1(
- &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0, &res_unsigned_lo_0, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_1 =
+ highbd_comp_avg_sse4_1(&data_ref_1, &res_unsigned_lo_1, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_0 =
highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const,
@@ -186,16 +188,16 @@ void av1_highbd_jnt_convolve_y_sse4_1(
const __m128i comp_avg_res_lo_0 =
highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_lo_1 =
highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_hi_0 =
highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i comp_avg_res_hi_1 =
highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1,
- &wt0, &wt1, use_jnt_comp_avg);
+ &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo_0 =
highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const,
@@ -257,7 +259,7 @@ void av1_highbd_jnt_convolve_y_sse4_1(
}
}
-void av1_highbd_jnt_convolve_x_sse4_1(
+void av1_highbd_dist_wtd_convolve_x_sse4_1(
const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
int h, const InterpFilterParams *filter_params_x,
const InterpFilterParams *filter_params_y, const int subpel_x_q4,
@@ -274,7 +276,7 @@ void av1_highbd_jnt_convolve_x_sse4_1(
__m128i s[4], coeffs_x[4];
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int w0 = conv_params->fwd_offset;
const int w1 = conv_params->bck_offset;
const __m128i wt0 = _mm_set1_epi32(w0);
@@ -339,7 +341,7 @@ void av1_highbd_jnt_convolve_x_sse4_1(
const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
- &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_dist_wtd_comp_avg);
const __m128i round_result = highbd_convolve_rounding_sse2(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -359,10 +361,12 @@ void av1_highbd_jnt_convolve_x_sse4_1(
const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
- const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
- &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
- const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
- &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo, &res_unsigned_lo, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
+ const __m128i comp_avg_res_hi =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi, &res_unsigned_hi, &wt0,
+ &wt1, use_dist_wtd_comp_avg);
const __m128i round_result_lo = highbd_convolve_rounding_sse2(
&comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/highbd_warp_plane_sse4.c b/libaom/av1/common/x86/highbd_warp_plane_sse4.c
index 4bcab05..3765c5e 100644
--- a/libaom/av1/common/x86/highbd_warp_plane_sse4.c
+++ b/libaom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -537,7 +537,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
__m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
__m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
_mm_mullo_epi32(res_lo, wt1));
res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
@@ -570,7 +570,7 @@ void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
(__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
__m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
_mm_mullo_epi32(res_hi, wt1));
res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
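Throughout this rename, use_dist_wtd_comp_avg (formerly use_jnt_comp_avg) selects distance-weighted blending of the two compound predictions instead of a plain average, exactly as the intrinsics above compute with wt0/wt1 and DIST_PRECISION_BITS. A scalar sketch; the helper name is hypothetical, and DIST_PRECISION_BITS == 4 is assumed to match libaom:

static int32_t dist_wtd_avg_sketch(int32_t p0, int32_t p1, int32_t w0,
                                   int32_t w1, int use_dist_wtd_comp_avg) {
  const int kDistPrecisionBits = 4;  // assumed value of DIST_PRECISION_BITS
  if (use_dist_wtd_comp_avg) return (p0 * w0 + p1 * w1) >> kDistPrecisionBits;
  return (p0 + p1) >> 1;  // plain compound average
}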
diff --git a/libaom/av1/common/x86/jnt_convolve_avx2.c b/libaom/av1/common/x86/jnt_convolve_avx2.c
index 9f2e2b4..23cd6ab 100644
--- a/libaom/av1/common/x86/jnt_convolve_avx2.c
+++ b/libaom/av1/common/x86/jnt_convolve_avx2.c
@@ -35,22 +35,20 @@ static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
_mm256_castsi128_si256(_mm_loadu_si128((__m128i *)b)), 0x20);
}
-void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
- int i, j;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_horiz;
+ int i, j, is_horiz_4tap = 0;
const int bits = FILTER_BITS - conv_params->round_1;
const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -58,18 +56,10 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int rounding_shift =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
- __m256i filt[4], coeffs[4];
assert(bits >= 0);
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
-
const __m256i round_const =
_mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
@@ -77,68 +67,136 @@ void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
(void)filter_params_y;
(void)subpel_y_q4;
- for (i = 0; i < h; i += 2) {
- const uint8_t *src_data = src_ptr + i * src_stride;
- CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
- for (j = 0; j < w; j += 8) {
- const __m256i data =
- load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+ __m256i filt[4], coeffs[4];
- __m256i res = convolve_lowbd_x(data, coeffs, filt);
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
- res = _mm256_slli_epi16(res, bits);
+  // Use the 4-tap path when the outermost filter taps are all zero.
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_horiz_4tap = 1;
- const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+  // 4-tap horizontal filter path.
+ if (is_horiz_4tap) {
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+ for (i = 0; i < h; i += 2) {
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+ for (j = 0; j < w; j += 8) {
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
- // Accumulate values into the destination buffer
- if (do_average) {
- const __m256i data_ref_0 =
- load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs + 1, filt);
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+ res = _mm256_slli_epi16(res, bits);
- const __m256i round_result = convolve_rounding(
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
- const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
- if (w > 4) {
- _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
- _mm_storel_epi64(
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
} else {
- *(uint32_t *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0);
- *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
- _mm_cvtsi128_si32(res_1);
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
}
- } else {
- const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ }
+ }
+ } else {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+ for (i = 0; i < h; i += 2) {
+ const uint8_t *src_data = src_ptr + i * src_stride;
+ CONV_BUF_TYPE *dst_data = dst + i * dst_stride;
+ for (j = 0; j < w; j += 8) {
+ const __m256i data =
+ load_line2_avx2(&src_data[j], &src_data[j + src_stride]);
+
+ __m256i res = convolve_lowbd_x(data, coeffs, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const), round_shift);
+
+ res = _mm256_slli_epi16(res, bits);
+
+ const __m256i res_unsigned = _mm256_add_epi16(res, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst_data[j], &dst_data[j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
- const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_1);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
}
}
}
}
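The is_horiz_4tap shortcut above relies on how prepare_coeffs_lowbd packs the 8 taps: coeffs[0] carries taps (0,1) and coeffs[3] taps (6,7), so a zero OR of the two means the outer four taps vanish, two multiply-accumulate stages can be skipped, and the source offset fo_horiz shrinks from taps/2 - 1 to 1. A scalar sketch of the same test on raw filter taps (sketch only):

static int filter_is_4tap_sketch(const int16_t *taps /* 8 entries */) {
  // True when the 8-tap filter degenerates to its middle 4 taps.
  return taps[0] == 0 && taps[1] == 0 && taps[6] == 0 && taps[7] == 0;
}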
-void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ int i, j, is_vert_4tap = 0;
// +1 to compensate for dividing the filter coeffs by 2
const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
const __m256i round_const =
@@ -146,7 +204,7 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -168,195 +226,389 @@ void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
(void)filter_params_x;
(void)subpel_x_q4;
- for (j = 0; j < w; j += 16) {
- const uint8_t *data = &src_ptr[j];
- __m256i src6;
- // Load lines a and b. Line a to lower 128, line b to upper 128
- {
- __m256i src_ab[7];
- __m256i src_a[7];
- src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
- for (int kk = 0; kk < 6; ++kk) {
- data += src_stride;
- src_a[kk + 1] =
- _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
- src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+  // Use the 4-tap path when the outermost filter taps are all zero.
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs[0], coeffs[3]), 0)))
+ is_vert_4tap = 1;
+
+ if (is_vert_4tap) {
+ const int fo_vert = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src4;
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[4];
+ __m256i src_a[5];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 4; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] =
+ _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src4 = src_a[4];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+
+ s[3] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
}
- src6 = src_a[6];
- s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
- s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
- s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
- s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
- s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
- s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
- }
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[(i + 7) * src_stride + j];
- const __m256i src7 =
- _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
- const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[(i + 5) * src_stride + j];
+ const __m256i src5 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_45a = _mm256_permute2x128_si256(src4, src5, 0x20);
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + src_stride)));
- const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
+ src4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(src5, src4, 0x20);
- s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
- s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+ s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
- __m256i res_lo = convolve_lowbd(s, coeffs);
+ __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
- res_lo = _mm256_add_epi16(res_lo, offset_const_1);
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
- const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
- const __m256i res_lo_0_shift =
- _mm256_slli_epi32(res_lo_0_32b, left_shift);
- const __m256i res_lo_0_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
- const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
- const __m256i res_lo_1_shift =
- _mm256_slli_epi32(res_lo_1_32b, left_shift);
- const __m256i res_lo_1_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
- const __m256i res_lo_round =
- _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
- const __m256i res_lo_unsigned =
- _mm256_add_epi16(res_lo_round, offset_const_2);
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
- if (w - j < 16) {
- if (do_average) {
- const __m256i data_ref_0 = load_line2_avx2(
- &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+ if (w - j < 16) {
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+ &wt, use_dist_wtd_comp_avg);
- const __m256i round_result = convolve_rounding(
- &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
- const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
- if (w - j > 4) {
- _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
- _mm_storel_epi64(
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
} else {
- *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
- _mm_cvtsi128_si32(res_0);
- *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
- _mm_cvtsi128_si32(res_1);
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
}
} else {
- const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
- const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_1);
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
+
+ const __m256i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi =
+ convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 =
+ _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+ res_hi_0);
+
+ const __m128i res_hi_1 =
+ _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+ res_hi_1);
+ }
+ }
+ s[0] = s[1];
+ s[1] = s[2];
+
+ s[3] = s[4];
+ s[4] = s[5];
+ }
+ }
+ } else {
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[7];
+ __m256i src_a[7];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 6; ++kk) {
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] =
+ _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
}
- } else {
- __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+ src6 = src_a[6];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+ s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);
+ s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+ s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+ }
- res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[(i + 7) * src_stride + j];
+ const __m256i src7 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
- const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
- const __m256i res_hi_0_shift =
- _mm256_slli_epi32(res_hi_0_32b, left_shift);
- const __m256i res_hi_0_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
- const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
- const __m256i res_hi_1_shift =
- _mm256_slli_epi32(res_hi_1_32b, left_shift);
- const __m256i res_hi_1_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
- const __m256i res_hi_round =
- _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+ __m256i res_lo = convolve_lowbd(s, coeffs);
- const __m256i res_hi_unsigned =
- _mm256_add_epi16(res_hi_round, offset_const_2);
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);
- if (do_average) {
- const __m256i data_ref_0_lo = load_line2_avx2(
- &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
- const __m256i data_ref_0_hi =
- load_line2_avx2(&dst[i * dst_stride + j + 8],
- &dst[i * dst_stride + j + 8 + dst_stride]);
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
- const __m256i comp_avg_res_lo =
- comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);
- const __m256i comp_avg_res_hi =
- comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
- const __m256i round_result_lo = convolve_rounding(
- &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ if (w - j < 16) {
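+          // Fewer than 16 columns remain, so only the low-half results
+          // (res_lo) are needed for this block.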
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_lo_unsigned,
+ &wt, use_dist_wtd_comp_avg);
- const __m256i round_result_hi = convolve_rounding(
- &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
- const __m256i res_8 =
- _mm256_packus_epi16(round_result_lo, round_result_hi);
- const __m128i res_0 = _mm256_castsi256_si128(res_8);
- const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
- _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
- _mm_store_si128(
- (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ if (w - j > 4) {
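+              // At least 8 pixels wide: store 64 bits per row. The else
+              // branch below handles 4-wide blocks with 32-bit stores.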
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])),
+ res_1);
+ } else {
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
} else {
- const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+ __m256i res_hi = convolve_lowbd(s + 4, coeffs);
- const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
- res_lo_1);
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
- const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
- _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0);
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
- const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1);
- _mm_store_si128(
- (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1);
+ const __m256i comp_avg_res_lo = comp_avg(
+ &data_ref_0_lo, &res_lo_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i comp_avg_res_hi = comp_avg(
+ &data_ref_0_hi, &res_hi_unsigned, &wt, use_dist_wtd_comp_avg);
+
+ const __m256i round_result_lo =
+ convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi =
+ convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 =
+ _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]),
+ res_hi_0);
+
+ const __m128i res_hi_1 =
+ _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]),
+ res_hi_1);
+ }
}
- }
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
}
}
}
-void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_avx2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
- int im_h = h + filter_params_y->taps - 1;
+
int im_stride = 8;
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ int i, is_horiz_4tap = 0, is_vert_4tap = 0;
const __m256i wt = unpack_weights_avx2(conv_params);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -364,18 +616,9 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int rounding_shift =
2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
- __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];
assert(conv_params->round_0 > 0);
- filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
- prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
const __m256i round_const_h = _mm256_set1_epi16(
((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
@@ -385,9 +628,29 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
(1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
- for (j = 0; j < w; j += 8) {
- /* Horizontal filter */
- {
+ __m256i filt[4], coeffs_x[4], coeffs_y[4];
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  // Check whether the horizontal filter is effectively 4-tap (outer tap
+  // pairs are zero)
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_x[0], coeffs_x[3]), 0)))
+ is_horiz_4tap = 1;
+
+  // Check whether the vertical filter is effectively 4-tap (outer tap
+  // pairs are zero)
+ if (!(_mm256_extract_epi32(_mm256_or_si256(coeffs_y[0], coeffs_y[3]), 0)))
+ is_vert_4tap = 1;
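+  // coeffs[0] carries the duplicated (tap0, tap1) pair and coeffs[3] the
+  // (tap6, tap7) pair, so the low 32 bits of their OR are zero exactly when
+  // all four outer taps are zero and the filter degenerates to 4 taps.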
+
+ if (is_horiz_4tap) {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
const uint8_t *src_h = src_ptr + j;
for (i = 0; i < im_h; i += 2) {
__m256i data =
@@ -396,49 +659,59 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
data = _mm256_inserti128_si256(
data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
src_h += (src_stride << 1);
- __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs_x + 1, filt);
res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
round_shift_h);
_mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
}
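+      // Shared 8-tap vertical pass with distance-weighted averaging; this
+      // macro is reused by the other filter branches below.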
+ DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
}
+ } else if (is_vert_4tap) {
+ int im_h = h + 3;
+ const int fo_vert = 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ const uint8_t *src_h = src_ptr + j;
+ DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
- /* Vertical filter */
- {
+ /* Vertical filter */
+ __m256i s[6];
__m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
__m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
__m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
__m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
- __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
- __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
s[0] = _mm256_unpacklo_epi16(s0, s1);
s[1] = _mm256_unpacklo_epi16(s2, s3);
- s[2] = _mm256_unpacklo_epi16(s4, s5);
- s[4] = _mm256_unpackhi_epi16(s0, s1);
- s[5] = _mm256_unpackhi_epi16(s2, s3);
- s[6] = _mm256_unpackhi_epi16(s4, s5);
+ s[3] = _mm256_unpackhi_epi16(s0, s1);
+ s[4] = _mm256_unpackhi_epi16(s2, s3);
for (i = 0; i < h; i += 2) {
const int16_t *data = &im_block[i * im_stride];
- const __m256i s6 =
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
- const __m256i s7 =
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+ const __m256i s4 =
+ _mm256_loadu_si256((__m256i *)(data + 4 * im_stride));
+ const __m256i s5 =
+ _mm256_loadu_si256((__m256i *)(data + 5 * im_stride));
- s[3] = _mm256_unpacklo_epi16(s6, s7);
- s[7] = _mm256_unpackhi_epi16(s6, s7);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ s[5] = _mm256_unpackhi_epi16(s4, s5);
- const __m256i res_a = convolve(s, coeffs_y);
+ const __m256i res_a = convolve_4tap(s, coeffs_y + 1);
const __m256i res_a_round = _mm256_sra_epi32(
_mm256_add_epi32(res_a, round_const_v), round_shift_v);
if (w - j > 4) {
- const __m256i res_b = convolve(s + 4, coeffs_y);
+ const __m256i res_b = convolve_4tap(s + 3, coeffs_y + 1);
const __m256i res_b_round = _mm256_sra_epi32(
_mm256_add_epi32(res_b, round_const_v), round_shift_v);
const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);
@@ -448,8 +721,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m256i data_ref_0 =
load_line2_avx2(&dst[i * dst_stride + j],
&dst[i * dst_stride + j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+ &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -479,8 +752,8 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
load_line2_avx2(&dst[i * dst_stride + j],
&dst[i * dst_stride + j + dst_stride]);
- const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned,
+ &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -504,25 +777,36 @@ void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
res_1);
}
}
-
s[0] = s[1];
s[1] = s[2];
- s[2] = s[3];
-
+ s[3] = s[4];
s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
}
}
+ } else {
+ int im_h = h + filter_params_y->taps - 1;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ const uint8_t *src_h = src_ptr + j;
+ DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP;
+
+ DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP;
+ }
}
}
-void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_copy_avx2(
+ const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
@@ -535,7 +819,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
const __m128i left_shift = _mm_cvtsi32_si128(bits);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const __m256i wt = unpack_weights_avx2(conv_params);
const __m256i zero = _mm256_setzero_si256();
@@ -562,7 +846,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
_mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -600,7 +884,7 @@ void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
const __m256i data_ref_0 = load_line2_avx2(
&dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
const __m256i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m256i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/jnt_convolve_sse2.c b/libaom/av1/common/x86/jnt_convolve_sse2.c
index 7f5677b..641cd02 100644
--- a/libaom/av1/common/x86/jnt_convolve_sse2.c
+++ b/libaom/av1/common/x86/jnt_convolve_sse2.c
@@ -16,12 +16,12 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
-void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_x_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
const int dst_stride = conv_params->dst_stride;
@@ -37,7 +37,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i wt1 = _mm_set1_epi16(w1);
const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -77,7 +77,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -134,7 +134,7 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -150,12 +150,12 @@ void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
}
}
-void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_y_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
const int bd = 8;
CONV_BUF_TYPE *dst = conv_params->dst;
const int dst_stride = conv_params->dst_stride;
@@ -167,7 +167,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const int offset_0 =
bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
@@ -225,7 +225,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -254,7 +254,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -331,7 +331,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -360,7 +360,7 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
@@ -384,12 +384,12 @@ void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
}
}
-void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
- int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
@@ -402,7 +402,7 @@ void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
const __m128i zero = _mm_setzero_si128();
@@ -594,7 +594,7 @@ void av1_jnt_convolve_2d_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/jnt_convolve_ssse3.c b/libaom/av1/common/x86/jnt_convolve_ssse3.c
index 8227727..9aeab29 100644
--- a/libaom/av1/common/x86/jnt_convolve_ssse3.c
+++ b/libaom/av1/common/x86/jnt_convolve_ssse3.c
@@ -16,12 +16,11 @@
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
-void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst0, int dst_stride0, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params) {
+void av1_dist_wtd_convolve_2d_ssse3(
+ const uint8_t *src, int src_stride, uint8_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params) {
CONV_BUF_TYPE *dst = conv_params->dst;
int dst_stride = conv_params->dst_stride;
const int bd = 8;
@@ -34,7 +33,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
const int do_average = conv_params->do_average;
- const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg;
const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
const __m128i zero = _mm_setzero_si128();
@@ -211,7 +210,7 @@ void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
_mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
const __m128i comp_avg_res =
- comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg);
const __m128i round_result = convolve_rounding(
&comp_avg_res, &offset_const, &rounding_const, rounding_shift);
diff --git a/libaom/av1/common/x86/warp_plane_sse4.c b/libaom/av1/common/x86/warp_plane_sse4.c
index b810cea..4532d17 100644
--- a/libaom/av1/common/x86/warp_plane_sse4.c
+++ b/libaom/av1/common/x86/warp_plane_sse4.c
@@ -577,7 +577,7 @@ static INLINE void store_vertical_filter_output(
__m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
const __m128i p_16 = _mm_loadl_epi64(p);
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
const __m128i shifted_32 =
@@ -610,7 +610,7 @@ static INLINE void store_vertical_filter_output(
(__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
const __m128i p4_16 = _mm_loadl_epi64(p4);
- if (conv_params->use_jnt_comp_avg) {
+ if (conv_params->use_dist_wtd_comp_avg) {
const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
const __m128i shifted_32 =
diff --git a/libaom/av1/common/x86/wiener_convolve_avx2.c b/libaom/av1/common/x86/wiener_convolve_avx2.c
index 1f13e2f..87a6e12 100644
--- a/libaom/av1/common/x86/wiener_convolve_avx2.c
+++ b/libaom/av1/common/x86/wiener_convolve_avx2.c
@@ -17,7 +17,6 @@
#include "av1/common/convolve.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
@@ -26,207 +25,236 @@
// on the left.
// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
-
-// Exploiting the range of wiener filter coefficients,
-// horizontal filtering can be done in 16 bit intermediate precision.
-// The details are as follows :
-// Consider the horizontal wiener filter coefficients of the following form :
-// [C0, C1, C2, 2^(FILTER_BITS) -2 * (C0 + C1 + C2), C2, C1, C0]
-// Subtracting 2^(FILTER_BITS) from the centre tap we get the following :
-// [C0, C1, C2, -2 * (C0 + C1 + C2), C2, C1, C0]
-// The sum of the product "C0 * p0 + C1 * p1 + C2 * p2 -2 * (C0 + C1 + C2) * p3
-// + C2 * p4 + C1 * p5 + C0 * p6" would be in the range of signed 16 bit
-// precision. Finally, after rounding the above result by round_0, we multiply
-// the centre pixel by 2^(FILTER_BITS - round_0) and add it to get the
-// horizontal filter output.
-
void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h,
const ConvolveParams *conv_params) {
+ const int bd = 8;
assert(x_step_q4 == 16 && y_step_q4 == 16);
assert(!(w & 7));
(void)x_step_q4;
(void)y_step_q4;
- DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + SUBPEL_TAPS) * 8]);
- int im_h = h + SUBPEL_TAPS - 2;
- int im_stride = 8;
- memset(im_block + (im_h * im_stride), 0, MAX_SB_SIZE);
- int i, j;
- const int center_tap = (SUBPEL_TAPS - 1) / 2;
+ DECLARE_ALIGNED(32, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+ int intermediate_height = h + SUBPEL_TAPS - 2;
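+  // Zero the memory just past the last intermediate row so any bottom-edge
+  // over-read by the vertical filter stays well defined.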
+ memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
- __m256i filt[4], coeffs_h[4], coeffs_v[4], filt_center;
-
- assert(conv_params->round_0 > 0);
-
- filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
- filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
- filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
- filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
-
- filt_center = _mm256_load_si256((__m256i const *)filt_center_global_avx2);
-
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)filter_x);
- const __m256i filter_coeffs_x = _mm256_broadcastsi128_si256(coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- coeffs_h[0] =
- _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0200u));
- // coeffs 2 3 2 3 2 3 2 3
- coeffs_h[1] =
- _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0604u));
- // coeffs 4 5 4 5 4 5 4 5
- coeffs_h[2] =
- _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0a08u));
- // coeffs 6 7 6 7 6 7 6 7
- coeffs_h[3] =
- _mm256_shuffle_epi8(filter_coeffs_x, _mm256_set1_epi16(0x0e0cu));
-
- const __m256i round_const_h =
- _mm256_set1_epi16((1 << (conv_params->round_0 - 1)));
- const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i zero_128 = _mm_setzero_si128();
+ const __m256i zero_256 = _mm256_setzero_si256();
// Add an offset to account for the "add_src" part of the convolve function.
- const __m128i zero_128 = _mm_setzero_si128();
- const __m128i offset_0 = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
- const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset_0);
-
- const __m256i filter_coeffs_y = _mm256_broadcastsi128_si256(coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- coeffs_v[0] = _mm256_shuffle_epi32(filter_coeffs_y, 0x00);
- // coeffs 2 3 2 3 2 3 2 3
- coeffs_v[1] = _mm256_shuffle_epi32(filter_coeffs_y, 0x55);
- // coeffs 4 5 4 5 4 5 4 5
- coeffs_v[2] = _mm256_shuffle_epi32(filter_coeffs_y, 0xaa);
- // coeffs 6 7 6 7 6 7 6 7
- coeffs_v[3] = _mm256_shuffle_epi32(filter_coeffs_y, 0xff);
-
- const __m256i round_const_v =
- _mm256_set1_epi32((1 << (conv_params->round_1 - 1)));
- const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
-
- for (j = 0; j < w; j += 8) {
- for (i = 0; i < im_h; i += 2) {
- __m256i data = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
-
- // Load the next line
- if (i + 1 < im_h)
- data = _mm256_inserti128_si256(
- data,
- _mm_loadu_si128(
- (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
- 1);
-
- __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
-
- res =
- _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
-
- __m256i data_0 = _mm256_shuffle_epi8(data, filt_center);
-
- // multiply the center pixel by 2^(FILTER_BITS - round_0) and add it to
- // the result
- data_0 = _mm256_slli_epi16(data_0, FILTER_BITS - conv_params->round_0);
- res = _mm256_add_epi16(res, data_0);
-
- _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+
+ const __m256i clamp_low = zero_256;
+ const __m256i clamp_high =
+ _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
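+  // Horizontal-filter outputs are clamped to [0, WIENER_CLAMP_LIMIT - 1]
+  // before being stored to the temp buffer.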
+
+ /* Horizontal filter */
+ {
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
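+    // Each register repeats one tap pair across all lanes so that
+    // _mm256_madd_epi16 can multiply-accumulate adjacent pixel pairs.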
+
+ const __m256i round_const = _mm256_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+ for (int i = 0; i < intermediate_height; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint8_t *data_ij = src_ptr + i * src_stride + j;
+
+ // Load 8-bit src data
+ const __m128i data_0 = xx_loadu_128(data_ij + 0);
+ const __m128i data_1 = xx_loadu_128(data_ij + 1);
+ const __m128i data_2 = xx_loadu_128(data_ij + 2);
+ const __m128i data_3 = xx_loadu_128(data_ij + 3);
+ const __m128i data_4 = xx_loadu_128(data_ij + 4);
+ const __m128i data_5 = xx_loadu_128(data_ij + 5);
+ const __m128i data_6 = xx_loadu_128(data_ij + 6);
+ const __m128i data_7 = xx_loadu_128(data_ij + 7);
+
+ // (Zero-)Extend 8-bit data to 16-bit data
+ const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
+ const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
+ const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
+ const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
+ const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
+ const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
+ const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
+ const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
+
+ // Multiply src data by filter coeffs and sum pairs
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+        // Calculate the scalar product for even and odd indices separately,
+        // widening to 32-bit precision
+ const __m256i res_even_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+ const __m256i res_odd_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+
+ const __m256i res_even = _mm256_srai_epi32(
+ _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+ const __m256i res_odd = _mm256_srai_epi32(
+ _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+ // Reduce to 16-bit precision and pack even- and odd-index results
+ // back into one register. The _mm256_packs_epi32 intrinsic returns
+ // a register with the pixels ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+ const __m256i res_clamped =
+ _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
+
+ // Store in a temporary array
+ yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
+ }
}
+ }
- /* Vertical filter */
- {
- __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
- __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
- __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
- __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
- __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
- __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
-
- __m256i s[8];
- s[0] = _mm256_unpacklo_epi16(src_0, src_1);
- s[1] = _mm256_unpacklo_epi16(src_2, src_3);
- s[2] = _mm256_unpacklo_epi16(src_4, src_5);
-
- s[4] = _mm256_unpackhi_epi16(src_0, src_1);
- s[5] = _mm256_unpackhi_epi16(src_2, src_3);
- s[6] = _mm256_unpackhi_epi16(src_4, src_5);
-
- for (i = 0; i < h - 1; i += 2) {
- const int16_t *data = &im_block[i * im_stride];
-
- const __m256i s6 =
- _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
- const __m256i s7 =
- _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
-
- s[3] = _mm256_unpacklo_epi16(s6, s7);
- s[7] = _mm256_unpackhi_epi16(s6, s7);
-
- __m256i res_a = convolve(s, coeffs_v);
- __m256i res_b = convolve(s + 4, coeffs_v);
-
- const __m256i res_a_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_a, round_const_v), round_shift_v);
- const __m256i res_b_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_b, round_const_v), round_shift_v);
-
- /* rounding code */
- // 16 bit conversion
- const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
- // 8 bit conversion and saturation to uint8
- const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
-
- const __m128i res_0 = _mm256_castsi256_si128(res_8b);
- const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
-
- // Store values into the destination buffer
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
- __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
-
- _mm_storel_epi64(p_0, res_0);
- _mm_storel_epi64(p_1, res_1);
-
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
-
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
- }
- if (h - i) {
- s[0] = _mm256_permute2x128_si256(s[0], s[4], 0x20);
- s[1] = _mm256_permute2x128_si256(s[1], s[5], 0x20);
- s[2] = _mm256_permute2x128_si256(s[2], s[6], 0x20);
-
- const int16_t *data = &im_block[i * im_stride];
- const __m128i s6_ = _mm_loadu_si128((__m128i *)(data + 6 * im_stride));
- const __m128i s7_ = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));
-
- __m128i s3 = _mm_unpacklo_epi16(s6_, s7_);
- __m128i s7 = _mm_unpackhi_epi16(s6_, s7_);
-
- s[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s7, 1);
- __m256i convolveres = convolve(s, coeffs_v);
-
- const __m256i res_round = _mm256_sra_epi32(
- _mm256_add_epi32(convolveres, round_const_v), round_shift_v);
-
- /* rounding code */
- // 16 bit conversion
- __m128i reslo = _mm256_castsi256_si128(res_round);
- __m128i reshi = _mm256_extracti128_si256(res_round, 1);
- const __m128i res_16bit = _mm_packus_epi32(reslo, reshi);
-
- // 8 bit conversion and saturation to uint8
- const __m128i res_8b = _mm_packus_epi16(res_16bit, res_16bit);
- __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
- _mm_storel_epi64(p_0, res_8b);
+ /* Vertical filter */
+ {
+ // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
+ const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+ // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
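+    // As in the horizontal pass, tap pairs are repeated across lanes so each
+    // _mm256_madd_epi16 accumulates two vertically adjacent rows.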
+
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
+
+ // Load 16-bit data from the output of the horizontal filter in
+ // which the pixels are ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);
+ const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
+ const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
+ const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
+ const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
+ const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
+ const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
+ const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
+
+        // Filter the even indices, widening to 32-bit precision
+ const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
+ const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+ const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+ const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+ const __m256i res_even = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+        // Filter the odd indices, widening to 32-bit precision
+ const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+ const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+ const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+ const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ const __m256i res_odd = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+ // Pixels are currently in the following order:
+ // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+ // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
+ //
+ // Rearrange the pixels into the following order:
+ // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
+ // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+ const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+ const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ // Reduce to 16-bit precision and pack into the correct order:
+ // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+ const __m256i res_16bit =
+ _mm256_packs_epi32(res_lo_round, res_hi_round);
+
+ // Reduce to 8-bit precision. This messes up the order:
+ // [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
+ // [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
+ const __m256i res_8bit =
+ _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);
+
+ // Swap the two central 32-bit values to get the order:
+ // [ - - - - - - - - - - - - - - - - ]
+ // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
+ const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);
+
+ // Store the lower 128-bit lane in the dst array
+ xx_storeu_128(dst + i * dst_stride + j,
+ _mm256_castsi256_si128(res_8bit2));
}
}
}
diff --git a/libaom/av1/decoder/decodeframe.c b/libaom/av1/decoder/decodeframe.c
index a30b267..b7fc370 100644
--- a/libaom/av1/decoder/decodeframe.c
+++ b/libaom/av1/decoder/decodeframe.c
@@ -64,6 +64,9 @@
#define ACCT_STR __func__
+#define AOM_MIN_THREADS_PER_TILE 1
+#define AOM_MAX_THREADS_PER_TILE 2
+
// This is needed by ext_tile related unit tests.
#define EXT_TILE_DEBUG 1
#define MC_TEMP_BUF_PELS \
@@ -153,13 +156,10 @@ static void inverse_transform_block(MACROBLOCKD *xd, int plane,
const TX_SIZE tx_size, uint8_t *dst,
int stride, int reduced_tx_set) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- tran_low_t *const dqcoeff = pd->dqcoeff;
+ tran_low_t *const dqcoeff = pd->dqcoeff_block + xd->cb_offset[plane];
eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
uint16_t scan_line = eob_data->max_scan_line;
uint16_t eob = eob_data->eob;
-
- memcpy(dqcoeff, pd->dqcoeff_block + xd->cb_offset[plane],
- (scan_line + 1) * sizeof(dqcoeff[0]));
av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst, stride,
eob, reduced_tx_set);
memset(dqcoeff, 0, (scan_line + 1) * sizeof(dqcoeff[0]));
@@ -696,27 +696,28 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
assert(bw < 8 || bh < 8);
ConvolveParams conv_params = get_conv_params_no_round(
0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
- conv_params.use_jnt_comp_avg = 0;
+ conv_params.use_dist_wtd_comp_avg = 0;
struct buf_2d *const dst_buf = &pd->dst;
uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
ref = 0;
- const RefBuffer *ref_buf =
- &cm->current_frame
- .frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+ const RefCntBuffer *ref_buf =
+ get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
+ const struct scale_factors *ref_scale_factors =
+ get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
- pd->pre[ref].buf0 = (plane == 1) ? ref_buf->buf->buf.u_buffer
- : ref_buf->buf->buf.v_buffer;
+ pd->pre[ref].buf0 =
+ (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer;
pd->pre[ref].buf =
- pd->pre[ref].buf0 +
- scaled_buffer_offset(pre_x, pre_y, ref_buf->buf->buf.uv_stride,
- &ref_buf->sf);
- pd->pre[ref].width = ref_buf->buf->buf.uv_crop_width;
- pd->pre[ref].height = ref_buf->buf->buf.uv_crop_height;
- pd->pre[ref].stride = ref_buf->buf->buf.uv_stride;
+ pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+ ref_buf->buf.uv_stride,
+ ref_scale_factors);
+ pd->pre[ref].width = ref_buf->buf.uv_crop_width;
+ pd->pre[ref].height = ref_buf->buf.uv_crop_height;
+ pd->pre[ref].stride = ref_buf->buf.uv_stride;
const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+ is_intrabc ? &cm->sf_identity : ref_scale_factors;
struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
const MV mv = this_mbmi->mv[ref].as_mv;
@@ -736,7 +737,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
&scaled_mv, &subpel_x_mv, &subpel_y_mv);
pre = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
src_stride = pre_buf->stride;
- highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ highbd = is_cur_buf_hbd(xd);
extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv,
subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref],
&pre, &src_stride);
@@ -769,7 +770,7 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
int src_stride[2];
for (ref = 0; ref < 1 + is_compound; ++ref) {
const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
const MV mv = mi->mv[ref].as_mv;
PadBlock block;
@@ -780,9 +781,9 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf,
&subpel_params[ref], bw, bh, &block, mi_x, mi_y,
&scaled_mv, &subpel_x_mv, &subpel_y_mv);
- pre[ref] = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+ pre[ref] = pre_buf->buf0 + (int64_t)block.y0 * pre_buf->stride + block.x0;
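+      // The (int64_t) cast prevents 32-bit overflow of block.y0 * stride on
+      // large planes.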
src_stride[ref] = pre_buf->stride;
- highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ highbd = is_cur_buf_hbd(xd);
WarpTypesAllowed warp_types;
warp_types.global_warp_allowed = is_global[ref];
@@ -800,13 +801,13 @@ static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
ConvolveParams conv_params = get_conv_params_no_round(
0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
- av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
- &conv_params.bck_offset,
- &conv_params.use_jnt_comp_avg, is_compound);
+ av1_dist_wtd_comp_weight_assign(
+ cm, mi, 0, &conv_params.fwd_offset, &conv_params.bck_offset,
+ &conv_params.use_dist_wtd_comp_avg, is_compound);
for (ref = 0; ref < 1 + is_compound; ++ref) {
const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
WarpTypesAllowed warp_types;
warp_types.global_warp_allowed = is_global[ref];
warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
@@ -855,7 +856,7 @@ static void dec_build_inter_predictors_for_planes(const AV1_COMMON *cm,
static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,
MACROBLOCKD *xd, int mi_row,
- int mi_col, BUFFER_SET *ctx,
+ int mi_col, const BUFFER_SET *ctx,
BLOCK_SIZE bsize) {
dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);
@@ -870,7 +871,7 @@ static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,
static void dec_build_inter_predictors_sbuv(const AV1_COMMON *cm,
MACROBLOCKD *xd, int mi_row,
- int mi_col, BUFFER_SET *ctx,
+ int mi_col, const BUFFER_SET *ctx,
BLOCK_SIZE bsize) {
dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
MAX_MB_PLANE - 1);
@@ -1015,7 +1016,7 @@ static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
int len = sizeof(uint16_t);
dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
dst_buf1[1] =
@@ -1063,11 +1064,13 @@ static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
assert(frame == INTRA_FRAME);
assert(ref == 0);
} else {
- RefBuffer *ref_buf = &cm->current_frame.frame_refs[frame - LAST_FRAME];
+ const RefCntBuffer *ref_buf = get_ref_frame_buf(cm, frame);
+ const struct scale_factors *ref_scale_factors =
+ get_ref_scale_factors_const(cm, frame);
- xd->block_refs[ref] = ref_buf;
- av1_setup_pre_planes(xd, ref, &ref_buf->buf->buf, mi_row, mi_col,
- &ref_buf->sf, num_planes);
+ xd->block_ref_scale_factors[ref] = ref_scale_factors;
+ av1_setup_pre_planes(xd, ref, &ref_buf->buf, mi_row, mi_col,
+ ref_scale_factors, num_planes);
}
}
@@ -2238,7 +2241,6 @@ static void setup_quantization(AV1_COMMON *const cm,
cm->v_dc_delta_q = 0;
cm->v_ac_delta_q = 0;
}
- cm->dequant_bit_depth = seq_params->bit_depth;
cm->using_qmatrix = aom_rb_read_bit(rb);
if (cm->using_qmatrix) {
cm->qm_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
@@ -2374,7 +2376,7 @@ static void setup_buffer_pool(AV1_COMMON *cm) {
if (aom_realloc_frame_buffer(
&cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
seq_params->subsampling_y, seq_params->use_highbitdepth,
- AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+ AOM_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
&cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) {
unlock_buffer_pool(pool);
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
@@ -2438,17 +2440,28 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
int width, height;
int found = 0;
int has_valid_ref_frame = 0;
- for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
if (aom_rb_read_bit(rb)) {
- YV12_BUFFER_CONFIG *const buf = &cm->current_frame.frame_refs[i].buf->buf;
- width = buf->y_crop_width;
- height = buf->y_crop_height;
- cm->render_width = buf->render_width;
- cm->render_height = buf->render_height;
- setup_superres(cm, rb, &width, &height);
- resize_context_buffers(cm, width, height);
- found = 1;
- break;
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i);
+      // This will never be NULL in a normal stream, as streams are required
+      // to have a shown keyframe before any inter frames, which would refresh
+      // all the reference buffers. However, it might be NULL if we are
+      // starting in the middle of a stream, and static analysis reports an
+      // error if we do not add a NULL check here.
+ if (ref_buf == NULL) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid condition: invalid reference buffer");
+ } else {
+ const YV12_BUFFER_CONFIG *const buf = &ref_buf->buf;
+ width = buf->y_crop_width;
+ height = buf->y_crop_height;
+ cm->render_width = buf->render_width;
+ cm->render_height = buf->render_height;
+ setup_superres(cm, rb, &width, &height);
+ resize_context_buffers(cm, width, height);
+ found = 1;
+ break;
+ }
}
}
@@ -2469,20 +2482,20 @@ static void setup_frame_size_with_refs(AV1_COMMON *cm,
// Check to make sure at least one of frames that this frame references
// has valid dimensions.
- for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- RefBuffer *const ref_frame = &cm->current_frame.frame_refs[i];
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
has_valid_ref_frame |=
- valid_ref_frame_size(ref_frame->buf->buf.y_crop_width,
- ref_frame->buf->buf.y_crop_height, width, height);
+ valid_ref_frame_size(ref_frame->buf.y_crop_width,
+ ref_frame->buf.y_crop_height, width, height);
}
if (!has_valid_ref_frame)
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Referenced frame has invalid size");
- for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- RefBuffer *const ref_frame = &cm->current_frame.frame_refs[i];
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ const RefCntBuffer *const ref_frame = get_ref_frame_buf(cm, i);
if (!valid_ref_frame_img_fmt(
- ref_frame->buf->buf.bit_depth, ref_frame->buf->buf.subsampling_x,
- ref_frame->buf->buf.subsampling_y, seq_params->bit_depth,
+ ref_frame->buf.bit_depth, ref_frame->buf.subsampling_x,
+ ref_frame->buf.subsampling_y, seq_params->bit_depth,
seq_params->subsampling_x, seq_params->subsampling_y))
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Referenced frame has incompatible color format");
@@ -2716,9 +2729,10 @@ static const uint8_t *get_ls_tile_buffers(
const int tile_col_size_bytes = pbi->tile_col_size_bytes;
const int tile_size_bytes = pbi->tile_size_bytes;
+ int tile_width, tile_height;
+ av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
const int tile_copy_mode =
- ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) ? 1
- : 0;
+ ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 1 : 0;
// Read tile column sizes for all columns (we need the last tile buffer)
for (int c = 0; c < tile_cols; ++c) {
const int is_last = c == tile_cols - 1;
@@ -3206,7 +3220,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
continue;
td->bit_reader = &tile_data->bit_reader;
- av1_zero(td->dqcoeff);
+ av1_zero(td->cb_buffer_base.dqcoeff);
av1_tile_init(&td->xd.tile, cm, row, col);
td->xd.current_qindex = cm->base_qindex;
setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size,
@@ -3220,7 +3234,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
td->bit_reader->accounting = NULL;
}
#endif
- av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+ av1_init_macroblockd(cm, &td->xd, NULL);
av1_init_above_context(cm, &td->xd, row);
// Initialise the tile context from the frame context
@@ -3277,7 +3291,7 @@ static void tile_worker_hook_init(AV1Decoder *const pbi,
int tile_col = tile_data->tile_info.tile_col;
td->bit_reader = &tile_data->bit_reader;
- av1_zero(td->dqcoeff);
+ av1_zero(td->cb_buffer_base.dqcoeff);
av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
td->xd.current_qindex = cm->base_qindex;
setup_bool_decoder(tile_buffer->data, thread_data->data_end,
@@ -3292,7 +3306,7 @@ static void tile_worker_hook_init(AV1Decoder *const pbi,
td->bit_reader->accounting = NULL;
}
#endif
- av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+ av1_init_macroblockd(cm, &td->xd, NULL);
td->xd.error_info = &thread_data->error_info;
av1_init_above_context(cm, &td->xd, tile_row);
@@ -3350,6 +3364,20 @@ static int tile_worker_hook(void *arg1, void *arg2) {
return !td->xd.corrupted;
}
+static INLINE int get_max_row_mt_workers_per_tile(AV1_COMMON *cm,
+ TileInfo tile) {
+  // NOTE: Currently the value of max workers is derived from the parse and
+  // decode times. As per the theoretical estimate, when the parse time and
+  // decode time account for equal shares of the total, the number of workers
+  // needed to parse + decode a tile cannot exceed 2.
+  // TODO(any): Modify this value if parsing is optimized in the future.
+ int sb_rows = av1_get_sb_rows_in_tile(cm, tile);
+ int max_workers =
+ sb_rows == 1 ? AOM_MIN_THREADS_PER_TILE : AOM_MAX_THREADS_PER_TILE;
+ return max_workers;
+}
+
// The caller must hold pbi->row_mt_mutex_ when calling this function.
// Returns 1 if either the next job is stored in *next_job_info or 1 is stored
// in *end_of_frame.
@@ -3380,8 +3408,8 @@ static int get_next_job_info(AV1Decoder *const pbi,
int min_threads_working = INT_MAX;
int max_mis_to_decode = 0;
int tile_row_idx, tile_col_idx;
- int tile_row = 0;
- int tile_col = 0;
+ int tile_row = -1;
+ int tile_col = -1;
memset(next_job_info, 0, sizeof(*next_job_info));
@@ -3429,7 +3457,9 @@ static int get_next_job_info(AV1Decoder *const pbi,
max_mis_to_decode = 0;
}
if (num_threads_working == min_threads_working &&
- num_mis_to_decode > max_mis_to_decode) {
+ num_mis_to_decode > max_mis_to_decode &&
+ num_threads_working <
+ get_max_row_mt_workers_per_tile(cm, tile_data->tile_info)) {
max_mis_to_decode = num_mis_to_decode;
tile_row = tile_row_idx;
tile_col = tile_col_idx;
@@ -3437,6 +3467,8 @@ static int get_next_job_info(AV1Decoder *const pbi,
}
}
}
+ // No job found to process
+ if (tile_row == -1 || tile_col == -1) return 0;
tile_data = pbi->tile_data + tile_row * cm->tile_cols + tile_col;
tile_info = tile_data->tile_info;
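The selection test above now has three clauses: a tile is picked only if it ties for the fewest active workers, leads on remaining work (mis to decode), and is still below its per-tile worker cap. A sketch of the predicate in isolation; the helper is hypothetical, with parameter names following the hunk:

    // Hypothetical restatement of the three-clause job-selection test.
    static int is_better_job(int threads_working, int min_threads_working,
                             int mis_to_decode, int max_mis_to_decode,
                             int per_tile_cap) {
      // per_tile_cap comes from get_max_row_mt_workers_per_tile().
      return threads_working == min_threads_working &&
             mis_to_decode > max_mis_to_decode &&
             threads_working < per_tile_cap;
    }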
@@ -3565,9 +3597,22 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
TileDataDec *const tile_data = cur_job_info->tile_data;
tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
allow_update_cdf);
-
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ tile_data->dec_row_mt_sync.num_threads_working++;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
// decode tile
parse_tile_row_mt(pbi, td, tile_data);
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+ tile_data->dec_row_mt_sync.num_threads_working--;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
} else {
break;
}
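Both adjustments to num_threads_working in this hunk follow the same guarded-counter pattern. A minimal sketch, assuming a hypothetical helper; pbi->row_mt_mutex_ and CONFIG_MULTITHREAD are as used above:

    // Hypothetical helper: adjust the per-tile worker count under the lock.
    static void adjust_threads_working(AV1Decoder *pbi, TileDataDec *tile_data,
                                       int delta) {
    #if CONFIG_MULTITHREAD
      pthread_mutex_lock(pbi->row_mt_mutex_);
    #endif
      tile_data->dec_row_mt_sync.num_threads_working += delta;
    #if CONFIG_MULTITHREAD
      pthread_mutex_unlock(pbi->row_mt_mutex_);
    #endif
    }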
@@ -3616,7 +3661,7 @@ static int row_mt_worker_hook(void *arg1, void *arg2) {
TileInfo tile_info = tile_data->tile_info;
av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
- av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+ av1_init_macroblockd(cm, &td->xd, NULL);
td->xd.error_info = &thread_data->error_info;
decode_tile_sb_row(pbi, td, tile_info, mi_row);
@@ -3825,7 +3870,7 @@ static void decode_mt_init(AV1Decoder *pbi) {
thread_data->error_info.setjmp = 0;
}
}
- const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+ const int use_highbd = cm->seq_params.use_highbitdepth;
const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {
DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
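The shift by use_highbd sizes the motion-compensation scratch buffer in bytes: one byte per pel for 8-bit streams, two when use_highbitdepth is set (pels stored as uint16_t). An equivalent spelling of the same arithmetic, for orientation only:

    const int bytes_per_pel = 1 << use_highbd;               // 1 or 2
    const int buf_size = MC_TEMP_BUF_PELS * bytes_per_pel;   // same value as above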
@@ -3956,6 +4001,7 @@ static void dec_alloc_cb_buf(AV1Decoder *pbi) {
av1_dec_free_cb_buf(pbi);
CHECK_MEM_ERROR(cm, pbi->cb_buffer_base,
aom_memalign(32, sizeof(*pbi->cb_buffer_base) * size));
+ memset(pbi->cb_buffer_base, 0, sizeof(*pbi->cb_buffer_base) * size);
pbi->cb_buffer_alloc_size = size;
}
}
@@ -4043,7 +4089,8 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
int tile_cols_start;
int tile_cols_end;
int tile_count_tg;
- int num_workers;
+ int num_workers = 0;
+ int max_threads;
const uint8_t *raw_data_end = NULL;
int max_sb_rows = 0;
@@ -4059,7 +4106,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
tile_cols_end = tile_cols;
}
tile_count_tg = end_tile - start_tile + 1;
- num_workers = pbi->max_threads;
+ max_threads = pbi->max_threads;
// No tiles to decode.
if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
@@ -4072,7 +4119,7 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
assert(tile_rows <= MAX_TILE_ROWS);
assert(tile_cols <= MAX_TILE_COLS);
assert(tile_count_tg > 0);
- assert(num_workers > 0);
+ assert(max_threads > 0);
assert(start_tile <= end_tile);
assert(start_tile >= 0 && end_tile < n_tiles);
@@ -4104,8 +4151,10 @@ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
max_sb_rows = AOMMAX(max_sb_rows,
av1_get_sb_rows_in_tile(cm, tile_data->tile_info));
+ num_workers += get_max_row_mt_workers_per_tile(cm, tile_data->tile_info);
}
}
+ num_workers = AOMMIN(num_workers, max_threads);
if (pbi->allocated_row_mt_sync_rows != max_sb_rows) {
for (int i = 0; i < n_tiles; ++i) {
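Worker budgeting therefore sums the per-tile caps and clamps the total to the thread limit. A hypothetical worked example, assuming AOM_MAX_THREADS_PER_TILE == 2 (as the comment in get_max_row_mt_workers_per_tile() suggests), four tiles each spanning more than one SB row, and pbi->max_threads == 6:

    int num_workers = 0;
    for (int i = 0; i < 4; ++i) num_workers += 2;  // per-tile cap of 2
    num_workers = AOMMIN(num_workers, 6);          // min(8, 6) == 6 workers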
@@ -4190,20 +4239,38 @@ void av1_read_film_grain_params(AV1_COMMON *cm,
if (!pars->update_parameters) {
// inherit parameters from a previous reference frame
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3);
- int buf_idx = cm->ref_frame_map[film_grain_params_ref_idx];
- if (buf_idx == INVALID_IDX) {
+ // Section 6.8.20: It is a requirement of bitstream conformance that
+ // film_grain_params_ref_idx is equal to ref_frame_idx[ j ] for some value
+ // of j in the range 0 to REFS_PER_FRAME - 1.
+ int found = 0;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ if (film_grain_params_ref_idx == cm->remapped_ref_idx[i]) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Invalid film grain reference idx %d. ref_frame_idx = "
+ "{%d, %d, %d, %d, %d, %d, %d}",
+ film_grain_params_ref_idx, cm->remapped_ref_idx[0],
+ cm->remapped_ref_idx[1], cm->remapped_ref_idx[2],
+ cm->remapped_ref_idx[3], cm->remapped_ref_idx[4],
+ cm->remapped_ref_idx[5], cm->remapped_ref_idx[6]);
+ }
+ RefCntBuffer *const buf = cm->ref_frame_map[film_grain_params_ref_idx];
+ if (buf == NULL) {
aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
"Invalid Film grain reference idx");
}
- if (!frame_bufs[buf_idx].film_grain_params_present) {
+ if (!buf->film_grain_params_present) {
aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
"Film grain reference parameters not available");
}
uint16_t random_seed = pars->random_seed;
- *pars = frame_bufs[buf_idx].film_grain_params; // inherit paramaters
- pars->random_seed = random_seed; // with new random seed
+  *pars = buf->film_grain_params;   // inherit parameters
+ pars->random_seed = random_seed; // with new random seed
return;
}
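A standalone sketch of the Section 6.8.20 conformance test added above: the signaled index must match one of the seven active remapped reference slots. The helper name is hypothetical:

    // Hypothetical helper mirroring the conformance loop above.
    static int is_valid_film_grain_ref(const AV1_COMMON *cm, int idx) {
      for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
        if (idx == cm->remapped_ref_idx[i]) return 1;  // == ref_frame_idx[i]
      }
      return 0;
    }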
@@ -4420,13 +4487,13 @@ void av1_read_timing_info_header(AV1_COMMON *cm,
cm->timing_info.equal_picture_interval =
aom_rb_read_bit(rb); // Equal picture interval bit
if (cm->timing_info.equal_picture_interval) {
- cm->timing_info.num_ticks_per_picture =
- aom_rb_read_uvlc(rb) + 1; // ticks per picture
- if (cm->timing_info.num_ticks_per_picture == 0) {
+ const uint32_t num_ticks_per_picture_minus_1 = aom_rb_read_uvlc(rb);
+ if (num_ticks_per_picture_minus_1 == UINT32_MAX) {
aom_internal_error(
&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
"num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1.");
}
+ cm->timing_info.num_ticks_per_picture = num_ticks_per_picture_minus_1 + 1;
}
}
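The reordering matters because aom_rb_read_uvlc() can return UINT32_MAX, whose + 1 wraps to 0 in uint32_t arithmetic; the old code caught the invalid value only through that wraparound, which unsigned-overflow sanitizers flag. A minimal sketch of the read-validate-increment pattern, with a hypothetical helper:

    // Hypothetical helper: validate the minus_1 element before incrementing.
    static uint32_t read_minus1_plus1(struct aom_read_bit_buffer *rb, int *err) {
      const uint32_t v_minus_1 = aom_rb_read_uvlc(rb);
      if (v_minus_1 == UINT32_MAX) {  // v_minus_1 + 1 would wrap to 0
        *err = 1;
        return 0;
      }
      return v_minus_1 + 1;
    }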
@@ -4505,7 +4572,7 @@ void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
seq_params->enable_warped_motion = 0;
seq_params->enable_dual_filter = 0;
seq_params->order_hint_info.enable_order_hint = 0;
- seq_params->order_hint_info.enable_jnt_comp = 0;
+ seq_params->order_hint_info.enable_dist_wtd_comp = 0;
seq_params->order_hint_info.enable_ref_frame_mvs = 0;
seq_params->force_screen_content_tools = 2; // SELECT_SCREEN_CONTENT_TOOLS
seq_params->force_integer_mv = 2; // SELECT_INTEGER_MV
@@ -4517,7 +4584,7 @@ void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
seq_params->enable_dual_filter = aom_rb_read_bit(rb);
seq_params->order_hint_info.enable_order_hint = aom_rb_read_bit(rb);
- seq_params->order_hint_info.enable_jnt_comp =
+ seq_params->order_hint_info.enable_dist_wtd_comp =
seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0;
seq_params->order_hint_info.enable_ref_frame_mvs =
seq_params->order_hint_info.enable_order_hint ? aom_rb_read_bit(rb) : 0;
@@ -4663,62 +4730,71 @@ static void read_global_motion(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
}
// Release the references to the frame buffers in cm->ref_frame_map and reset
-// all elements of cm->ref_frame_map to -1.
+// all elements of cm->ref_frame_map to NULL.
static void reset_ref_frame_map(AV1_COMMON *const cm) {
BufferPool *const pool = cm->buffer_pool;
- RefCntBuffer *const frame_bufs = pool->frame_bufs;
for (int i = 0; i < REF_FRAMES; i++) {
- decrease_ref_count(cm->ref_frame_map[i], frame_bufs, pool);
+ decrease_ref_count(cm->ref_frame_map[i], pool);
+ cm->ref_frame_map[i] = NULL;
}
- memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
}
// Generate next_ref_frame_map.
static void generate_next_ref_frame_map(AV1Decoder *const pbi) {
AV1_COMMON *const cm = &pbi->common;
BufferPool *const pool = cm->buffer_pool;
- RefCntBuffer *const frame_bufs = pool->frame_bufs;
lock_buffer_pool(pool);
// cm->next_ref_frame_map holds references to frame buffers. After storing a
// frame buffer index in cm->next_ref_frame_map, we need to increase the
// frame buffer's ref_count.
int ref_index = 0;
- for (int mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+ for (int mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) {
if (mask & 1) {
- cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+ cm->next_ref_frame_map[ref_index] = cm->cur_frame;
} else {
cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
}
- if (cm->next_ref_frame_map[ref_index] >= 0)
- ++frame_bufs[cm->next_ref_frame_map[ref_index]].ref_count;
+ if (cm->next_ref_frame_map[ref_index] != NULL)
+ ++cm->next_ref_frame_map[ref_index]->ref_count;
++ref_index;
}
for (; ref_index < REF_FRAMES; ++ref_index) {
cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
- if (cm->next_ref_frame_map[ref_index] >= 0)
- ++frame_bufs[cm->next_ref_frame_map[ref_index]].ref_count;
+ if (cm->next_ref_frame_map[ref_index] != NULL)
+ ++cm->next_ref_frame_map[ref_index]->ref_count;
}
unlock_buffer_pool(pool);
pbi->hold_ref_buf = 1;
}
+// For each set bit in the refresh_frame_flags bitmask, update the
+// corresponding reference frame id value and mark that frame as valid for
+// referencing.
+static void update_ref_frame_id(AV1_COMMON *const cm, int frame_id) {
+ assert(cm->seq_params.frame_id_numbers_present_flag);
+ int refresh_frame_flags = cm->current_frame.refresh_frame_flags;
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if ((refresh_frame_flags >> i) & 1) {
+ cm->ref_frame_id[i] = frame_id;
+ cm->valid_for_referencing[i] = 1;
+ }
+ }
+}
+
static void show_existing_frame_reset(AV1Decoder *const pbi,
int existing_frame_idx) {
AV1_COMMON *const cm = &pbi->common;
- BufferPool *const pool = cm->buffer_pool;
- RefCntBuffer *const frame_bufs = pool->frame_bufs;
assert(cm->show_existing_frame);
cm->current_frame.frame_type = KEY_FRAME;
- pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+ cm->current_frame.refresh_frame_flags = (1 << REF_FRAMES) - 1;
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- cm->current_frame.frame_refs[i].buf = NULL;
+ cm->remapped_ref_idx[i] = INVALID_IDX;
}
if (pbi->need_resync) {
@@ -4726,22 +4802,10 @@ static void show_existing_frame_reset(AV1Decoder *const pbi,
pbi->need_resync = 0;
}
- cm->cur_frame->intra_only = 1;
-
+ // Note that the displayed frame must be valid for referencing in order to
+ // have been selected.
if (cm->seq_params.frame_id_numbers_present_flag) {
- /* If bitmask is set, update reference frame id values and
- mark frames as valid for reference.
- Note that the displayed frame be valid for referencing
- in order to have been selected.
- */
- int refresh_frame_flags = pbi->refresh_frame_flags;
- int display_frame_id = cm->ref_frame_id[existing_frame_idx];
- for (int i = 0; i < REF_FRAMES; i++) {
- if ((refresh_frame_flags >> i) & 1) {
- cm->ref_frame_id[i] = display_frame_id;
- cm->valid_for_referencing[i] = 1;
- }
- }
+ update_ref_frame_id(cm, cm->ref_frame_id[existing_frame_idx]);
}
cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
@@ -4749,8 +4813,7 @@ static void show_existing_frame_reset(AV1Decoder *const pbi,
generate_next_ref_frame_map(pbi);
// Reload the adapted CDFs from when we originally coded this keyframe
- *cm->fc =
- frame_bufs[cm->next_ref_frame_map[existing_frame_idx]].frame_context;
+ *cm->fc = cm->next_ref_frame_map[existing_frame_idx]->frame_context;
}
static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
@@ -4758,16 +4821,18 @@ static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
int i;
// We have not stored any references to frame buffers in
- // cm->next_ref_frame_map, so we can directly reset it to all -1.
- memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+ // cm->next_ref_frame_map, so we can directly reset it to all NULL.
+ for (i = 0; i < REF_FRAMES; ++i) {
+ cm->next_ref_frame_map[i] = NULL;
+ }
lock_buffer_pool(cm->buffer_pool);
reset_ref_frame_map(cm);
assert(cm->cur_frame->ref_count == 1);
for (i = 0; i < FRAME_BUFFERS; ++i) {
- // Reset all unreferenced frame buffers. We can also reset cm->new_fb_idx
- // because we are the sole owner of cm->new_fb_idx.
- if (frame_bufs[i].ref_count > 0 && i != cm->new_fb_idx) {
+ // Reset all unreferenced frame buffers. We can also reset cm->cur_frame
+ // because we are the sole owner of cm->cur_frame.
+ if (frame_bufs[i].ref_count > 0 && &frame_bufs[i] != cm->cur_frame) {
continue;
}
frame_bufs[i].order_hint = 0;
@@ -4794,10 +4859,6 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
cm->last_frame_type = current_frame->frame_type;
- cm->last_intra_only = current_frame->intra_only;
-
- // NOTE: By default all coded frames to be used as a reference
- cm->is_reference_frame = 1;
if (seq_params->reduced_still_picture_hdr) {
cm->show_existing_frame = 0;
@@ -4812,7 +4873,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->error_resilient_mode = 1;
} else {
cm->show_existing_frame = aom_rb_read_bit(rb);
- cm->reset_decoder_state = 0;
+ pbi->reset_decoder_state = 0;
if (cm->show_existing_frame) {
if (pbi->sequence_header_changed) {
@@ -4822,7 +4883,11 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
// Show an existing frame directly.
const int existing_frame_idx = aom_rb_read_literal(rb, 3);
- const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
+ RefCntBuffer *const frame_to_show = cm->ref_frame_map[existing_frame_idx];
+ if (frame_to_show == NULL) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer does not contain a decoded frame");
+ }
if (seq_params->decoder_model_info_present_flag &&
cm->timing_info.equal_picture_interval == 0) {
av1_read_temporal_point_info(cm, rb);
@@ -4838,42 +4903,36 @@ static int read_uncompressed_header(AV1Decoder *pbi,
"Reference buffer frame ID mismatch");
}
lock_buffer_pool(pool);
- if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
- unlock_buffer_pool(pool);
- aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
- "Buffer %d does not contain a decoded frame",
- frame_to_show);
- }
+ assert(frame_to_show->ref_count > 0);
// cm->cur_frame should be the buffer referenced by the return value
// of the get_free_fb() call in av1_receive_compressed_data(), and
// generate_next_ref_frame_map() has not been called, so ref_count
// should still be 1.
assert(cm->cur_frame->ref_count == 1);
- // ref_cnt_fb() decrements ref_count directly rather than call
- // decrease_ref_count(). If cm->cur_frame->raw_frame_buffer
- // has already been allocated, it will not be released by ref_cnt_fb()!
+ // assign_frame_buffer_p() decrements ref_count directly rather than
+ // call decrease_ref_count(). If cm->cur_frame->raw_frame_buffer has
+ // already been allocated, it will not be released by
+ // assign_frame_buffer_p()!
assert(!cm->cur_frame->raw_frame_buffer.data);
- assign_frame_buffer(frame_bufs, &cm->new_fb_idx, frame_to_show);
- cm->cur_frame = &cm->buffer_pool->frame_bufs[cm->new_fb_idx];
- cm->reset_decoder_state =
- frame_bufs[frame_to_show].frame_type == KEY_FRAME;
+ assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+ pbi->reset_decoder_state = frame_to_show->frame_type == KEY_FRAME;
unlock_buffer_pool(pool);
cm->lf.filter_level[0] = 0;
cm->lf.filter_level[1] = 0;
cm->show_frame = 1;
- if (!frame_bufs[frame_to_show].showable_frame) {
+ if (!frame_to_show->showable_frame) {
aom_merge_corrupted_flag(&xd->corrupted, 1);
}
- if (cm->reset_decoder_state) frame_bufs[frame_to_show].showable_frame = 0;
+ if (pbi->reset_decoder_state) frame_to_show->showable_frame = 0;
- cm->film_grain_params = frame_bufs[frame_to_show].film_grain_params;
+ cm->film_grain_params = frame_to_show->film_grain_params;
- if (cm->reset_decoder_state) {
+ if (pbi->reset_decoder_state) {
show_existing_frame_reset(pbi, existing_frame_idx);
} else {
- pbi->refresh_frame_flags = 0;
+ current_frame->refresh_frame_flags = 0;
}
return 0;
@@ -4908,7 +4967,6 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->showable_frame = aom_rb_read_bit(rb);
}
cm->cur_frame->showable_frame = cm->showable_frame;
- current_frame->intra_only = current_frame->frame_type == INTRA_ONLY_FRAME;
cm->error_resilient_mode =
frame_is_sframe(cm) ||
(current_frame->frame_type == KEY_FRAME && cm->show_frame)
@@ -4933,7 +4991,6 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->cur_frame_force_integer_mv = 0;
}
- cm->frame_refs_short_signaling = 0;
int frame_size_override_flag = 0;
cm->allow_intrabc = 0;
cm->primary_ref_frame = PRIMARY_REF_NONE;
@@ -5020,22 +5077,23 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
}
if (current_frame->frame_type == KEY_FRAME) {
- if (!cm->show_frame) // unshown keyframe (forward keyframe)
- pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
- else // shown keyframe
- pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+ if (!cm->show_frame) { // unshown keyframe (forward keyframe)
+ current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+ } else { // shown keyframe
+ current_frame->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+ }
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- cm->current_frame.frame_refs[i].buf = NULL;
+ cm->remapped_ref_idx[i] = INVALID_IDX;
}
if (pbi->need_resync) {
reset_ref_frame_map(cm);
pbi->need_resync = 0;
}
} else {
- if (current_frame->intra_only) {
- pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
- if (pbi->refresh_frame_flags == 0xFF) {
+ if (current_frame->frame_type == INTRA_ONLY_FRAME) {
+ current_frame->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+ if (current_frame->refresh_frame_flags == 0xFF) {
aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
"Intra only frames cannot have refresh flags 0xFF");
}
@@ -5044,17 +5102,12 @@ static int read_uncompressed_header(AV1Decoder *pbi,
pbi->need_resync = 0;
}
} else if (pbi->need_resync != 1) { /* Skip if need resync */
- pbi->refresh_frame_flags =
+ current_frame->refresh_frame_flags =
frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES);
- if (!pbi->refresh_frame_flags) {
- // NOTE: "pbi->refresh_frame_flags == 0" indicates that the coded frame
- // will not be used as a reference
- cm->is_reference_frame = 0;
- }
}
}
- if (!frame_is_intra_only(cm) || pbi->refresh_frame_flags != 0xFF) {
+ if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xFF) {
// Read all ref frame order hints if error_resilient_mode == 1
if (cm->error_resilient_mode &&
seq_params->order_hint_info.enable_order_hint) {
@@ -5062,40 +5115,39 @@ static int read_uncompressed_header(AV1Decoder *pbi,
// Read order hint from bit stream
unsigned int order_hint = aom_rb_read_literal(
rb, seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
- // Get buffer index
- int buf_idx = cm->ref_frame_map[ref_idx];
- assert(buf_idx < FRAME_BUFFERS);
- if (buf_idx == -1 || order_hint != frame_bufs[buf_idx].order_hint) {
- if (buf_idx >= 0) {
+ // Get buffer
+ RefCntBuffer *buf = cm->ref_frame_map[ref_idx];
+ if (buf == NULL || order_hint != buf->order_hint) {
+ if (buf != NULL) {
lock_buffer_pool(pool);
- decrease_ref_count(buf_idx, frame_bufs, pool);
+ decrease_ref_count(buf, pool);
unlock_buffer_pool(pool);
}
// If no corresponding buffer exists, allocate a new buffer with all
// pixels set to neutral grey.
- buf_idx = get_free_fb(cm);
+ int buf_idx = get_free_fb(cm);
if (buf_idx == INVALID_IDX) {
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Unable to find free frame buffer");
}
+ buf = &frame_bufs[buf_idx];
lock_buffer_pool(pool);
if (aom_realloc_frame_buffer(
- &frame_bufs[buf_idx].buf, seq_params->max_frame_width,
+ &buf->buf, seq_params->max_frame_width,
seq_params->max_frame_height, seq_params->subsampling_x,
seq_params->subsampling_y, seq_params->use_highbitdepth,
AOM_BORDER_IN_PIXELS, cm->byte_alignment,
- &pool->frame_bufs[buf_idx].raw_frame_buffer, pool->get_fb_cb,
- pool->cb_priv)) {
- decrease_ref_count(buf_idx, frame_bufs, pool);
+ &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) {
+ decrease_ref_count(buf, pool);
unlock_buffer_pool(pool);
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
unlock_buffer_pool(pool);
- set_planes_to_neutral_grey(seq_params, &frame_bufs[buf_idx].buf, 0);
+ set_planes_to_neutral_grey(seq_params, &buf->buf, 0);
- cm->ref_frame_map[ref_idx] = buf_idx;
- frame_bufs[buf_idx].order_hint = order_hint;
+ cm->ref_frame_map[ref_idx] = buf;
+ buf->order_hint = order_hint;
}
}
}
@@ -5111,7 +5163,7 @@ static int read_uncompressed_header(AV1Decoder *pbi,
} else {
cm->allow_ref_frame_mvs = 0;
- if (current_frame->intra_only) {
+ if (current_frame->frame_type == INTRA_ONLY_FRAME) {
cm->cur_frame->film_grain_params_present =
seq_params->film_grain_params_present;
setup_frame_size(cm, frame_size_override_flag, rb);
@@ -5119,57 +5171,53 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->allow_intrabc = aom_rb_read_bit(rb);
} else if (pbi->need_resync != 1) { /* Skip if need resync */
-
+ int frame_refs_short_signaling = 0;
// Frame refs short signaling is off when error resilient mode is on.
if (seq_params->order_hint_info.enable_order_hint)
- cm->frame_refs_short_signaling = aom_rb_read_bit(rb);
+ frame_refs_short_signaling = aom_rb_read_bit(rb);
- if (cm->frame_refs_short_signaling) {
+ if (frame_refs_short_signaling) {
// == LAST_FRAME ==
const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
- const int lst_idx = cm->ref_frame_map[lst_ref];
+ const RefCntBuffer *const lst_buf = cm->ref_frame_map[lst_ref];
// == GOLDEN_FRAME ==
const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
- const int gld_idx = cm->ref_frame_map[gld_ref];
+ const RefCntBuffer *const gld_buf = cm->ref_frame_map[gld_ref];
// Most of the time, streams start with a keyframe. In that case,
// ref_frame_map will have been filled in at that point and will not
- // contain any -1's. However, streams are explicitly allowed to start
+ // contain any NULLs. However, streams are explicitly allowed to start
// with an intra-only frame, so long as they don't then signal a
// reference to a slot that hasn't been set yet. That's what we are
// checking here.
- if (lst_idx == -1)
+ if (lst_buf == NULL)
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Inter frame requests nonexistent reference");
- if (gld_idx == -1)
+ if (gld_buf == NULL)
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Inter frame requests nonexistent reference");
- av1_set_frame_refs(cm, lst_ref, gld_ref);
+ av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_ref, gld_ref);
}
for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
int ref = 0;
- if (!cm->frame_refs_short_signaling) {
+ if (!frame_refs_short_signaling) {
ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
- const int idx = cm->ref_frame_map[ref];
// Most of the time, streams start with a keyframe. In that case,
// ref_frame_map will have been filled in at that point and will not
- // contain any -1's. However, streams are explicitly allowed to start
+ // contain any NULLs. However, streams are explicitly allowed to start
// with an intra-only frame, so long as they don't then signal a
// reference to a slot that hasn't been set yet. That's what we are
// checking here.
- if (idx == -1)
+ if (cm->ref_frame_map[ref] == NULL)
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Inter frame requests nonexistent reference");
-
- RefBuffer *const ref_frame = &cm->current_frame.frame_refs[i];
- ref_frame->buf = &frame_bufs[idx];
- ref_frame->map_idx = ref;
+ cm->remapped_ref_idx[i] = ref;
} else {
- ref = cm->current_frame.frame_refs[i].map_idx;
+ ref = cm->remapped_ref_idx[i];
}
cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
@@ -5206,26 +5254,29 @@ static int read_uncompressed_header(AV1Decoder *pbi,
cm->switchable_motion_mode = aom_rb_read_bit(rb);
}
- cm->prev_frame = get_prev_frame(cm);
+ cm->prev_frame = get_primary_ref_frame_buf(cm);
if (cm->primary_ref_frame != PRIMARY_REF_NONE &&
- cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) {
+ get_primary_ref_frame_buf(cm) == NULL) {
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
"Reference frame containing this frame's initial "
"frame context is unavailable.");
}
- if (!current_frame->intra_only && pbi->need_resync != 1) {
+  if (current_frame->frame_type != INTRA_ONLY_FRAME &&
+      pbi->need_resync != 1) {
if (frame_might_allow_ref_frame_mvs(cm))
cm->allow_ref_frame_mvs = aom_rb_read_bit(rb);
else
cm->allow_ref_frame_mvs = 0;
- for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- RefBuffer *const ref_buf = &cm->current_frame.frame_refs[i];
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ const RefCntBuffer *const ref_buf = get_ref_frame_buf(cm, i);
+ struct scale_factors *const ref_scale_factors =
+ get_ref_scale_factors(cm, i);
av1_setup_scale_factors_for_frame(
- &ref_buf->sf, ref_buf->buf->buf.y_crop_width,
- ref_buf->buf->buf.y_crop_height, cm->width, cm->height);
- if ((!av1_is_valid_scale(&ref_buf->sf)))
+ ref_scale_factors, ref_buf->buf.y_crop_width,
+ ref_buf->buf.y_crop_height, cm->width, cm->height);
+      if (!av1_is_valid_scale(ref_scale_factors))
aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
"Reference frame has invalid dimensions");
}
@@ -5236,20 +5287,10 @@ static int read_uncompressed_header(AV1Decoder *pbi,
av1_setup_frame_sign_bias(cm);
- cm->cur_frame->intra_only =
- current_frame->frame_type == KEY_FRAME || current_frame->intra_only;
cm->cur_frame->frame_type = current_frame->frame_type;
if (seq_params->frame_id_numbers_present_flag) {
- /* If bitmask is set, update reference frame id values and
- mark frames as valid for reference */
- int refresh_frame_flags = pbi->refresh_frame_flags;
- for (int i = 0; i < REF_FRAMES; i++) {
- if ((refresh_frame_flags >> i) & 1) {
- cm->ref_frame_id[i] = cm->current_frame_id;
- cm->valid_for_referencing[i] = 1;
- }
- }
+ update_ref_frame_id(cm, cm->current_frame_id);
}
const int might_bwd_adapt =
@@ -5297,6 +5338,11 @@ static int read_uncompressed_header(AV1Decoder *pbi,
}
read_tile_info(pbi, rb);
+ if (!is_min_tile_width_satisfied(cm)) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Minimum tile width requirement not satisfied");
+ }
+
setup_quantization(cm, rb);
xd->bd = (int)seq_params->bit_depth;
@@ -5486,7 +5532,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
if (cm->show_existing_frame) {
// showing a frame directly
*p_data_end = data + uncomp_hdr_size;
- if (cm->reset_decoder_state) {
+ if (pbi->reset_decoder_state) {
// Use the default frame context values.
*cm->fc = *cm->default_frame_context;
if (!cm->fc->initialized)
@@ -5498,8 +5544,6 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
cm->setup_mi(cm);
- cm->current_frame_seg_map = cm->cur_frame->seg_map;
-
av1_setup_motion_field(cm);
av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
@@ -5508,8 +5552,7 @@ uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
// use the default frame context values
*cm->fc = *cm->default_frame_context;
} else {
- *cm->fc =
- cm->current_frame.frame_refs[cm->primary_ref_frame].buf->frame_context;
+ *cm->fc = get_primary_ref_frame_buf(cm)->frame_context;
}
if (!cm->fc->initialized)
aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
@@ -5528,7 +5571,7 @@ static void setup_frame_info(AV1Decoder *pbi) {
cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
av1_alloc_restoration_buffers(cm);
}
- const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+ const int use_highbd = cm->seq_params.use_highbitdepth;
const int buf_size = MC_TEMP_BUF_PELS << use_highbd;
if (pbi->td.mc_buf_size != buf_size) {
av1_free_mc_tmp_buf(&pbi->td);
diff --git a/libaom/av1/decoder/decodemv.c b/libaom/av1/decoder/decodemv.c
index 7a94717..2791f3a 100644
--- a/libaom/av1/decoder/decodemv.c
+++ b/libaom/av1/decoder/decodemv.c
@@ -299,7 +299,7 @@ static void set_segment_id(AV1_COMMON *cm, int mi_offset, int x_mis, int y_mis,
for (int y = 0; y < y_mis; y++)
for (int x = 0; x < x_mis; x++)
- cm->current_frame_seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
+ cm->cur_frame->seg_map[mi_offset + y * cm->mi_cols + x] = segment_id;
}
static int read_intra_segment_id(AV1_COMMON *const cm,
@@ -355,7 +355,7 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
if (!seg->enabled) return 0; // Default for disabled segmentation
if (!seg->update_map) {
- copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+ copy_segment_id(cm, cm->last_frame_seg_map, cm->cur_frame->seg_map,
mi_offset, x_mis, y_mis);
return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
}
@@ -364,7 +364,6 @@ static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
if (preskip) {
if (!seg->segid_preskip) return 0;
} else {
- if (seg->segid_preskip) return mbmi->segment_id;
if (mbmi->skip) {
if (seg->temporal_update) {
mbmi->seg_id_predicted = 0;
@@ -679,11 +678,10 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
int_mv ref_mvs[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES];
- int_mv global_mvs[REF_FRAMES];
av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count,
- xd->ref_mv_stack, ref_mvs, global_mvs, mi_row, mi_col,
- inter_mode_ctx);
+ xd->ref_mv_stack, ref_mvs, /*global_mvs=*/NULL, mi_row,
+ mi_col, inter_mode_ctx);
int_mv nearestmv, nearmv;
@@ -700,7 +698,8 @@ static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
mi_col, bsize, r);
if (!valid_dv) {
// Intra bc motion vectors are not valid - signal corrupt frame
- aom_merge_corrupted_flag(&xd->corrupted, 1);
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid intrabc dv");
}
}
}
@@ -1271,9 +1270,9 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
const int is_compound = has_second_ref(mbmi);
MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
- int_mv global_mvs[REF_FRAMES];
av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack,
- ref_mvs, global_mvs, mi_row, mi_col, inter_mode_ctx);
+ ref_mvs, /*global_mvs=*/NULL, mi_row, mi_col,
+ inter_mode_ctx);
int mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame);
mbmi->ref_mv_idx = 0;
@@ -1388,9 +1387,7 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
- RefBuffer *ref_buf = &cm->current_frame.frame_refs[frame - LAST_FRAME];
-
- xd->block_refs[ref] = ref_buf;
+ xd->block_ref_scale_factors[ref] = get_ref_scale_factors_const(cm, frame);
}
mbmi->motion_mode = SIMPLE_TRANSLATION;
@@ -1419,13 +1416,16 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
}
if (mbmi->comp_group_idx == 0) {
- if (cm->seq_params.order_hint_info.enable_jnt_comp) {
+ if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) {
const int comp_index_ctx = get_comp_index_context(cm, xd);
mbmi->compound_idx = aom_read_symbol(
r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR);
+ mbmi->interinter_comp.type =
+ mbmi->compound_idx ? COMPOUND_AVERAGE : COMPOUND_DISTWTD;
} else {
// Distance-weighted compound is disabled, so always use average
mbmi->compound_idx = 1;
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
}
} else {
assert(cm->current_frame.reference_mode != SINGLE_REFERENCE &&
@@ -1436,8 +1436,9 @@ static void read_inter_block_mode_info(AV1Decoder *const pbi,
// compound_diffwtd, wedge
if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
mbmi->interinter_comp.type =
- 1 + aom_read_symbol(r, ec_ctx->compound_type_cdf[bsize],
- COMPOUND_TYPES - 1, ACCT_STR);
+ COMPOUND_WEDGE + aom_read_symbol(r,
+ ec_ctx->compound_type_cdf[bsize],
+ MASKED_COMPOUND_TYPES, ACCT_STR);
else
mbmi->interinter_comp.type = COMPOUND_DIFFWTD;
@@ -1502,7 +1503,8 @@ static void read_inter_frame_mode_info(AV1Decoder *const pbi,
else
mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
- mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 0, r);
+ if (!cm->seg.segid_preskip)
+ mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 0, r);
read_cdef(cm, r, xd, mi_col, mi_row);
diff --git a/libaom/av1/decoder/decoder.c b/libaom/av1/decoder/decoder.c
index 773305d..bff4b7a 100644
--- a/libaom/av1/decoder/decoder.c
+++ b/libaom/av1/decoder/decoder.c
@@ -100,15 +100,16 @@ AV1Decoder *av1_decoder_create(BufferPool *const pool) {
aom_once(initialize_dec);
// Initialize the references to not point to any frame buffers.
- memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
- memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+ for (int i = 0; i < REF_FRAMES; i++) {
+ cm->ref_frame_map[i] = NULL;
+ cm->next_ref_frame_map[i] = NULL;
+ }
cm->current_frame.frame_number = 0;
pbi->decoding_first_frame = 1;
pbi->common.buffer_pool = pool;
cm->seq_params.bit_depth = AOM_BITS_8;
- cm->dequant_bit_depth = AOM_BITS_8;
cm->alloc_mi = av1_dec_alloc_mi;
cm->free_mi = dec_free_mi;
@@ -321,26 +322,26 @@ aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
static void release_frame_buffers(AV1Decoder *pbi) {
AV1_COMMON *const cm = &pbi->common;
BufferPool *const pool = cm->buffer_pool;
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ cm->cur_frame->buf.corrupted = 1;
lock_buffer_pool(pool);
// Release all the reference buffers in cm->next_ref_frame_map if the worker
// thread is holding them.
if (pbi->hold_ref_buf) {
- int ref_index;
- for (ref_index = 0; ref_index < REF_FRAMES; ++ref_index) {
- const int new_idx = cm->next_ref_frame_map[ref_index];
- decrease_ref_count(new_idx, frame_bufs, pool);
+ for (int ref_index = 0; ref_index < REF_FRAMES; ++ref_index) {
+ decrease_ref_count(cm->next_ref_frame_map[ref_index], pool);
+ cm->next_ref_frame_map[ref_index] = NULL;
}
pbi->hold_ref_buf = 0;
}
// Release current frame.
- decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ decrease_ref_count(cm->cur_frame, pool);
unlock_buffer_pool(pool);
+ cm->cur_frame = NULL;
}
// If any buffer updating is signaled it should be done here.
-// Consumes a reference to cm->new_fb_idx.
+// Consumes a reference to cm->cur_frame.
//
// This functions returns void. It reports failure by setting
// cm->error.error_code.
@@ -348,7 +349,6 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
int ref_index = 0, mask;
AV1_COMMON *const cm = &pbi->common;
BufferPool *const pool = cm->buffer_pool;
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
if (frame_decoded) {
lock_buffer_pool(pool);
@@ -358,58 +358,55 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
if (!pbi->camera_frame_header_ready) {
// If we are not holding reference buffers in cm->next_ref_frame_map,
// assert that the following two for loops are no-ops.
- assert(IMPLIES(!pbi->hold_ref_buf, pbi->refresh_frame_flags == 0));
assert(IMPLIES(!pbi->hold_ref_buf,
- cm->show_existing_frame && !cm->reset_decoder_state));
+ cm->current_frame.refresh_frame_flags == 0));
+ assert(IMPLIES(!pbi->hold_ref_buf,
+ cm->show_existing_frame && !pbi->reset_decoder_state));
// The following two for loops need to release the reference stored in
// cm->ref_frame_map[ref_index] before transferring the reference stored
// in cm->next_ref_frame_map[ref_index] to cm->ref_frame_map[ref_index].
- for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
- const int old_idx = cm->ref_frame_map[ref_index];
- decrease_ref_count(old_idx, frame_bufs, pool);
+ for (mask = cm->current_frame.refresh_frame_flags; mask; mask >>= 1) {
+ decrease_ref_count(cm->ref_frame_map[ref_index], pool);
cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+ cm->next_ref_frame_map[ref_index] = NULL;
++ref_index;
}
const int check_on_show_existing_frame =
- !cm->show_existing_frame || cm->reset_decoder_state;
+ !cm->show_existing_frame || pbi->reset_decoder_state;
for (; ref_index < REF_FRAMES && check_on_show_existing_frame;
++ref_index) {
- const int old_idx = cm->ref_frame_map[ref_index];
- decrease_ref_count(old_idx, frame_bufs, pool);
+ decrease_ref_count(cm->ref_frame_map[ref_index], pool);
cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+ cm->next_ref_frame_map[ref_index] = NULL;
}
}
if (cm->show_existing_frame || cm->show_frame) {
- YV12_BUFFER_CONFIG *cur_frame = &cm->cur_frame->buf;
if (pbi->output_all_layers) {
// Append this frame to the output queue
if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) {
// We can't store the new frame anywhere, so drop it and return an
// error
- decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
- cm->cur_frame = NULL;
+ cm->cur_frame->buf.corrupted = 1;
+ decrease_ref_count(cm->cur_frame, pool);
cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
} else {
- pbi->output_frames[pbi->num_output_frames] = cur_frame;
- pbi->output_frame_index[pbi->num_output_frames] = cm->new_fb_idx;
+ pbi->output_frames[pbi->num_output_frames] = cm->cur_frame;
pbi->num_output_frames++;
}
} else {
// Replace any existing output frame
assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1);
if (pbi->num_output_frames > 0) {
- decrease_ref_count(pbi->output_frame_index[0], frame_bufs, pool);
+ decrease_ref_count(pbi->output_frames[0], pool);
}
- pbi->output_frames[0] = cur_frame;
- pbi->output_frame_index[0] = cm->new_fb_idx;
+ pbi->output_frames[0] = cm->cur_frame;
pbi->num_output_frames = 1;
}
} else {
- decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
- cm->cur_frame = NULL;
+ decrease_ref_count(cm->cur_frame, pool);
}
unlock_buffer_pool(pool);
@@ -420,17 +417,17 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
assert(IMPLIES(!pbi->camera_frame_header_ready, !pbi->hold_ref_buf));
// Nothing was decoded, so just drop this frame buffer
lock_buffer_pool(pool);
- decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
- cm->cur_frame = NULL;
+ decrease_ref_count(cm->cur_frame, pool);
unlock_buffer_pool(pool);
}
+ cm->cur_frame = NULL;
if (!pbi->camera_frame_header_ready) {
pbi->hold_ref_buf = 0;
// Invalidate these references until the next frame starts.
for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
- cm->current_frame.frame_refs[ref_index].buf = NULL;
+ cm->remapped_ref_idx[ref_index] = INVALID_IDX;
}
}
}
@@ -438,7 +435,6 @@ static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
const uint8_t **psource) {
AV1_COMMON *volatile const cm = &pbi->common;
- BufferPool *volatile const pool = cm->buffer_pool;
const uint8_t *source = *psource;
cm->error.error_code = AOM_CODEC_OK;
cm->error.has_detail = 0;
@@ -452,24 +448,15 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
// TODO(jkoleszar): Error concealment is undefined and non-normative
// at this point, but if it becomes so, [0] may not always be the correct
// thing to do here.
- if (cm->current_frame.frame_refs[0].buf != NULL) {
- cm->current_frame.frame_refs[0].buf->buf.corrupted = 1;
- }
+ RefCntBuffer *ref_buf = get_ref_frame_buf(cm, LAST_FRAME);
+ if (ref_buf != NULL) ref_buf->buf.corrupted = 1;
}
- // Find a free buffer for the new frame, releasing the reference previously
- // held.
-
- // Find a free frame buffer. Return error if can not find any.
- cm->new_fb_idx = get_free_fb(cm);
- if (cm->new_fb_idx == INVALID_IDX) {
+ if (assign_cur_frame_new_fb(cm) == NULL) {
cm->error.error_code = AOM_CODEC_MEM_ERROR;
return 1;
}
- // Assign a MV array to the frame buffer.
- cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
-
if (!pbi->camera_frame_header_ready) pbi->hold_ref_buf = 0;
// The jmp_buf is valid only for the duration of the function that calls
@@ -514,7 +501,7 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
cm->txb_count = 0;
#endif
- // Note: At this point, this function holds a reference to cm->new_fb_idx
+ // Note: At this point, this function holds a reference to cm->cur_frame
// in the buffer pool. This reference is consumed by swap_frame_buffers().
swap_frame_buffers(pbi, frame_decoded);
@@ -541,10 +528,6 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
}
// Update progress in frame parallel decode.
- cm->last_width = cm->width;
- cm->last_height = cm->height;
- cm->last_tile_cols = cm->tile_cols;
- cm->last_tile_rows = cm->tile_rows;
cm->error.setjmp = 0;
return 0;
@@ -553,11 +536,9 @@ int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
// Get the frame at a particular index in the output queue
int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
aom_film_grain_t **grain_params) {
- RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs;
-
if (index >= pbi->num_output_frames) return -1;
- *sd = pbi->output_frames[index];
- *grain_params = &frame_bufs[pbi->output_frame_index[index]].film_grain_params;
+ *sd = &pbi->output_frames[index]->buf;
+ *grain_params = &pbi->output_frames[index]->film_grain_params;
aom_clear_system_state();
return 0;
}
@@ -567,6 +548,6 @@ int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) {
if (pbi->num_output_frames == 0) return -1;
- *frame = *pbi->output_frames[pbi->num_output_frames - 1];
+ *frame = pbi->output_frames[pbi->num_output_frames - 1]->buf;
return 0;
}
diff --git a/libaom/av1/decoder/decoder.h b/libaom/av1/decoder/decoder.h
index 6ca28e7..685c931 100644
--- a/libaom/av1/decoder/decoder.h
+++ b/libaom/av1/decoder/decoder.h
@@ -48,11 +48,9 @@ typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
MACROBLOCKD *const xd);
typedef struct ThreadData {
- aom_reader *bit_reader;
DECLARE_ALIGNED(32, MACROBLOCKD, xd);
- /* dqcoeff are shared by all the planes. So planes must be decoded serially */
- DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
CB_BUFFER cb_buffer_base;
+ aom_reader *bit_reader;
uint8_t *mc_buf[2];
int32_t mc_buf_size;
int mc_buf_use_highbd; // Boolean: whether the byte pointers stored in
@@ -163,8 +161,6 @@ typedef struct AV1Decoder {
DECLARE_ALIGNED(32, AV1_COMMON, common);
- int refresh_frame_flags;
-
AVxWorker lf_worker;
AV1LfSync lf_row_sync;
AV1LrSync lr_row_sync;
@@ -190,8 +186,7 @@ typedef struct AV1Decoder {
// Note: The saved buffers are released at the start of the next time the
// application calls aom_codec_decode().
int output_all_layers;
- YV12_BUFFER_CONFIG *output_frames[MAX_NUM_SPATIAL_LAYERS];
- int output_frame_index[MAX_NUM_SPATIAL_LAYERS]; // Buffer pool indices
+ RefCntBuffer *output_frames[MAX_NUM_SPATIAL_LAYERS];
size_t num_output_frames; // How many frames are queued up so far?
// In order to properly support random-access decoding, we need
@@ -205,6 +200,7 @@ typedef struct AV1Decoder {
int need_resync; // wait for key/intra-only frame.
int hold_ref_buf; // Boolean: whether we are holding reference buffers in
// common.next_ref_frame_map.
+ int reset_decoder_state;
int tile_size_bytes;
int tile_col_size_bytes;
@@ -283,23 +279,22 @@ void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync);
void av1_dec_free_cb_buf(AV1Decoder *pbi);
-static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+static INLINE void decrease_ref_count(RefCntBuffer *const buf,
BufferPool *const pool) {
- if (idx >= 0) {
- --frame_bufs[idx].ref_count;
+ if (buf != NULL) {
+ --buf->ref_count;
// Reference counts should never become negative. If this assertion fails,
// there is a bug in our reference count management.
- assert(frame_bufs[idx].ref_count >= 0);
+ assert(buf->ref_count >= 0);
// A worker may only get a free framebuffer index when calling get_free_fb.
// But the raw frame buffer is not set up until we finish decoding header.
// So if any error happens during decoding header, frame_bufs[idx] will not
// have a valid raw frame buffer.
- if (frame_bufs[idx].ref_count == 0 &&
- frame_bufs[idx].raw_frame_buffer.data) {
- pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer);
- frame_bufs[idx].raw_frame_buffer.data = NULL;
- frame_bufs[idx].raw_frame_buffer.size = 0;
- frame_bufs[idx].raw_frame_buffer.priv = NULL;
+ if (buf->ref_count == 0 && buf->raw_frame_buffer.data) {
+ pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
+ buf->raw_frame_buffer.data = NULL;
+ buf->raw_frame_buffer.size = 0;
+ buf->raw_frame_buffer.priv = NULL;
}
}
}
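A usage sketch for the pointer-based reference counting: every stored RefCntBuffer pointer is balanced by one decrease_ref_count() call, and the raw buffer returns to the pool when the count reaches zero. The retain helper is hypothetical; lock_buffer_pool()/unlock_buffer_pool() are as used elsewhere in this patch:

    // Hypothetical caller: take a reference when publishing a pointer ...
    static void retain_ref(RefCntBuffer *buf) {
      if (buf != NULL) ++buf->ref_count;
    }
    // ... and release it later, under the buffer pool lock:
    //   lock_buffer_pool(pool);
    //   decrease_ref_count(buf, pool);  // frees raw_frame_buffer at count 0
    //   unlock_buffer_pool(pool);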
diff --git a/libaom/av1/decoder/decodetxb.c b/libaom/av1/decoder/decodetxb.c
index f3ef2d5..223e32e 100644
--- a/libaom/av1/decoder/decodetxb.c
+++ b/libaom/av1/decoder/decodetxb.c
@@ -136,6 +136,15 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
uint16_t *const max_scan_line = &(eob_data->max_scan_line);
*max_scan_line = 0;
*eob = 0;
+
+#if CONFIG_INSPECTION
+ if (plane == 0) {
+ const int txk_type_idx =
+ av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+ mbmi->tx_skip[txk_type_idx] = all_zero;
+ }
+#endif
+
if (all_zero) {
*max_scan_line = 0;
if (plane == 0) {
@@ -146,9 +155,6 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
return 0;
}
- memset(levels_buf, 0,
- sizeof(*levels_buf) *
- ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END));
if (plane == AOM_PLANE_Y) {
// only y plane's tx_type is transmitted
av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r);
@@ -214,23 +220,30 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
break;
}
- if (k_eob_offset_bits[eob_pt] > 0) {
+ const int eob_offset_bits = k_eob_offset_bits[eob_pt];
+ if (eob_offset_bits > 0) {
const int eob_ctx = eob_pt - 3;
int bit = aom_read_symbol(
r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR);
if (bit) {
- eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1));
+ eob_extra += (1 << (eob_offset_bits - 1));
}
- for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) {
+ for (int i = 1; i < eob_offset_bits; i++) {
bit = aom_read_bit(r, ACCT_STR);
if (bit) {
- eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1 - i));
+ eob_extra += (1 << (eob_offset_bits - 1 - i));
}
}
}
*eob = rec_eob_pos(eob_pt, eob_extra);
+ if (*eob > 1) {
+ memset(levels_buf, 0,
+ sizeof(*levels_buf) *
+ ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END));
+ }
+
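The eob_extra assembly above reads one context-coded MSB, then eob_offset_bits - 1 raw bits, MSB first. A sketch as a hypothetical helper, with ACCT_STR as defined in this file:

    // Hypothetical helper mirroring the eob_extra read above.
    static int read_eob_extra(aom_reader *r, aom_cdf_prob *cdf,
                              int eob_offset_bits) {
      int eob_extra = 0;
      if (aom_read_symbol(r, cdf, 2, ACCT_STR))  // context-coded MSB
        eob_extra += 1 << (eob_offset_bits - 1);
      for (int i = 1; i < eob_offset_bits; i++) {  // remaining raw bits
        if (aom_read_bit(r, ACCT_STR))
          eob_extra += 1 << (eob_offset_bits - 1 - i);
      }
      return eob_extra;
    }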
{
// Read the non-zero coefficient with scan index eob-1
// TODO(angiebird): Put this into a function
@@ -242,12 +255,10 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx];
int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1;
if (level > NUM_BASE_LEVELS) {
- const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ const int br_ctx = get_br_ctx_eob(pos, bwl, tx_class);
+ cdf = ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
- const int k = aom_read_symbol(
- r,
- ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx],
- BR_CDF_SIZE, ACCT_STR);
+ const int k = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
level += k;
if (k < BR_CDF_SIZE - 1) break;
}
@@ -269,13 +280,6 @@ uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
}
}
- int16_t num_zero_coeffs = 0;
- for (int c = 0; c < *eob; ++c) {
- const int pos = scan[c];
- num_zero_coeffs = AOMMAX(num_zero_coeffs, pos);
- }
- memset(tcoeffs, 0, (num_zero_coeffs + 1) * sizeof(tcoeffs[0]));
-
for (int c = 0; c < *eob; ++c) {
const int pos = scan[c];
uint8_t sign;
diff --git a/libaom/av1/decoder/inspection.c b/libaom/av1/decoder/inspection.c
index 17a9f98..eeed1d3 100644
--- a/libaom/av1/decoder/inspection.c
+++ b/libaom/av1/decoder/inspection.c
@@ -33,7 +33,7 @@ void ifd_clear(insp_frame_data *fd) {
/* TODO(negge) This function may be called by more than one thread when using
a multi-threaded decoder and this may cause a data race. */
-int ifd_inspect(insp_frame_data *fd, void *decoder) {
+int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform) {
struct AV1Decoder *pbi = (struct AV1Decoder *)decoder;
AV1_COMMON *const cm = &pbi->common;
@@ -82,6 +82,9 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) {
mi->ref_frame[1] = mbmi->ref_frame[1];
// Prediction Mode
mi->mode = mbmi->mode;
+ mi->intrabc = (int16_t)mbmi->use_intrabc;
+ mi->palette = (int16_t)mbmi->palette_mode_info.palette_size[0];
+ mi->uv_palette = (int16_t)mbmi->palette_mode_info.palette_size[1];
// Prediction Mode for Chromatic planes
if (mi->mode < INTRA_MODES) {
mi->uv_mode = mbmi->uv_mode;
@@ -111,13 +114,19 @@ int ifd_inspect(insp_frame_data *fd, void *decoder) {
else
mi->tx_size = mbmi->tx_size;
+ if (skip_not_transform && mi->skip) mi->tx_size = -1;
+
mi->tx_type =
(mi->skip ? 0 : mbmi->txk_type[av1_get_txk_type_index(bsize, r, c)]);
+ if (skip_not_transform &&
+ (mi->skip || mbmi->tx_skip[av1_get_txk_type_index(bsize, r, c)]))
+ mi->tx_type = -1;
mi->cdef_level = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] /
CDEF_SEC_STRENGTHS;
mi->cdef_strength = cm->cdef_info.cdef_strengths[mbmi->cdef_strength] %
CDEF_SEC_STRENGTHS;
+
mi->cdef_strength += mi->cdef_strength == 3;
if (mbmi->uv_mode == UV_CFL_PRED) {
mi->cfl_alpha_idx = mbmi->cfl_alpha_idx;
diff --git a/libaom/av1/decoder/inspection.h b/libaom/av1/decoder/inspection.h
index 0c6f3ad..b963f6a 100644
--- a/libaom/av1/decoder/inspection.h
+++ b/libaom/av1/decoder/inspection.h
@@ -52,6 +52,9 @@ struct insp_mi_data {
int16_t current_qindex;
int16_t compound_type;
int16_t motion_mode;
+ int16_t intrabc;
+ int16_t palette;
+ int16_t uv_palette;
};
typedef struct insp_frame_data insp_frame_data;
@@ -80,7 +83,7 @@ struct insp_frame_data {
void ifd_init(insp_frame_data *fd, int frame_width, int frame_height);
void ifd_clear(insp_frame_data *fd);
-int ifd_inspect(insp_frame_data *fd, void *decoder);
+int ifd_inspect(insp_frame_data *fd, void *decoder, int skip_not_transform);
#ifdef __cplusplus
} // extern "C"
diff --git a/libaom/av1/decoder/obu.c b/libaom/av1/decoder/obu.c
index d892dc4..aaea572 100644
--- a/libaom/av1/decoder/obu.c
+++ b/libaom/av1/decoder/obu.c
@@ -26,7 +26,7 @@
#include "av1/decoder/obu.h"
// Picture prediction structures (0-12 are predefined) in scalability metadata.
-typedef enum {
+enum {
SCALABILITY_L1T2 = 0,
SCALABILITY_L1T3 = 1,
SCALABILITY_L2T1 = 2,
@@ -42,7 +42,7 @@ typedef enum {
SCALABILITY_S2T2h = 12,
SCALABILITY_S2T3h = 13,
SCALABILITY_SS = 14
-} SCALABILITY_STRUCTURES;
+} UENUM1BYTE(SCALABILITY_STRUCTURES);
aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
int operating_point_idc, unsigned int *number_spatial_layers,
@@ -98,12 +98,10 @@ static int byte_alignment(AV1_COMMON *const cm,
static uint32_t read_temporal_delimiter_obu() { return 0; }
// Returns a boolean that indicates success.
-static int read_bitstream_level(BitstreamLevel *bl,
+static int read_bitstream_level(AV1_LEVEL *seq_level_idx,
struct aom_read_bit_buffer *rb) {
- const uint8_t seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);
- if (!is_valid_seq_level_idx(seq_level_idx)) return 0;
- bl->major = (seq_level_idx >> LEVEL_MINOR_BITS) + LEVEL_MAJOR_MIN;
- bl->minor = seq_level_idx & ((1 << LEVEL_MINOR_BITS) - 1);
+ *seq_level_idx = aom_rb_read_literal(rb, LEVEL_BITS);
+ if (!is_valid_seq_level_idx(*seq_level_idx)) return 0;
return 1;
}
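An equivalence worth noting: under the retired major.minor packing (LEVEL_MINOR_BITS == 2, LEVEL_MAJOR_MIN == 2), seq_level_idx 7 unpacks to level 3.3, so the tier-bit condition below, seq_level_idx >= SEQ_LEVEL_4_0, tests the same thing as the old major > 3. The removed decomposition, restated as a sketch:

    // The retired major.minor decomposition (from the removed lines above).
    static void unpack_level(uint8_t seq_level_idx, int *major, int *minor) {
      *major = (seq_level_idx >> LEVEL_MINOR_BITS) + LEVEL_MAJOR_MIN;  // 7 -> 3
      *minor = seq_level_idx & ((1 << LEVEL_MINOR_BITS) - 1);          // 7 -> 3
    }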
@@ -151,7 +149,7 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
seq_params->display_model_info_present_flag = 0;
seq_params->operating_points_cnt_minus_1 = 0;
seq_params->operating_point_idc[0] = 0;
- if (!read_bitstream_level(&seq_params->level[0], rb)) {
+ if (!read_bitstream_level(&seq_params->seq_level_idx[0], rb)) {
cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
return 0;
}
@@ -175,13 +173,13 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
seq_params->operating_point_idc[i] =
aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
- if (!read_bitstream_level(&seq_params->level[i], rb)) {
+ if (!read_bitstream_level(&seq_params->seq_level_idx[i], rb)) {
cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
return 0;
}
// This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7
// is equivalent to level 3.3.
- if (seq_params->level[i].major > 3)
+ if (seq_params->seq_level_idx[i] >= SEQ_LEVEL_4_0)
seq_params->tier[i] = aom_rb_read_bit(rb);
else
seq_params->tier[i] = 0;
@@ -195,10 +193,9 @@ static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
if (cm->timing_info_present &&
(cm->timing_info.equal_picture_interval ||
cm->op_params[i].decoder_model_param_present_flag)) {
- cm->op_params[i].bitrate = max_level_bitrate(
- seq_params->profile,
- major_minor_to_seq_level_idx(seq_params->level[i]),
- seq_params->tier[i]);
+ cm->op_params[i].bitrate =
+ max_level_bitrate(seq_params->profile, seq_params->seq_level_idx[i],
+ seq_params->tier[i]);
// Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
// the check
if (cm->op_params[i].bitrate == 0)
@@ -364,8 +361,10 @@ static void alloc_tile_list_buffer(AV1Decoder *pbi) {
// image format 4:2:0, the output frame of U plane and V plane is 1/4 of the
// output frame.
AV1_COMMON *const cm = &pbi->common;
- const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
- const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
+ int tile_width, tile_height;
+ av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+ const int tile_width_in_pixels = tile_width * MI_SIZE;
+ const int tile_height_in_pixels = tile_height * MI_SIZE;
const int output_frame_width =
(pbi->output_frame_width_in_tiles_minus_1 + 1) * tile_width_in_pixels;
const int output_frame_height =
@@ -415,8 +414,10 @@ static void yv12_tile_copy(const YV12_BUFFER_CONFIG *src, int hstart1,
static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi,
int tile_idx) {
AV1_COMMON *const cm = &pbi->common;
- const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
- const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
+ int tile_width, tile_height;
+ av1_get_uniform_tile_size(cm, &tile_width, &tile_height);
+ const int tile_width_in_pixels = tile_width * MI_SIZE;
+ const int tile_height_in_pixels = tile_height * MI_SIZE;
const int ssy = cm->seq_params.subsampling_y;
const int ssx = cm->seq_params.subsampling_x;
const int num_planes = av1_num_planes(cm);
diff --git a/libaom/av1/encoder/aq_cyclicrefresh.c b/libaom/av1/encoder/aq_cyclicrefresh.c
index 8d96b23..bfb2a90 100644
--- a/libaom/av1/encoder/aq_cyclicrefresh.c
+++ b/libaom/av1/encoder/aq_cyclicrefresh.c
@@ -31,9 +31,9 @@ struct CYCLIC_REFRESH {
// excess of the cycle time, i.e., in the case of all zero motion, block
// will be refreshed every (100/percent_refresh + time_for_refresh) frames.
int time_for_refresh;
- // Target number of (8x8) blocks that are set for delta-q.
+ // Target number of (4x4) blocks that are set for delta-q.
int target_num_seg_blocks;
- // Actual number of (8x8) blocks that were applied delta-q.
+  // Actual number of (4x4) blocks to which delta-q was applied.
int actual_num_seg1_blocks;
int actual_num_seg2_blocks;
// RD mult. parameters for segment 1.
@@ -55,6 +55,8 @@ struct CYCLIC_REFRESH {
int rate_boost_fac;
double low_content_avg;
int qindex_delta[3];
+ double weight_segment;
+ int apply_cyclic_refresh;
};
CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
@@ -87,27 +89,6 @@ void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
}
}
-// Check if we should turn off cyclic refresh based on bitrate condition.
-static int apply_cyclic_refresh_bitrate(const AV1_COMMON *cm,
- const RATE_CONTROL *rc) {
- // Turn off cyclic refresh if bits available per frame is not sufficiently
- // larger than bit cost of segmentation. Segment map bit cost should scale
- // with number of seg blocks, so compare available bits to number of blocks.
- // Average bits available per frame = avg_frame_bandwidth
- // Number of (8x8) blocks in frame = mi_rows * mi_cols;
- const float factor = 0.25;
- const int number_blocks = cm->mi_rows * cm->mi_cols;
- // The condition below corresponds to turning off at target bitrates:
-  // (at 30fps), ~12kbps for CIF, 36kbps for VGA, 100kbps for HD/720p.
-  // Also turn off at very small frame sizes, to avoid refreshing too large a
-  // fraction of superblocks per frame. Threshold below is less than QCIF.
- if (rc->avg_frame_bandwidth < factor * number_blocks ||
- number_blocks / 64 < 5)
- return 0;
- else
- return 1;
-}
-
// Check if this coding block, of size bsize, should be considered for refresh
// (lower-qp coding). Decision can be based on various factors, such as
// size of the coding block (i.e., below min_block size rejected), coding
@@ -158,11 +139,11 @@ int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
int estimated_bits;
int mbs = cm->MBs;
- int num8x8bl = mbs << 2;
+ int num4x4bl = mbs << 4;
// Weight for non-base segments: use actual number of blocks refreshed in
- // previous/just encoded frame. Note number of blocks here is in 8x8 units.
- double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
- double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
+ // previous/just encoded frame. Note number of blocks here is in 4x4 units.
+ double weight_segment1 = (double)cr->actual_num_seg1_blocks / num4x4bl;
+ double weight_segment2 = (double)cr->actual_num_seg2_blocks / num4x4bl;
// Take segment weighted average for estimated bits.
estimated_bits =
(int)((1.0 - weight_segment1 - weight_segment2) *
@@ -190,14 +171,14 @@ int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
const AV1_COMMON *const cm = &cpi->common;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
int bits_per_mb;
- int num8x8bl = cm->MBs << 2;
+ int num4x4bl = cm->MBs << 4;
// Weight for segment prior to encoding: take the average of the target
// number for the frame to be encoded and the actual from the previous frame.
double weight_segment =
(double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks +
cr->actual_num_seg2_blocks) >>
1) /
- num8x8bl;
+ num4x4bl;
// Compute delta-q corresponding to qindex i.
int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
// Take segment weighted average for bits per mb.
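The 8x8-to-4x4 renaming in these hunks is pure accounting: a 16x16 macroblock contains four 8x8 blocks (MBs << 2) but sixteen 4x4 blocks (MBs << 4), so only the denominators change. A standalone sketch of the weighting, assuming the same field meanings as the struct above (not libaom API):

    /* Sketch: fraction of the frame covered by the refresh segments,
     * counted in 4x4 units. mbs is the 16x16-macroblock count (cm->MBs). */
    static double segment_weight(int mbs, int target_blocks,
                                 int actual_seg1, int actual_seg2) {
      const int num4x4bl = mbs << 4; /* 16 4x4 blocks per macroblock */
      /* Average of this frame's target and the previous frame's actual. */
      return (double)((target_blocks + actual_seg1 + actual_seg2) >> 1) /
             num4x4bl;
    }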
@@ -264,21 +245,6 @@ void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi,
int map_offset = block_index + y * cm->mi_cols + x;
cr->map[map_offset] = new_map_value;
cpi->segmentation_map[map_offset] = mbmi->segment_id;
- // Inter skip blocks were clearly not coded at the current qindex, so
- // don't update the map for them. For cases where motion is non-zero or
- // the reference frame isn't the previous frame, the previous value in
- // the map for this spatial location is not entirely correct.
- if ((!is_inter_block(mbmi) || !skip) &&
- mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
- cr->last_coded_q_map[map_offset] = clamp(
- cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
- } else if (is_inter_block(mbmi) && skip &&
- mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
- cr->last_coded_q_map[map_offset] =
- AOMMIN(clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id],
- 0, MAXQ),
- cr->last_coded_q_map[map_offset]);
- }
}
}
@@ -315,73 +281,6 @@ void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
rc->baseline_gf_interval = 40;
}
-// Update some encoding stats (from the just encoded frame). If this frame's
-// background has high motion, refresh the golden frame. Otherwise, if the
-// golden reference is to be updated check if we should NOT update the golden
-// ref.
-void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) {
- AV1_COMMON *const cm = &cpi->common;
- CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
- int mi_row, mi_col;
- double fraction_low = 0.0;
- int low_content_frame = 0;
-
- MB_MODE_INFO **mi;
- RATE_CONTROL *const rc = &cpi->rc;
- const int rows = cm->mi_rows, cols = cm->mi_cols;
- int cnt1 = 0, cnt2 = 0;
- int force_gf_refresh = 0;
-
- for (mi_row = 0; mi_row < rows; mi_row++) {
- mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
-
- for (mi_col = 0; mi_col < cols; mi_col++) {
- int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0
- ? mi[0]->mv[0].as_mv.row
- : -1 * mi[0]->mv[0].as_mv.row;
- int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0
- ? mi[0]->mv[0].as_mv.col
- : -1 * mi[0]->mv[0].as_mv.col;
-
- // Calculate the motion of the background.
- if (abs_mvr <= 16 && abs_mvc <= 16) {
- cnt1++;
- if (abs_mvr == 0 && abs_mvc == 0) cnt2++;
- }
- mi++;
-
- // Accumulate low_content_frame.
- if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++;
- }
- }
-
- // For video conference clips, if the background has high motion in current
- // frame because of the camera movement, set this frame as the golden frame.
- // Use 70% and 5% as the thresholds for golden frame refreshing.
- if (cnt1 * 10 > (70 * rows * cols) && cnt2 * 20 < cnt1) {
- av1_cyclic_refresh_set_golden_update(cpi);
- rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-
- if (rc->frames_till_gf_update_due > rc->frames_to_key)
- rc->frames_till_gf_update_due = rc->frames_to_key;
- cpi->refresh_golden_frame = 1;
- force_gf_refresh = 1;
- }
-
- fraction_low = (double)low_content_frame / (rows * cols);
- // Update average.
- cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
- if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
- // Don't update golden reference if the amount of low_content for the
- // current encoded frame is small, or if the recursive average of the
- // low_content over the update interval window falls below threshold.
- if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
- cpi->refresh_golden_frame = 0;
-    // Reset for next interval.
- cr->low_content_avg = fraction_low;
- }
-}
-
// Update the segmentation map, and related quantities: cyclic refresh map,
// refresh sb_index, and target number of blocks to be refreshed.
// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
@@ -458,26 +357,70 @@ static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
// Set cyclic refresh parameters.
void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
+ // TODO(marpan): Parameters need to be tuned.
const RATE_CONTROL *const rc = &cpi->rc;
const AV1_COMMON *const cm = &cpi->common;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int num4x4bl = cm->MBs << 4;
+ int target_refresh = 0;
+ double weight_segment_target = 0;
+ double weight_segment = 0;
+ int qp_thresh = AOMMIN(20, rc->best_quality << 1);
+ cr->apply_cyclic_refresh = 1;
+ if (frame_is_intra_only(cm) || is_lossless_requested(&cpi->oxcf) ||
+ rc->avg_frame_qindex[INTER_FRAME] < qp_thresh) {
+ cr->apply_cyclic_refresh = 0;
+ return;
+ }
cr->percent_refresh = 10;
- cr->max_qdelta_perc = 50;
+ cr->max_qdelta_perc = 60;
cr->time_for_refresh = 0;
+ cr->motion_thresh = 32;
+ cr->rate_boost_fac = 15;
// Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
// periods of the refresh cycle, after a key frame.
- if (rc->frames_since_key < 4 * cr->percent_refresh)
+ // Account for larger interval on base layer for temporal layers.
+ if (cr->percent_refresh > 0 &&
+ rc->frames_since_key < 400 / cr->percent_refresh) {
cr->rate_ratio_qdelta = 3.0;
- else
+ } else {
cr->rate_ratio_qdelta = 2.0;
- // Adjust some parameters for low resolutions at low bitrates.
- if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) {
- cr->motion_thresh = 4;
+ }
+ // Adjust some parameters for low resolutions.
+ if (cm->width <= 352 && cm->height <= 288) {
+ if (rc->avg_frame_bandwidth < 3000) {
+ cr->motion_thresh = 16;
+ cr->rate_boost_fac = 13;
+ } else {
+ cr->max_qdelta_perc = 70;
+ cr->rate_ratio_qdelta = AOMMAX(cr->rate_ratio_qdelta, 2.5);
+ }
+ }
+ if (cpi->oxcf.rc_mode == AOM_VBR) {
+ // To be adjusted for VBR mode, e.g., based on gf period and boost.
+ // For now use smaller qp-delta (than CBR), no second boosted seg, and
+    // turn refresh off on golden updates (since they are already boosted).
+ cr->percent_refresh = 10;
+ cr->rate_ratio_qdelta = 1.5;
cr->rate_boost_fac = 10;
- } else {
- cr->motion_thresh = 32;
- cr->rate_boost_fac = 17;
+ if (cpi->refresh_golden_frame == 1) {
+ cr->percent_refresh = 0;
+ cr->rate_ratio_qdelta = 1.0;
+ }
}
+ // Weight for segment prior to encoding: take the average of the target
+ // number for the frame to be encoded and the actual from the previous frame.
+  // Use the target if it's less. To be used for setting the base qp for the
+ // frame in vp9_rc_regulate_q.
+ target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+ weight_segment_target = (double)(target_refresh) / num4x4bl;
+ weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks +
+ cr->actual_num_seg2_blocks) >>
+ 1) /
+ num4x4bl;
+ if (weight_segment_target < 7 * weight_segment / 8)
+ weight_segment = weight_segment_target;
+ cr->weight_segment = weight_segment;
}
// Setup cyclic background refresh: set delta q and segmentation map.
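The rewritten av1_cyclic_refresh_update_parameters() now decides up front whether refresh applies at all, instead of deferring to the bitrate heuristic deleted from setup below. A compact sketch of just that gate (same field meanings as the patch, not the function verbatim):

    /* Sketch of the early-out above: no cyclic refresh on intra-only or
     * lossless frames, or when the average inter-frame QP is already low. */
    static int should_apply_cyclic_refresh(int is_intra_only, int is_lossless,
                                           int avg_inter_qindex,
                                           int best_quality) {
      const int qp_thresh =
          (best_quality << 1) < 20 ? (best_quality << 1) : 20; /* AOMMIN */
      return !(is_intra_only || is_lossless || avg_inter_qindex < qp_thresh);
    }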
@@ -486,7 +429,6 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
struct segmentation *const seg = &cm->seg;
- const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
int resolution_change =
cm->prev_frame && (cm->width != cm->prev_frame->width ||
cm->height != cm->prev_frame->height);
@@ -498,8 +440,7 @@ void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
return;
}
if (cm->current_frame.frame_number == 0) cr->low_content_avg = 0.0;
- // Don't apply refresh on key frame or enhancement layer frames.
- if (!apply_cyclic_refresh || cm->current_frame.frame_type == KEY_FRAME) {
+ if (!cr->apply_cyclic_refresh) {
// Set segmentation map to 0 and disable.
unsigned char *const seg_map = cpi->segmentation_map;
memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
diff --git a/libaom/av1/encoder/aq_cyclicrefresh.h b/libaom/av1/encoder/aq_cyclicrefresh.h
index b457819..ddabae6 100644
--- a/libaom/av1/encoder/aq_cyclicrefresh.h
+++ b/libaom/av1/encoder/aq_cyclicrefresh.h
@@ -54,19 +54,12 @@ void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
int mi_col, BLOCK_SIZE bsize,
int64_t rate, int64_t dist, int skip);
-// Update the segmentation map, and related quantities: cyclic refresh map,
-// refresh sb_index, and target number of blocks to be refreshed.
-void av1_cyclic_refresh_update_map(struct AV1_COMP *const cpi);
-
// Update the actual number of blocks that were applied the segment delta q.
void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi);
// Set golden frame update interval, for 1 pass CBR mode.
void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
-// Check if we should not update golden reference, based on past refresh stats.
-void av1_cyclic_refresh_check_golden_update(struct AV1_COMP *const cpi);
-
// Set/update global/frame level refresh parameters.
void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
diff --git a/libaom/av1/encoder/aq_variance.c b/libaom/av1/encoder/aq_variance.c
index cfd7610..d572948 100644
--- a/libaom/av1/encoder/aq_variance.c
+++ b/libaom/av1/encoder/aq_variance.c
@@ -121,7 +121,7 @@ int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
for (i = 0; i < bh; i += 4) {
for (j = 0; j < bw; j += 4) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
var +=
log(1.0 + cpi->fn_ptr[BLOCK_4X4].vf(
x->plane[0].src.buf + i * x->plane[0].src.stride + j,
@@ -153,7 +153,7 @@ static unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
uint8_t *buf = x->plane[0].src.buf;
const int bw = MI_SIZE * mi_size_wide[bs];
const int bh = MI_SIZE * mi_size_high[bs];
- int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int hbd = is_cur_buf_hbd(xd);
int var = 0;
for (int r = 0; r < bh; r += 8)
diff --git a/libaom/av1/encoder/av1_multi_thread.c b/libaom/av1/encoder/av1_multi_thread.c
index a0c556e..1260c7a 100644
--- a/libaom/av1/encoder/av1_multi_thread.c
+++ b/libaom/av1/encoder/av1_multi_thread.c
@@ -35,6 +35,14 @@ void av1_row_mt_mem_alloc(AV1_COMP *cpi, int max_sb_rows) {
&cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
tile_col];
av1_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_sb_rows);
+ if (cpi->oxcf.cdf_update_mode)
+ CHECK_MEM_ERROR(
+ cm, this_tile->row_ctx,
+ (FRAME_CONTEXT *)aom_memalign(
+ 16,
+ AOMMAX(1, (av1_get_sb_cols_in_tile(cm, this_tile->tile_info) -
+ 1)) *
+ sizeof(*this_tile->row_ctx)));
}
}
}
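The new allocation gives each tile one FRAME_CONTEXT per superblock-column hand-off (sb_cols - 1, clamped to at least 1) so row-threaded encoding can pass CDF state between rows when cdf_update_mode is on. A hedged sketch of the sizing, with a stand-in struct in place of FRAME_CONTEXT:

    #include <stdlib.h>

    typedef struct { int cdf[4]; } FrameCtxStub; /* stand-in, 16 bytes */

    /* Sketch: 16-byte-aligned row-context array for one tile, sized
     * AOMMAX(1, sb_cols - 1), mirroring the aom_memalign() call above. */
    static FrameCtxStub *alloc_row_ctx(int sb_cols) {
      const int n = (sb_cols - 1 > 1) ? sb_cols - 1 : 1;
      return (FrameCtxStub *)aligned_alloc(16, n * sizeof(FrameCtxStub));
    }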
@@ -53,6 +61,7 @@ void av1_row_mt_mem_dealloc(AV1_COMP *cpi) {
&cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols +
tile_col];
av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+ if (cpi->oxcf.cdf_update_mode) aom_free(this_tile->row_ctx);
}
}
multi_thread_ctxt->allocated_sb_rows = 0;
diff --git a/libaom/av1/encoder/av1_quantize.c b/libaom/av1/encoder/av1_quantize.c
index 21ab4db..ff1342c 100644
--- a/libaom/av1/encoder/av1_quantize.c
+++ b/libaom/av1/encoder/av1_quantize.c
@@ -41,47 +41,37 @@ static void quantize_fp_helper_c(
const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
const qm_val_t *iqm_ptr, int log_scale) {
int i, eob = -1;
+ const int rounding[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
// TODO(jingning) Decide the need of these arguments after the
// quantization process is completed.
(void)zbin_ptr;
(void)quant_shift_ptr;
+ (void)iscan;
memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
if (qm_ptr == NULL && iqm_ptr == NULL) {
- const int rounding0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
- { // rc == 0
- const int coeff = coeff_ptr[0];
- const int coeff_sign = (coeff >> 31);
- int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- if ((abs_coeff << (1 + log_scale)) >= (int32_t)(dequant_ptr[0])) {
- abs_coeff = clamp64(abs_coeff + rounding0, INT16_MIN, INT16_MAX);
- const int tmp32 = (int)((abs_coeff * quant_ptr[0]) >> (16 - log_scale));
- if (tmp32) {
- qcoeff_ptr[0] = (tmp32 ^ coeff_sign) - coeff_sign;
- const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[0]) >> log_scale;
- dqcoeff_ptr[0] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
- eob = 0;
- }
- }
- }
- const int rounding1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
- const int32_t thresh1 = (int32_t)(dequant_ptr[1]);
- for (i = 1; i < n_coeffs; i++) {
- const int coeff = coeff_ptr[i];
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int32_t thresh = (int32_t)(dequant_ptr[rc != 0]);
+ const int coeff = coeff_ptr[rc];
const int coeff_sign = (coeff >> 31);
int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- if ((abs_coeff << (1 + log_scale)) >= thresh1) {
- abs_coeff = clamp64(abs_coeff + rounding1, INT16_MIN, INT16_MAX);
- const int tmp32 = (int)((abs_coeff * quant_ptr[1]) >> (16 - log_scale));
+ int tmp32 = 0;
+ if ((abs_coeff << (1 + log_scale)) >= thresh) {
+ abs_coeff =
+ clamp64(abs_coeff + rounding[rc != 0], INT16_MIN, INT16_MAX);
+ tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
if (tmp32) {
- qcoeff_ptr[i] = (tmp32 ^ coeff_sign) - coeff_sign;
- const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[1]) >> log_scale;
- dqcoeff_ptr[i] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
- eob = AOMMAX(iscan[i], eob);
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const tran_low_t abs_dqcoeff =
+ (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
+ dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
}
}
+ if (tmp32) eob = i;
}
} else {
// Quantization pass: All coefficients with index >= zero_flag are
@@ -99,7 +89,7 @@ static void quantize_fp_helper_c(
int tmp32 = 0;
if (abs_coeff * wt >=
(dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
- abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ abs_coeff += rounding[rc != 0];
abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX);
tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
(16 - log_scale + AOM_QM_BITS));
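The rewritten helper folds the old rc == 0 special case and the AC loop into one body by indexing round/quant/dequant with `rc != 0`. Stripped of the scan-order and EOB bookkeeping, the per-coefficient arithmetic is roughly this (a sketch; it omits the clamp64() saturation the real code keeps):

    #include <stdint.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))

    /* Sketch of the forward 'fp' quantizer for one coefficient.
     * quant is a Q16 multiplier; dequant is the matching step size. */
    static int32_t quantize_one(int32_t coeff, int16_t round, int16_t quant,
                                int16_t dequant, int log_scale) {
      const int32_t sign = coeff >> 31; /* 0 or -1 */
      int64_t abs_coeff = (coeff ^ sign) - sign;
      /* Dead zone: below half a dequant step the output is zero. */
      if ((abs_coeff << (1 + log_scale)) < dequant) return 0;
      abs_coeff += ROUND_POWER_OF_TWO(round, log_scale);
      const int32_t tmp32 = (int32_t)((abs_coeff * quant) >> (16 - log_scale));
      return (tmp32 ^ sign) - sign; /* restore the sign */
    }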
@@ -275,32 +265,65 @@ void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
const qm_val_t *qm_ptr = qparam->qmatrix;
const qm_val_t *iqm_ptr = qparam->iqmatrix;
- if (qm_ptr != NULL && iqm_ptr != NULL) {
- quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
- p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
- dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
- sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ if (qparam->use_quant_b_adapt) {
+ // TODO(sarahparker) These quantize_b optimizations need SIMD
+ // implementations
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ quantize_b_adaptive_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX,
+ p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+ p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 1:
+ aom_quantize_b_32x32_adaptive(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_quantize_b_64x64_adaptive_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
} else {
- switch (qparam->log_scale) {
- case 0:
- aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
- p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
- dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
- sc->iscan);
- break;
- case 1:
- aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
- p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
- dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
- sc->iscan);
- break;
- case 2:
- aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
- p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
- dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
- sc->iscan);
- break;
- default: assert(0);
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 1:
+ aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ case 2:
+ aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan);
+ break;
+ default: assert(0);
+ }
}
}
}
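The facade now selects along two independent axes: use_quant_b_adapt picks the adaptive family, and log_scale (0 for blocks up to 16x16, 1 for 32x32, 2 for 64x64) picks the size variant within it. The same shape as a table lookup, with hypothetical pointers standing in for the aom_quantize_b* family:

    typedef void (*quant_fn)(void); /* stand-in for the quantize_b signature */

    /* Hypothetical 2x3 dispatch mirroring the nested switch above:
     * rows = plain vs. adaptive, columns = log_scale 0/1/2. */
    static quant_fn pick_quantizer(quant_fn table[2][3], int use_adapt,
                                   int log_scale) {
      return table[use_adapt ? 1 : 0][log_scale]; /* log_scale in {0, 1, 2} */
    }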
@@ -391,41 +414,81 @@ void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
const QUANT_PARAM *qparam) {
const qm_val_t *qm_ptr = qparam->qmatrix;
const qm_val_t *iqm_ptr = qparam->iqmatrix;
- if (qm_ptr != NULL && iqm_ptr != NULL) {
- highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
- p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
- dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
- sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ if (qparam->use_quant_b_adapt) {
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ highbd_quantize_b_adaptive_helper_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+ sc->scan, sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ if (LIKELY(n_coeffs >= 8)) {
+ aom_highbd_quantize_b_adaptive_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ } else {
+ // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
+ // quantization
+ aom_highbd_quantize_b_adaptive_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ }
+ break;
+ case 1:
+ aom_highbd_quantize_b_32x32_adaptive_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_highbd_quantize_b_64x64_adaptive_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);
+ }
+ }
} else {
- switch (qparam->log_scale) {
- case 0:
- if (LIKELY(n_coeffs >= 8)) {
- aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
- p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
- dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
- sc->iscan);
- } else {
- // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
- // quantization
- aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX,
+ if (qm_ptr != NULL && iqm_ptr != NULL) {
+ highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+ p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+ dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+ sc->iscan, qm_ptr, iqm_ptr, qparam->log_scale);
+ } else {
+ switch (qparam->log_scale) {
+ case 0:
+ if (LIKELY(n_coeffs >= 8)) {
+ aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX,
p->round_QTX, p->quant_QTX,
p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
- }
- break;
- case 1:
- aom_highbd_quantize_b_32x32(
- coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
- p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
- eob_ptr, sc->scan, sc->iscan);
- break;
- case 2:
- aom_highbd_quantize_b_64x64(
- coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
- p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
- eob_ptr, sc->scan, sc->iscan);
- break;
- default: assert(0);
+ } else {
+ // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
+ // quantization
+ aom_highbd_quantize_b_c(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ }
+ break;
+ case 1:
+ aom_highbd_quantize_b_32x32(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ case 2:
+ aom_highbd_quantize_b_64x64(
+ coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+ eob_ptr, sc->scan, sc->iscan);
+ break;
+ default: assert(0);
+ }
}
}
}
diff --git a/libaom/av1/encoder/av1_quantize.h b/libaom/av1/encoder/av1_quantize.h
index fb53881..6419265 100644
--- a/libaom/av1/encoder/av1_quantize.h
+++ b/libaom/av1/encoder/av1_quantize.h
@@ -22,11 +22,15 @@
extern "C" {
#endif
+#define EOB_FACTOR 325
+#define SKIP_EOB_FACTOR_ADJUST 200
+
typedef struct QUANT_PARAM {
int log_scale;
TX_SIZE tx_size;
const qm_val_t *qmatrix;
const qm_val_t *iqmatrix;
+ int use_quant_b_adapt;
} QUANT_PARAM;
typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
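Adding use_quant_b_adapt to QUANT_PARAM lets callers opt into the adaptive path without touching the AV1_QUANT_FACADE signature. A hedged usage sketch (assumes this header is included; field values are illustrative):

    #include "av1/encoder/av1_quantize.h" /* for QUANT_PARAM */

    /* Usage sketch: opt one call site into the adaptive quantizer. */
    static void set_qparam_adaptive(QUANT_PARAM *qparam) {
      qparam->log_scale = 0;         /* transform size <= 16x16 */
      qparam->qmatrix = NULL;        /* no quant matrices: fast path */
      qparam->iqmatrix = NULL;
      qparam->use_quant_b_adapt = 1; /* route the facade to *_adaptive */
    }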
diff --git a/libaom/av1/encoder/bitstream.c b/libaom/av1/encoder/bitstream.c
index df79b79..cbac2b2 100644
--- a/libaom/av1/encoder/bitstream.c
+++ b/libaom/av1/encoder/bitstream.c
@@ -145,7 +145,7 @@ static void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,
static void write_tx_size_vartx(MACROBLOCKD *xd, const MB_MODE_INFO *mbmi,
TX_SIZE tx_size, int depth, int blk_row,
int blk_col, aom_writer *w) {
- FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
@@ -369,10 +369,18 @@ static void pack_txb_tokens(aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x,
blk_col)];
if (tx_size == plane_tx_size || plane) {
- tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
- const uint16_t eob = x->mbmi_ext->eobs[plane][block];
- TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
- x->mbmi_ext->dc_sign_ctx[plane][block] };
+ const int txb_offset =
+ x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ tran_low_t *tcoeff_txb =
+ x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset;
+ uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset;
+ uint8_t *txb_skip_ctx_txb =
+ x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset;
+ int *dc_sign_ctx_txb =
+ x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset;
+ tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block);
+ const uint16_t eob = eob_txb[block];
+ TXB_CTX txb_ctx = { txb_skip_ctx_txb[block], dc_sign_ctx_txb[block] };
av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff,
eob, &txb_ctx);
#if CONFIG_RD_DEBUG
@@ -460,7 +468,7 @@ static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi,
// changing from lossless to lossy.
assert(is_inter_block(mbmi) || !cpi->has_lossless_segment);
- set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row,
+ set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type, mi_row,
mi_col, pred);
set_spatial_segment_id(cm, cpi->segmentation_map, mbmi->sb_type, mi_row,
mi_col, pred);
@@ -473,7 +481,7 @@ static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi,
av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1);
aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS);
- set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row,
+ set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type, mi_row,
mi_col, mbmi->segment_id);
}
@@ -627,7 +635,7 @@ static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,
av1_extract_interp_filter(mbmi->interp_filters, dir);
aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
SWITCHABLE_FILTERS);
- ++cpi->interp_filter_selected[0][filter];
+ ++cm->cur_frame->interp_filter_selected[filter];
if (cm->seq_params.enable_dual_filter == 0) return;
}
}
@@ -867,14 +875,7 @@ static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx,
static void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, aom_writer *w,
int skip, int mi_col, int mi_row) {
- if (cm->coded_lossless || cm->allow_intrabc) {
- // Initialize to indicate no CDEF for safety.
- cm->cdef_info.cdef_bits = 0;
- cm->cdef_info.cdef_strengths[0] = 0;
- cm->cdef_info.nb_cdef_strengths = 1;
- cm->cdef_info.cdef_uv_strengths[0] = 0;
- return;
- }
+ if (cm->coded_lossless || cm->allow_intrabc) return;
const int m = ~((1 << (6 - MI_SIZE_LOG2)) - 1);
const MB_MODE_INFO *mbmi =
@@ -903,7 +904,7 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
int mi_row, int mi_col, int skip,
int preskip) {
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
- const MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO *const mbmi = xd->mi[0];
AV1_COMMON *const cm = &cpi->common;
if (seg->update_map) {
@@ -913,7 +914,7 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
if (seg->segid_preskip) return;
if (skip) {
write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 1);
- if (seg->temporal_update) ((MB_MODE_INFO *)mbmi)->seg_id_predicted = 0;
+ if (seg->temporal_update) mbmi->seg_id_predicted = 0;
return;
}
}
@@ -925,7 +926,7 @@ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
}
if (pred_flag) {
- set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type,
+ set_spatial_segment_id(cm, cm->cur_frame->seg_map, mbmi->sb_type,
mi_row, mi_col, mbmi->segment_id);
}
} else {
@@ -1134,7 +1135,7 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w);
// First write idx to indicate current compound inter prediction mode group
- // Group A (0): jnt_comp, compound_average
+ // Group A (0): dist_wtd_comp, compound_average
// Group B (1): interintra, compound_diffwtd, wedge
if (has_second_ref(mbmi)) {
const int masked_compound_used = is_any_masked_compound_used(bsize) &&
@@ -1152,7 +1153,7 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
if (mbmi->compound_idx)
assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE);
- if (cm->seq_params.order_hint_info.enable_jnt_comp) {
+ if (cm->seq_params.order_hint_info.enable_dist_wtd_comp) {
const int comp_index_ctx = get_comp_index_context(cm, xd);
aom_write_symbol(w, mbmi->compound_idx,
ec_ctx->compound_index_cdf[comp_index_ctx], 2);
@@ -1169,9 +1170,9 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
- aom_write_symbol(w, mbmi->interinter_comp.type - 1,
+ aom_write_symbol(w, mbmi->interinter_comp.type - COMPOUND_WEDGE,
ec_ctx->compound_type_cdf[bsize],
- COMPOUND_TYPES - 1);
+ MASKED_COMPOUND_TYPES);
if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
@@ -1185,7 +1186,6 @@ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
}
}
}
-
write_mb_interp_filter(cpi, xd, w);
}
}
@@ -1237,13 +1237,14 @@ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
}
#if CONFIG_RD_DEBUG
-static void dump_mode_info(MODE_INFO *mi) {
+static void dump_mode_info(MB_MODE_INFO *mi) {
printf("\nmi->mi_row == %d\n", mi->mi_row);
printf("&& mi->mi_col == %d\n", mi->mi_col);
printf("&& mi->sb_type == %d\n", mi->sb_type);
printf("&& mi->tx_size == %d\n", mi->tx_size);
printf("&& mi->mode == %d\n", mi->mode);
}
+
static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
int plane) {
if (rd_stats->txb_coeff_cost[plane] != token_stats->cost) {
@@ -1274,30 +1275,28 @@ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
#if ENC_MISMATCH_DEBUG
static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) {
AV1_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
- xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
- const MB_MODE_INFO *const *mbmi = xd->mi[0];
+  const MB_MODE_INFO *const mbmi =
+ *(cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col));
+  const MB_MODE_INFO_EXT *const mbmi_ext =
+ cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
if (is_inter_block(mbmi)) {
#define FRAME_TO_CHECK 11
if (cm->current_frame.frame_number == FRAME_TO_CHECK &&
cm->show_frame == 1) {
const BLOCK_SIZE bsize = mbmi->sb_type;
- int_mv mv[2];
- int is_comp_ref = has_second_ref(mbmi);
- int ref;
+ int_mv mv[2] = { 0 };
+ const int is_comp_ref = has_second_ref(mbmi);
- for (ref = 0; ref < 1 + is_comp_ref; ++ref)
+ for (int ref = 0; ref < 1 + is_comp_ref; ++ref)
mv[ref].as_mv = mbmi->mv[ref].as_mv;
if (!is_comp_ref) {
mv[1].as_int = 0;
}
- MACROBLOCK *const x = &cpi->td.mb;
- const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
const int16_t mode_ctx =
- is_comp_ref ? mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]
+ is_comp_ref ? 0
: av1_mode_context_analyzer(mbmi_ext->mode_context,
mbmi->ref_frame);
@@ -1479,14 +1478,16 @@ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
row, col, &block[plane], plane);
}
}
+ }
#if CONFIG_RD_DEBUG
+ for (plane = 0; plane < num_planes && is_inter_block(mbmi); ++plane) {
if (mbmi->sb_type >= BLOCK_8X8 &&
rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
- dump_mode_info(m);
+ dump_mode_info(mbmi);
assert(0);
}
-#endif // CONFIG_RD_DEBUG
}
+#endif // CONFIG_RD_DEBUG
}
}
}
@@ -1875,8 +1876,8 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
assert(!cm->all_lossless);
const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
- WienerInfo *wiener_info = xd->wiener_info + plane;
- SgrprojInfo *sgrproj_info = xd->sgrproj_info + plane;
+ WienerInfo *ref_wiener_info = &xd->wiener_info[plane];
+ SgrprojInfo *ref_sgrproj_info = &xd->sgrproj_info[plane];
RestorationType unit_rtype = rui->restoration_type;
if (frame_rtype == RESTORE_SWITCHABLE) {
@@ -1887,10 +1888,10 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
#endif
switch (unit_rtype) {
case RESTORE_WIENER:
- write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w);
+ write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
break;
case RESTORE_SGRPROJ:
- write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w);
+ write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
break;
default: assert(unit_rtype == RESTORE_NONE); break;
}
@@ -1901,7 +1902,7 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
++counts->wiener_restore[unit_rtype != RESTORE_NONE];
#endif
if (unit_rtype != RESTORE_NONE) {
- write_wiener_filter(wiener_win, &rui->wiener_info, wiener_info, w);
+ write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener_info, w);
}
} else if (frame_rtype == RESTORE_SGRPROJ) {
aom_write_symbol(w, unit_rtype != RESTORE_NONE,
@@ -1910,7 +1911,7 @@ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
++counts->sgrproj_restore[unit_rtype != RESTORE_NONE];
#endif
if (unit_rtype != RESTORE_NONE) {
- write_sgrproj_filter(&rui->sgrproj_info, sgrproj_info, w);
+ write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj_info, w);
}
}
}
@@ -1941,13 +1942,9 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
aom_wb_write_bit(wb, lf->mode_ref_delta_update);
if (lf->mode_ref_delta_update) {
- const int prime_idx = cm->primary_ref_frame;
- const RefCntBuffer *const buf =
- prime_idx == PRIMARY_REF_NONE
- ? NULL
- : cm->current_frame.frame_refs[prime_idx].buf;
+ const RefCntBuffer *buf = get_primary_ref_frame_buf(cm);
int8_t last_ref_deltas[REF_FRAMES];
- if (prime_idx == PRIMARY_REF_NONE || buf == NULL) {
+ if (buf == NULL) {
av1_set_default_ref_deltas(last_ref_deltas);
} else {
memcpy(last_ref_deltas, buf->ref_deltas, REF_FRAMES);
@@ -1960,7 +1957,7 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
}
int8_t last_mode_deltas[MAX_MODE_LF_DELTAS];
- if (prime_idx == PRIMARY_REF_NONE || buf == NULL) {
+ if (buf == NULL) {
av1_set_default_mode_deltas(last_mode_deltas);
} else {
memcpy(last_mode_deltas, buf->mode_deltas, MAX_MODE_LF_DELTAS);
@@ -2076,15 +2073,6 @@ static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
}
}
-static void write_tx_mode(AV1_COMMON *cm, TX_MODE *mode,
- struct aom_write_bit_buffer *wb) {
- if (cm->coded_lossless) {
- *mode = ONLY_4X4;
- return;
- }
- aom_wb_write_bit(wb, *mode == TX_MODE_SELECT);
-}
-
static void write_frame_interp_filter(InterpFilter filter,
struct aom_write_bit_buffer *wb) {
aom_wb_write_bit(wb, filter == SWITCHABLE);
@@ -2092,29 +2080,6 @@ static void write_frame_interp_filter(InterpFilter filter,
aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
}
-static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) {
- if (cm->interp_filter == SWITCHABLE) {
- // Check to see if only one of the filters is actually used
- int count[SWITCHABLE_FILTERS];
- int i, j, c = 0;
- for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
- count[i] = 0;
- for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
- count[i] += counts->switchable_interp[j][i];
- c += (count[i] > 0);
- }
- if (c == 1) {
- // Only one filter is used. So set the filter at frame level
- for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
- if (count[i]) {
- if (i == EIGHTTAP_REGULAR) cm->interp_filter = i;
- break;
- }
- }
- }
- }
-}
-
 // Same function as write_uniform but writing to the uncompressed header wb
static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) {
const int l = get_unsigned_bits(n);
@@ -2212,63 +2177,12 @@ static void write_ext_tile_info(const AV1_COMMON *const cm,
}
}
-static int get_refresh_mask(AV1_COMP *cpi) {
- if ((cpi->common.current_frame.frame_type == KEY_FRAME &&
- cpi->common.show_frame) ||
- frame_is_sframe(&cpi->common))
- return 0xFF;
-
- int refresh_mask = 0;
-
- // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be
- // notified to get LAST3_FRAME refreshed and then the virtual indexes for all
- // the 3 LAST reference frames will be updated accordingly, i.e.:
- // (1) The original virtual index for LAST3_FRAME will become the new virtual
- // index for LAST_FRAME; and
- // (2) The original virtual indexes for LAST_FRAME and LAST2_FRAME will be
- // shifted and become the new virtual indexes for LAST2_FRAME and
- // LAST3_FRAME.
- refresh_mask |=
- (cpi->refresh_last_frame << get_ref_frame_map_idx(cpi, LAST3_FRAME));
-
-#if USE_SYMM_MULTI_LAYER
- const int bwd_ref_frame =
- (cpi->new_bwdref_update_rule == 1) ? EXTREF_FRAME : BWDREF_FRAME;
-#else
- const int bwd_ref_frame = BWDREF_FRAME;
-#endif
- refresh_mask |=
- (cpi->refresh_bwd_ref_frame << get_ref_frame_map_idx(cpi, bwd_ref_frame));
-
- refresh_mask |= (cpi->refresh_alt2_ref_frame
- << get_ref_frame_map_idx(cpi, ALTREF2_FRAME));
-
- if (av1_preserve_existing_gf(cpi)) {
- // We have decided to preserve the previously existing golden frame as our
- // new ARF frame. However, in the short term we leave it in the GF slot and,
- // if we're updating the GF with the current decoded frame, we save it
- // instead to the ARF slot.
- // Later, in the function av1_encoder.c:av1_update_reference_frames() we
- // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it
- // there so that it can be done outside of the recode loop.
- // Note: This is highly specific to the use of ARF as a forward reference,
- // and this needs to be generalized as other uses are implemented
- // (like RTC/temporal scalability).
-
- if (cpi->preserve_arf_as_gld) {
- return refresh_mask;
- } else {
- return refresh_mask | (cpi->refresh_golden_frame
- << get_ref_frame_map_idx(cpi, ALTREF_FRAME));
- }
- } else {
- const int arf_idx = get_ref_frame_map_idx(cpi, ALTREF_FRAME);
- return refresh_mask |
- (cpi->refresh_golden_frame
- << get_ref_frame_map_idx(cpi, GOLDEN_FRAME)) |
- (cpi->refresh_alt_ref_frame << arf_idx);
- }
-}
+// Stores the location and size of a tile's data in the bitstream. Used for
+// later identifying identical tiles.
+typedef struct TileBufferEnc {
+ uint8_t *data;
+ size_t size;
+} TileBufferEnc;
static INLINE int find_identical_tile(
const int tile_row, const int tile_col,
@@ -2289,18 +2203,18 @@ static INLINE int find_identical_tile(
int col_offset = candidate_offset[0].col;
int row = tile_row - row_offset;
int col = tile_col - col_offset;
- uint8_t tile_hdr;
const uint8_t *tile_data;
TileBufferEnc *candidate;
if (row < 0 || col < 0) continue;
- tile_hdr = *(tile_buffers[row][col].data);
+ const uint32_t tile_hdr = mem_get_le32(tile_buffers[row][col].data);
- // Read out tcm bit
- if ((tile_hdr >> 7) == 1) {
- // The candidate is a copy tile itself
- row_offset += tile_hdr & 0x7f;
+ // Read out tile-copy-mode bit:
+ if ((tile_hdr >> 31) == 1) {
+ // The candidate is a copy tile itself: the offset is stored in bits
+ // 30 through 24 inclusive.
+ row_offset += (tile_hdr >> 24) & 0x7f;
row = tile_row - row_offset;
}
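Moving from a byte to a 32-bit little-endian tile header relocates the copy flag from bit 7 to bit 31 and the 7-bit row offset to bits 30..24. A self-contained sketch of the decode (with mem_get_le32() re-implemented locally):

    #include <stdint.h>

    static uint32_t get_le32(const uint8_t *p) {
      return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
             ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }

    /* Sketch: parse the copy-tile header word read above. Returns 1 and
     * sets *row_offset when the tile is a copy of an earlier tile. */
    static int parse_copy_tile_hdr(const uint8_t *tile_data, int *row_offset) {
      const uint32_t hdr = get_le32(tile_data);
      if ((hdr >> 31) != 1) return 0;          /* not a copy tile */
      *row_offset = (int)((hdr >> 24) & 0x7f); /* bits 30..24 */
      return 1;
    }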
@@ -2370,14 +2284,13 @@ static void write_frame_size(const AV1_COMMON *cm, int frame_size_override,
write_render_size(cm, wb);
}
-static void write_frame_size_with_refs(AV1_COMP *cpi,
+static void write_frame_size_with_refs(const AV1_COMMON *const cm,
struct aom_write_bit_buffer *wb) {
- AV1_COMMON *const cm = &cpi->common;
int found = 0;
MV_REFERENCE_FRAME ref_frame;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, ref_frame);
+ const YV12_BUFFER_CONFIG *cfg = get_ref_frame_yv12_buf(cm, ref_frame);
if (cfg != NULL) {
found = cm->superres_upscaled_width == cfg->y_crop_width &&
@@ -2539,34 +2452,27 @@ static void write_tu_pts_info(AV1_COMMON *const cm,
cm->buffer_model.frame_presentation_time_length);
}
-static void write_film_grain_params(AV1_COMP *cpi,
+static void write_film_grain_params(const AV1_COMP *const cpi,
struct aom_write_bit_buffer *wb) {
- AV1_COMMON *const cm = &cpi->common;
- aom_film_grain_t *pars = &cm->film_grain_params;
-
- cm->cur_frame->film_grain_params = *pars;
+ const AV1_COMMON *const cm = &cpi->common;
+ const aom_film_grain_t *const pars = &cm->cur_frame->film_grain_params;
aom_wb_write_bit(wb, pars->apply_grain);
if (!pars->apply_grain) return;
aom_wb_write_literal(wb, pars->random_seed, 16);
- pars->random_seed += 3381; // Changing random seed for film grain
- if (!pars->random_seed) // Random seed should not be zero
- pars->random_seed += 7391;
if (cm->current_frame.frame_type == INTER_FRAME)
aom_wb_write_bit(wb, pars->update_parameters);
- else
- pars->update_parameters = 1;
+
if (!pars->update_parameters) {
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
- int ref_frame, ref_idx, buf_idx;
+ int ref_frame, ref_idx;
for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) {
- ref_idx = get_ref_frame_map_idx(cpi, ref_frame);
+ ref_idx = get_ref_frame_map_idx(cm, ref_frame);
assert(ref_idx != INVALID_IDX);
- buf_idx = cm->ref_frame_map[ref_idx];
- if (frame_bufs[buf_idx].film_grain_params_present &&
- memcmp(pars, &frame_bufs[buf_idx].film_grain_params, sizeof(*pars))) {
+ const RefCntBuffer *const buf = cm->ref_frame_map[ref_idx];
+ if (buf->film_grain_params_present &&
+ av1_check_grain_params_equiv(pars, &buf->film_grain_params)) {
break;
}
}
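Switching from memcmp() to av1_check_grain_params_equiv() matters because the grain seed is deliberately re-randomized per frame, so a bytewise comparison of the full struct could never match a reference's stored parameters. A sketch of the idea on a toy struct (field names illustrative; the real check covers the full parameter set):

    #include <string.h>

    typedef struct {
      int random_seed;  /* changes every frame by design */
      int num_y_points; /* ...remaining grain parameters... */
      int scaling_shift;
    } GrainToy;

    /* Sketch: 'equivalent' means equal in everything except the seed. */
    static int grain_equiv(const GrainToy *a, const GrainToy *b) {
      GrainToy ta = *a, tb = *b;
      ta.random_seed = tb.random_seed = 0; /* mask the per-frame seed */
      return memcmp(&ta, &tb, sizeof(ta)) == 0;
    }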
@@ -2582,16 +2488,16 @@ static void write_film_grain_params(AV1_COMP *cpi,
aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8);
}
- if (!cm->seq_params.monochrome)
+ if (!cm->seq_params.monochrome) {
aom_wb_write_bit(wb, pars->chroma_scaling_from_luma);
- else
- pars->chroma_scaling_from_luma = 0; // for monochrome override to 0
+ } else {
+ assert(!pars->chroma_scaling_from_luma);
+ }
if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
((cm->seq_params.subsampling_x == 1) &&
(cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) {
- pars->num_cb_points = 0;
- pars->num_cr_points = 0;
+ assert(pars->num_cb_points == 0 && pars->num_cr_points == 0);
} else {
aom_wb_write_literal(wb, pars->num_cb_points, 4); // max 10
for (int i = 0; i < pars->num_cb_points; i++) {
@@ -2651,7 +2557,7 @@ static void write_film_grain_params(AV1_COMP *cpi,
aom_wb_write_bit(wb, pars->clip_to_restricted_range);
}
-static void write_sb_size(SequenceHeader *seq_params,
+static void write_sb_size(const SequenceHeader *const seq_params,
struct aom_write_bit_buffer *wb) {
(void)seq_params;
(void)wb;
@@ -2662,41 +2568,16 @@ static void write_sb_size(SequenceHeader *seq_params,
aom_wb_write_bit(wb, seq_params->sb_size == BLOCK_128X128 ? 1 : 0);
}
-static void write_sequence_header(AV1_COMP *cpi,
+static void write_sequence_header(const SequenceHeader *const seq_params,
struct aom_write_bit_buffer *wb) {
- AV1_COMMON *const cm = &cpi->common;
- SequenceHeader *seq_params = &cm->seq_params;
-
- int max_frame_width = cpi->oxcf.forced_max_frame_width
- ? cpi->oxcf.forced_max_frame_width
- : cpi->oxcf.width;
- int max_frame_height = cpi->oxcf.forced_max_frame_height
- ? cpi->oxcf.forced_max_frame_height
- : cpi->oxcf.height;
- // max((int)ceil(log2(max_frame_width)), 1)
- const int num_bits_width =
- (max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1;
- // max((int)ceil(log2(max_frame_height)), 1)
- const int num_bits_height =
- (max_frame_height > 1) ? get_msb(max_frame_height - 1) + 1 : 1;
- assert(num_bits_width <= 16);
- assert(num_bits_height <= 16);
-
- seq_params->num_bits_width = num_bits_width;
- seq_params->num_bits_height = num_bits_height;
- seq_params->max_frame_width = max_frame_width;
- seq_params->max_frame_height = max_frame_height;
-
- aom_wb_write_literal(wb, num_bits_width - 1, 4);
- aom_wb_write_literal(wb, num_bits_height - 1, 4);
- aom_wb_write_literal(wb, max_frame_width - 1, num_bits_width);
- aom_wb_write_literal(wb, max_frame_height - 1, num_bits_height);
-
- /* Placeholder for actually writing to the bitstream */
- if (!seq_params->reduced_still_picture_hdr) {
- seq_params->frame_id_length = FRAME_ID_LENGTH;
- seq_params->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
+ aom_wb_write_literal(wb, seq_params->num_bits_width - 1, 4);
+ aom_wb_write_literal(wb, seq_params->num_bits_height - 1, 4);
+ aom_wb_write_literal(wb, seq_params->max_frame_width - 1,
+ seq_params->num_bits_width);
+ aom_wb_write_literal(wb, seq_params->max_frame_height - 1,
+ seq_params->num_bits_height);
+ if (!seq_params->reduced_still_picture_hdr) {
aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag);
if (seq_params->frame_id_numbers_present_flag) {
// We must always have delta_frame_id_length < frame_id_length,
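The deleted block (now computed once when the sequence header is initialized) derives the minimal field widths as num_bits_width = max(ceil(log2(max_frame_width)), 1), realized through get_msb(max_frame_width - 1) + 1. A quick check of that identity, with get_msb() re-implemented as a plain loop:

    #include <assert.h>

    /* Portable stand-in for get_msb(): index of the highest set bit. */
    static int msb(unsigned v) {
      int n = -1;
      while (v) { v >>= 1; n++; }
      return n;
    }

    int main(void) {
      assert(msb(1920 - 1) + 1 == 11); /* 2^11 = 2048 >= 1920 */
      assert(msb(1024 - 1) + 1 == 10); /* exact powers of two stay tight */
      return 0;
    }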
@@ -2724,7 +2605,7 @@ static void write_sequence_header(AV1_COMP *cpi,
aom_wb_write_bit(wb, seq_params->order_hint_info.enable_order_hint);
if (seq_params->order_hint_info.enable_order_hint) {
- aom_wb_write_bit(wb, seq_params->order_hint_info.enable_jnt_comp);
+ aom_wb_write_bit(wb, seq_params->order_hint_info.enable_dist_wtd_comp);
aom_wb_write_bit(wb, seq_params->order_hint_info.enable_ref_frame_mvs);
}
if (seq_params->force_screen_content_tools == 2) {
@@ -2821,7 +2702,7 @@ static void write_global_motion(AV1_COMP *cpi,
// does not work currently and causes mismatches when resize is on.
// Fix it before turning the optimization back on.
/*
- YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_buffer(cpi, frame);
+ YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_yv12_buf(cpi, frame);
if (cpi->source->y_crop_width == ref_buf->y_crop_width &&
cpi->source->y_crop_height == ref_buf->y_crop_height) {
write_global_motion_params(&cm->global_motion[frame],
@@ -2842,78 +2723,72 @@ static void write_global_motion(AV1_COMP *cpi,
}
}
-static void check_frame_refs_short_signaling(AV1_COMP *const cpi) {
- AV1_COMMON *const cm = &cpi->common;
- if (!cm->frame_refs_short_signaling) return;
-
+static int check_frame_refs_short_signaling(AV1_COMMON *const cm) {
// Check whether all references are distinct frames.
- int buf_markers[FRAME_BUFFERS] = { 0 };
+ const RefCntBuffer *seen_bufs[FRAME_BUFFERS] = { NULL };
+ int num_refs = 0;
for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
- if (buf_idx != INVALID_IDX) {
- assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS);
- buf_markers[buf_idx] = 1;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (buf != NULL) {
+ int seen = 0;
+ for (int i = 0; i < num_refs; i++) {
+ if (seen_bufs[i] == buf) {
+ seen = 1;
+ break;
+ }
+ }
+ if (!seen) seen_bufs[num_refs++] = buf;
}
}
- int num_refs = 0;
- for (int buf_idx = 0; buf_idx < FRAME_BUFFERS; ++buf_idx) {
- num_refs += buf_markers[buf_idx];
- }
-
// We only turn on frame_refs_short_signaling when all references are
// distinct.
if (num_refs < INTER_REFS_PER_FRAME) {
     // It indicates that more than one reference frame points to the same
     // reference buffer, i.e. two or more references are duplicates.
- cm->frame_refs_short_signaling = 0;
- return;
+ return 0;
}
// Check whether the encoder side ref frame choices are aligned with that to
// be derived at the decoder side.
- RefBuffer frame_refs_copy[INTER_REFS_PER_FRAME];
+ int remapped_ref_idx_decoder[REF_FRAMES];
- // Backup the frame refs info
- memcpy(frame_refs_copy, cm->current_frame.frame_refs,
- INTER_REFS_PER_FRAME * sizeof(RefBuffer));
-
- const int lst_map_idx = get_ref_frame_map_idx(cpi, LAST_FRAME);
- const int gld_map_idx = get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+ const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+ const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
// Set up the frame refs mapping indexes according to the
// frame_refs_short_signaling policy.
- av1_set_frame_refs(cm, lst_map_idx, gld_map_idx);
+ av1_set_frame_refs(cm, remapped_ref_idx_decoder, lst_map_idx, gld_map_idx);
// We only turn on frame_refs_short_signaling when the encoder side decision
// on ref frames is identical to that at the decoder side.
+ int frame_refs_short_signaling = 1;
for (int ref_idx = 0; ref_idx < INTER_REFS_PER_FRAME; ++ref_idx) {
// Compare the buffer index between two reference frames indexed
// respectively by the encoder and the decoder side decisions.
- if (cm->current_frame.frame_refs[ref_idx].buf !=
- frame_refs_copy[ref_idx].buf) {
- cm->frame_refs_short_signaling = 0;
+ RefCntBuffer *ref_frame_buf_new = NULL;
+ if (remapped_ref_idx_decoder[ref_idx] != INVALID_IDX) {
+ ref_frame_buf_new = cm->ref_frame_map[remapped_ref_idx_decoder[ref_idx]];
+ }
+ if (get_ref_frame_buf(cm, LAST_FRAME + ref_idx) != ref_frame_buf_new) {
+ frame_refs_short_signaling = 0;
break;
}
}
#if 0 // For debug
printf("\nFrame=%d: \n", cm->current_frame.frame_number);
- printf("***frame_refs_short_signaling=%d\n", cm->frame_refs_short_signaling);
+ printf("***frame_refs_short_signaling=%d\n", frame_refs_short_signaling);
for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- printf("enc_ref(map_idx=%d, buf_idx=%d)=%d, vs. "
+ printf("enc_ref(map_idx=%d)=%d, vs. "
"dec_ref(map_idx=%d)=%d\n",
- get_ref_frame_map_idx(cpi, ref_frame),
- get_ref_frame_buf_idx(cpi, ref_frame), ref_frame,
- cm->current_frame.frame_refs[ref_frame - LAST_FRAME].map_idx,
+ get_ref_frame_map_idx(cm, ref_frame), ref_frame,
+ cm->remapped_ref_idx[ref_frame - LAST_FRAME],
ref_frame);
}
#endif // 0
- // Restore the frame refs info if frame_refs_short_signaling is off.
- if (!cm->frame_refs_short_signaling)
- memcpy(cm->current_frame.frame_refs, frame_refs_copy,
- INTER_REFS_PER_FRAME * sizeof(RefBuffer));
+ return frame_refs_short_signaling;
}
// New function based on HLS R18
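The rewritten check counts distinct reference buffers with a small "seen" list over the seven inter references instead of marking a FRAME_BUFFERS-wide array. The de-duplication itself, as a generic sketch over pointer identity:

    /* Sketch: count distinct non-NULL entries among n reference pointers.
     * Mirrors the seen_bufs loop above; O(n^2) is fine for n == 7. */
    static int count_distinct_refs(const void *refs[], int n) {
      const void *seen[8];
      int num = 0;
      for (int i = 0; i < n; i++) {
        if (refs[i] == NULL) continue;
        int dup = 0;
        for (int j = 0; j < num; j++)
          if (seen[j] == refs[i]) { dup = 1; break; }
        if (!dup) seen[num++] = refs[i];
      }
      return num;
    }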
@@ -2925,10 +2800,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
CurrentFrame *const current_frame = &cm->current_frame;
- // NOTE: By default all coded frames to be used as a reference
- cm->is_reference_frame = 1;
- current_frame->frame_type =
- current_frame->intra_only ? INTRA_ONLY_FRAME : current_frame->frame_type;
+ current_frame->frame_refs_short_signaling = 0;
if (seq_params->still_picture) {
assert(cm->show_existing_frame == 0);
@@ -2937,17 +2809,6 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
}
if (!seq_params->reduced_still_picture_hdr) {
if (encode_show_existing_frame(cm)) {
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
- const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
-
- if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
- aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
- "Buffer %d does not contain a reconstructed frame",
- frame_to_show);
- }
- assign_frame_buffer(frame_bufs, &cm->new_fb_idx, frame_to_show);
- cm->cur_frame = &frame_bufs[cm->new_fb_idx];
-
aom_wb_write_bit(wb, 1); // show_existing_frame
aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
@@ -2960,14 +2821,6 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
aom_wb_write_literal(wb, display_frame_id, frame_id_len);
}
-
- if (cm->reset_decoder_state &&
- frame_bufs[frame_to_show].frame_type != KEY_FRAME) {
- aom_internal_error(
- &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
- "show_existing_frame to reset state on KEY_FRAME only");
- }
-
return;
} else {
aom_wb_write_bit(wb, 0); // show_existing_frame
@@ -3008,29 +2861,28 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
assert(cm->cur_frame_force_integer_mv == 0);
}
- cm->invalid_delta_frame_id_minus_1 = 0;
int frame_size_override_flag = 0;
- cm->frame_refs_short_signaling = 0;
if (seq_params->reduced_still_picture_hdr) {
- assert(cm->width == seq_params->max_frame_width &&
- cm->height == seq_params->max_frame_height);
+ assert(cm->superres_upscaled_width == seq_params->max_frame_width &&
+ cm->superres_upscaled_height == seq_params->max_frame_height);
} else {
if (seq_params->frame_id_numbers_present_flag) {
int frame_id_len = seq_params->frame_id_length;
aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
}
- if (cm->width > seq_params->max_frame_width ||
- cm->height > seq_params->max_frame_height) {
+ if (cm->superres_upscaled_width > seq_params->max_frame_width ||
+ cm->superres_upscaled_height > seq_params->max_frame_height) {
aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
"Frame dimensions are larger than the maximum values");
}
frame_size_override_flag =
- frame_is_sframe(cm) ? 1
- : (cm->width != seq_params->max_frame_width ||
- cm->height != seq_params->max_frame_height);
+ frame_is_sframe(cm)
+ ? 1
+ : (cm->superres_upscaled_width != seq_params->max_frame_width ||
+ cm->superres_upscaled_height != seq_params->max_frame_height);
if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag);
if (seq_params->order_hint_info.enable_order_hint)
@@ -3069,70 +2921,21 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
}
}
}
- cpi->refresh_frame_mask = get_refresh_mask(cpi);
- if (current_frame->frame_type == KEY_FRAME) {
- if (!cm->show_frame) { // unshown keyframe (forward keyframe)
- aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
- } else {
- assert(cpi->refresh_frame_mask == 0xFF);
- }
- } else {
- if (current_frame->frame_type == INTRA_ONLY_FRAME) {
- assert(cpi->refresh_frame_mask != 0xFF);
- int updated_fb = -1;
- for (int i = 0; i < REF_FRAMES; i++) {
- // If more than one frame is refreshed, it doesn't matter which one
- // we pick, so pick the first.
- if (cpi->refresh_frame_mask & (1 << i)) {
- updated_fb = i;
- break;
- }
- }
- assert(updated_fb >= 0);
- cm->fb_of_context_type[cm->frame_context_idx] = updated_fb;
- aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
- } else if (current_frame->frame_type == INTER_FRAME ||
- frame_is_sframe(cm)) {
- if (current_frame->frame_type == INTER_FRAME) {
- aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
- } else {
- assert(frame_is_sframe(cm) && cpi->refresh_frame_mask == 0xFF);
- }
- int updated_fb = -1;
- for (int i = 0; i < REF_FRAMES; i++) {
- // If more than one frame is refreshed, it doesn't matter which one
- // we pick, so pick the first.
- if (cpi->refresh_frame_mask & (1 << i)) {
- updated_fb = i;
- break;
- }
- }
- // large scale tile sometimes won't refresh any fbs
- if (updated_fb >= 0) {
- cm->fb_of_context_type[cm->frame_context_idx] = updated_fb;
- }
- if (!cpi->refresh_frame_mask) {
- // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
- // will not be used as a reference
- cm->is_reference_frame = 0;
- }
- }
- }
+  // Shown keyframes and switch-frames automatically refresh all reference
+ // frames. For all other frame types, we need to write refresh_frame_flags.
+ if ((current_frame->frame_type == KEY_FRAME && !cm->show_frame) ||
+ current_frame->frame_type == INTER_FRAME ||
+ current_frame->frame_type == INTRA_ONLY_FRAME)
+ aom_wb_write_literal(wb, current_frame->refresh_frame_flags, REF_FRAMES);
- if (!frame_is_intra_only(cm) || cpi->refresh_frame_mask != 0xFF) {
+ if (!frame_is_intra_only(cm) || current_frame->refresh_frame_flags != 0xff) {
// Write all ref frame order hints if error_resilient_mode == 1
if (cm->error_resilient_mode &&
seq_params->order_hint_info.enable_order_hint) {
- RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
- // Get buffer index
- const int buf_idx = cm->ref_frame_map[ref_idx];
- assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS);
-
- // Write order hint to bit stream
aom_wb_write_literal(
- wb, frame_bufs[buf_idx].order_hint,
+ wb, cm->ref_frame_map[ref_idx]->order_hint,
seq_params->order_hint_info.order_hint_bits_minus_1 + 1);
}
}
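refresh_frame_flags is an eight-bit mask over the reference slots, and the simplified writer emits it exactly when the decoder cannot infer it: shown key frames and S-frames imply 0xFF. The inference rule as a tiny sketch (frame-type constants illustrative):

    enum { KEY = 0, INTER = 1, INTRA_ONLY = 2, SWITCH = 3 };

    /* Sketch of the branch above: is refresh_frame_flags written explicitly?
     * Shown key frames and S-frames are inferred as 0xFF by the decoder. */
    static int writes_refresh_flags(int frame_type, int show_frame) {
      if (frame_type == SWITCH) return 0;            /* inferred 0xFF */
      if (frame_type == KEY && show_frame) return 0; /* inferred 0xFF */
      return 1; /* unshown key, inter, intra-only: 8 bits in the header */
    }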
@@ -3143,8 +2946,6 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
assert(!av1_superres_scaled(cm) || !cm->allow_intrabc);
if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
aom_wb_write_bit(wb, cm->allow_intrabc);
- // all eight fbs are refreshed, pick one that will live long enough
- cm->fb_of_context_type[REGULAR_FRAME] = 0;
} else {
if (current_frame->frame_type == INTRA_ONLY_FRAME) {
write_frame_size(cm, frame_size_override_flag, wb);
@@ -3159,36 +2960,37 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
// automatically.
#define FRAME_REFS_SHORT_SIGNALING 0
#if FRAME_REFS_SHORT_SIGNALING
- cm->frame_refs_short_signaling =
+ current_frame->frame_refs_short_signaling =
seq_params->order_hint_info.enable_order_hint;
#endif // FRAME_REFS_SHORT_SIGNALING
- if (cm->frame_refs_short_signaling) {
+ if (current_frame->frame_refs_short_signaling) {
// NOTE(zoeliu@google.com):
// An example solution for encoder-side implementation of frame refs
// short signaling, which is only turned on when the encoder-side
// decision on ref frames is identical to that at the decoder side.
- check_frame_refs_short_signaling(cpi);
+ current_frame->frame_refs_short_signaling =
+ check_frame_refs_short_signaling(cm);
}
if (seq_params->order_hint_info.enable_order_hint)
- aom_wb_write_bit(wb, cm->frame_refs_short_signaling);
+ aom_wb_write_bit(wb, current_frame->frame_refs_short_signaling);
- if (cm->frame_refs_short_signaling) {
- const int lst_ref = get_ref_frame_map_idx(cpi, LAST_FRAME);
+ if (current_frame->frame_refs_short_signaling) {
+ const int lst_ref = get_ref_frame_map_idx(cm, LAST_FRAME);
aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2);
- const int gld_ref = get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+ const int gld_ref = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2);
}
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
- if (!cm->frame_refs_short_signaling)
- aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+ assert(get_ref_frame_map_idx(cm, ref_frame) != INVALID_IDX);
+ if (!current_frame->frame_refs_short_signaling)
+ aom_wb_write_literal(wb, get_ref_frame_map_idx(cm, ref_frame),
REF_FRAMES_LOG2);
if (seq_params->frame_id_numbers_present_flag) {
- int i = get_ref_frame_map_idx(cpi, ref_frame);
+ int i = get_ref_frame_map_idx(cm, ref_frame);
int frame_id_len = seq_params->frame_id_length;
int diff_len = seq_params->delta_frame_id_length;
int delta_frame_id_minus_1 =
@@ -3197,24 +2999,22 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
(1 << frame_id_len)) -
1;
if (delta_frame_id_minus_1 < 0 ||
- delta_frame_id_minus_1 >= (1 << diff_len))
- cm->invalid_delta_frame_id_minus_1 = 1;
+ delta_frame_id_minus_1 >= (1 << diff_len)) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+ "Invalid delta_frame_id_minus_1");
+ }
aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len);
}
}
if (!cm->error_resilient_mode && frame_size_override_flag) {
- write_frame_size_with_refs(cpi, wb);
+ write_frame_size_with_refs(cm, wb);
} else {
write_frame_size(cm, frame_size_override_flag, wb);
}
- if (cm->cur_frame_force_integer_mv) {
- cm->allow_high_precision_mv = 0;
- } else {
+ if (!cm->cur_frame_force_integer_mv)
aom_wb_write_bit(wb, cm->allow_high_precision_mv);
- }
- fix_interp_filter(cm, cpi->td.counts);
write_frame_interp_filter(cm->interp_filter, wb);
aom_wb_write_bit(wb, cm->switchable_motion_mode);
if (frame_might_allow_ref_frame_mvs(cm)) {
@@ -3228,7 +3028,7 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
const int might_bwd_adapt =
!(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
if (cm->large_scale_tile)
- cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+ assert(cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
if (might_bwd_adapt) {
aom_wb_write_bit(
@@ -3268,9 +3068,13 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
encode_restoration_mode(cm, wb);
}
- write_tx_mode(cm, &cm->tx_mode, wb);
+ // Write TX mode
+ if (cm->coded_lossless)
+ assert(cm->tx_mode == ONLY_4X4);
+ else
+ aom_wb_write_bit(wb, cm->tx_mode == TX_MODE_SELECT);
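+ // (When coded_lossless is set the decoder infers tx_mode == ONLY_4X4, so
+ // no bit needs to be written in that case.)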
- if (cpi->allow_comp_inter_inter) {
+ if (!frame_is_intra_only(cm)) {
const int use_hybrid_pred =
current_frame->reference_mode == REFERENCE_MODE_SELECT;
@@ -3290,19 +3094,9 @@ static void write_uncompressed_header_obu(AV1_COMP *cpi,
if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
if (seq_params->film_grain_params_present &&
- (cm->show_frame || cm->showable_frame)) {
- int flip_back_update_parameters_flag = 0;
- if (current_frame->frame_type != INTER_FRAME &&
- cm->film_grain_params.update_parameters == 0) {
- cm->film_grain_params.update_parameters = 1;
- flip_back_update_parameters_flag = 1;
- }
+ (cm->show_frame || cm->showable_frame))
write_film_grain_params(cpi, wb);
- if (flip_back_update_parameters_flag)
- cm->film_grain_params.update_parameters = 0;
- }
-
if (cm->large_scale_tile) write_ext_tile_info(cm, saved_wb, wb);
}
@@ -3440,8 +3234,12 @@ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
return wpos;
}
-uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
- uint8_t *const dst) {
+uint32_t av1_write_obu_header(AV1_COMP *const cpi, OBU_TYPE obu_type,
+ int obu_extension, uint8_t *const dst) {
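+ // Track the number of frame headers written when level stats are being
+ // kept (keep_level_stats); OBU_FRAME also carries a frame header.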
+ if (cpi->keep_level_stats &&
+ (obu_type == OBU_FRAME || obu_type == OBU_FRAME_HEADER))
+ ++cpi->frame_header_count;
+
struct aom_write_bit_buffer wb = { dst, 0 };
uint32_t size = 0;
@@ -3493,9 +3291,8 @@ static void add_trailing_bits(struct aom_write_bit_buffer *wb) {
}
}
-static void write_bitstream_level(BitstreamLevel bl,
+static void write_bitstream_level(AV1_LEVEL seq_level_idx,
struct aom_write_bit_buffer *wb) {
- uint8_t seq_level_idx = major_minor_to_seq_level_idx(bl);
assert(is_valid_seq_level_idx(seq_level_idx));
aom_wb_write_literal(wb, seq_level_idx, LEVEL_BITS);
}
@@ -3518,7 +3315,7 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
assert(cm->timing_info_present == 0);
assert(cm->seq_params.decoder_model_info_present_flag == 0);
assert(cm->seq_params.display_model_info_present_flag == 0);
- write_bitstream_level(cm->seq_params.level[0], &wb);
+ write_bitstream_level(cm->seq_params.seq_level_idx[0], &wb);
} else {
aom_wb_write_bit(&wb, cm->timing_info_present); // timing info present flag
@@ -3537,8 +3334,8 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
for (i = 0; i < cm->seq_params.operating_points_cnt_minus_1 + 1; i++) {
aom_wb_write_literal(&wb, cm->seq_params.operating_point_idc[i],
OP_POINTS_IDC_BITS);
- write_bitstream_level(cm->seq_params.level[i], &wb);
- if (cm->seq_params.level[i].major > 3)
+ write_bitstream_level(cm->seq_params.seq_level_idx[i], &wb);
+ if (cm->seq_params.seq_level_idx[i] >= SEQ_LEVEL_4_0)
aom_wb_write_bit(&wb, cm->seq_params.tier[i]);
if (cm->seq_params.decoder_model_info_present_flag) {
aom_wb_write_bit(&wb,
@@ -3557,7 +3354,7 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
}
}
}
- write_sequence_header(cpi, &wb);
+ write_sequence_header(&cm->seq_params, &wb);
write_color_config(&cm->seq_params, &wb);
@@ -3607,11 +3404,13 @@ typedef struct {
static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
struct aom_write_bit_buffer *saved_wb,
uint8_t obu_extension_header,
- const FrameHeaderInfo *fh_info) {
+ const FrameHeaderInfo *fh_info,
+ int *const largest_tile_id) {
AV1_COMMON *const cm = &cpi->common;
aom_writer mode_bc;
int tile_row, tile_col;
- TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
+ // Store the location and size of each tile's data in the bitstream:
+ TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
uint32_t total_size = 0;
const int tile_cols = cm->tile_cols;
const int tile_rows = cm->tile_rows;
@@ -3632,13 +3431,13 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
const int have_tiles = tile_cols * tile_rows > 1;
int first_tg = 1;
- cm->largest_tile_id = 0;
+ *largest_tile_id = 0;
if (cm->large_scale_tile) {
// For large_scale_tile case, we always have only one tile group, so it can
// be written as an OBU_FRAME.
const OBU_TYPE obu_type = OBU_FRAME;
- const uint32_t tg_hdr_size = write_obu_header(obu_type, 0, data);
+ const uint32_t tg_hdr_size = av1_write_obu_header(cpi, obu_type, 0, data);
data += tg_hdr_size;
const uint32_t frame_header_size =
@@ -3685,8 +3484,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
// If CONFIG_EXT_TILE = 1, every tile in the row has a header,
// even for the last one, unless no tiling is used at all.
total_size += data_offset;
- // Initialise tile context from the frame context
- this_tile->tctx = *cm->fc;
cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
mode_bc.allow_update_cdf = !cm->large_scale_tile;
mode_bc.allow_update_cdf =
@@ -3700,7 +3497,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
// Record the maximum tile size we see, so we can compact headers later.
if (tile_size > max_tile_size) {
max_tile_size = tile_size;
- cm->largest_tile_id = tile_cols * tile_row + tile_col;
+ *largest_tile_id = tile_cols * tile_row + tile_col;
}
if (have_tiles) {
@@ -3718,6 +3515,9 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
const int identical_tile_offset =
find_identical_tile(tile_row, tile_col, tile_buffers);
+ // Indicate a copy-tile by setting the most significant bit.
+ // The row-offset to copy from is stored in the highest byte.
+ // remux_tiles() will move these around later.
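+ // e.g. identical_tile_offset == 2 gives tile_header = 0x82 below: the
+ // copy flag (0x80) plus the row-offset to copy from.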
if (identical_tile_offset > 0) {
tile_size = 0;
tile_header = identical_tile_offset | 0x80;
@@ -3792,7 +3592,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
const OBU_TYPE obu_type =
(num_tg_hdrs == 1) ? OBU_FRAME : OBU_TILE_GROUP;
curr_tg_data_size =
- write_obu_header(obu_type, obu_extension_header, data);
+ av1_write_obu_header(cpi, obu_type, obu_extension_header, data);
obu_header_size = curr_tg_data_size;
if (num_tg_hdrs == 1) {
@@ -3823,8 +3623,6 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
// The last tile of the tile group does not have a header.
if (!is_last_tile_in_tg) total_size += 4;
- // Initialise tile context from the frame context
- this_tile->tctx = *cm->fc;
cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
mode_bc.allow_update_cdf = 1;
mode_bc.allow_update_cdf =
@@ -3841,7 +3639,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 0 : 4));
buf->size = tile_size;
if (tile_size > max_tile_size) {
- cm->largest_tile_id = tile_cols * tile_row + tile_col;
+ *largest_tile_id = tile_cols * tile_row + tile_col;
max_tile_size = tile_size;
}
@@ -3876,12 +3674,13 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
// Force context update tile to be the first tile in error
// resilient mode as the duplicate frame headers will have
// context_update_tile_id set to 0
- cm->largest_tile_id = 0;
+ *largest_tile_id = 0;
// Rewrite the OBU header to change the OBU type to Redundant Frame
// Header.
- write_obu_header(OBU_REDUNDANT_FRAME_HEADER, obu_extension_header,
- &data[fh_info->obu_header_byte_offset]);
+ av1_write_obu_header(cpi, OBU_REDUNDANT_FRAME_HEADER,
+ obu_extension_header,
+ &data[fh_info->obu_header_byte_offset]);
data += fh_info->total_length;
@@ -3899,7 +3698,7 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
// Fill in context_update_tile_id indicating the tile to use for the
// cdf update. The encoder currently sets it to the largest tile
// (but this is up to the encoder)
- aom_wb_overwrite_literal(saved_wb, cm->largest_tile_id,
+ aom_wb_overwrite_literal(saved_wb, *largest_tile_id,
cm->log2_tile_cols + cm->log2_tile_rows);
// If more than one tile group, tile_size_bytes takes the default value 4
// and does not need to be set. For a single tile group it is set in the
@@ -3945,7 +3744,8 @@ static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
return total_size;
}
-int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+ int *const largest_tile_id) {
uint8_t *data = dst;
uint32_t data_size;
AV1_COMMON *const cm = &cpi->common;
@@ -3959,11 +3759,13 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
bitstream_queue_reset_write();
#endif
+ cpi->frame_header_count = 0;
+
// The TD is now written outside the frame encode loop
// write sequence header obu if KEY_FRAME, preceded by 4-byte size
if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) {
- obu_header_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data);
+ obu_header_size = av1_write_obu_header(cpi, OBU_SEQUENCE_HEADER, 0, data);
obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size);
const size_t length_field_size =
@@ -3983,7 +3785,7 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
// Write Frame Header OBU.
fh_info.frame_header = data;
obu_header_size =
- write_obu_header(OBU_FRAME_HEADER, obu_extension_header, data);
+ av1_write_obu_header(cpi, OBU_FRAME_HEADER, obu_extension_header, data);
obu_payload_size =
write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1);
@@ -4009,8 +3811,8 @@ int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
} else {
// Each tile group obu will be preceded by 4-byte size of the tile group
// obu
- data_size = write_tiles_in_tg_obus(cpi, data, &saved_wb,
- obu_extension_header, &fh_info);
+ data_size = write_tiles_in_tg_obus(
+ cpi, data, &saved_wb, obu_extension_header, &fh_info, largest_tile_id);
}
data += data_size;
*size = data - dst;
diff --git a/libaom/av1/encoder/bitstream.h b/libaom/av1/encoder/bitstream.h
index 465ccae..b05d0d5 100644
--- a/libaom/av1/encoder/bitstream.h
+++ b/libaom/av1/encoder/bitstream.h
@@ -27,18 +27,14 @@ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst);
// Writes the OBU header byte, and the OBU header extension byte when
// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
-uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
- uint8_t *const dst);
+uint32_t av1_write_obu_header(AV1_COMP *const cpi, OBU_TYPE obu_type,
+ int obu_extension, uint8_t *const dst);
int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size,
uint8_t *dest);
-int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
-
-static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) {
- // Do not swap gf and arf indices for internal overlay frames
- return cpi->rc.is_src_frame_alt_ref && !cpi->rc.is_src_frame_ext_arf;
-}
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size,
+ int *const largest_tile_id);
void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
int blk_row, int blk_col, int plane, TX_SIZE tx_size,
diff --git a/libaom/av1/encoder/block.h b/libaom/av1/encoder/block.h
index 1b04519..96b0991 100644
--- a/libaom/av1/encoder/block.h
+++ b/libaom/av1/encoder/block.h
@@ -54,10 +54,10 @@ typedef struct macroblock_plane {
typedef struct {
int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
- int base_cost[SIG_COEF_CONTEXTS][4];
+ int base_cost[SIG_COEF_CONTEXTS][8];
int eob_extra_cost[EOB_COEF_CONTEXTS][2];
int dc_sign_cost[DC_SIGN_CONTEXTS][2];
- int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1];
+ int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1];
} LV_MAP_COEFF_COST;
typedef struct {
@@ -74,16 +74,13 @@ typedef struct {
} CB_COEFF_BUFFER;
typedef struct {
- int16_t mode_context[MODE_CTX_REF_FRAMES];
// TODO(angiebird): Reduce the buffer size according to sb_type
- tran_low_t *tcoeff[MAX_MB_PLANE];
- uint16_t *eobs[MAX_MB_PLANE];
- uint8_t *txb_skip_ctx[MAX_MB_PLANE];
- int *dc_sign_ctx[MAX_MB_PLANE];
- uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+ CB_COEFF_BUFFER *cb_coef_buff;
CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
int_mv global_mvs[REF_FRAMES];
- int16_t compound_mode_context[MODE_CTX_REF_FRAMES];
+ int cb_offset;
+ int16_t mode_context[MODE_CTX_REF_FRAMES];
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
} MB_MODE_INFO_EXT;
typedef struct {
@@ -156,7 +153,7 @@ typedef struct {
// Region size for mode decision sampling in the first pass of partition
// search(two_pass_partition_search speed feature), in units of mi size(4).
-// Used by the mode_pruning_based_on_two_pass_partition_search speed feature.
+// Used by the mode pruning in the two_pass_partition_search feature.
#define FIRST_PARTITION_PASS_SAMPLE_REGION 8
#define FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2 3
#define FIRST_PARTITION_PASS_STATS_TABLES \
@@ -177,6 +174,8 @@ typedef struct {
uint8_t ref0_counts[REF_FRAMES]; // Counters for ref_frame[0].
uint8_t ref1_counts[REF_FRAMES]; // Counters for ref_frame[1].
int sample_counts; // Number of samples collected.
+ uint8_t interintra_motion_mode_count[REF_FRAMES]; // Counter for interintra
+ // motion mode
} FIRST_PARTITION_PASS_STATS;
#define MAX_INTERP_FILTER_STATS 64
@@ -185,11 +184,26 @@ typedef struct {
int_mv mv[2];
int8_t ref_frames[2];
COMPOUND_TYPE comp_type;
+ int64_t rd;
+ int skip_txfm_sb;
+ int64_t skip_sse_sb;
+ unsigned int pred_sse;
} INTERPOLATION_FILTER_STATS;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+#define MAX_COMP_RD_STATS 64
+typedef struct {
+ int32_t rate[COMPOUND_TYPES];
+ int64_t dist[COMPOUND_TYPES];
+ int64_t comp_model_rd[COMPOUND_TYPES];
+ int_mv mv[2];
+ MV_REFERENCE_FRAME ref_frames[2];
+ PREDICTION_MODE mode;
+ InterpFilters filter;
+ int ref_mv_idx;
+ int is_global[2];
+} COMP_RD_STATS;
+
struct inter_modes_info;
-#endif
typedef struct macroblock MACROBLOCK;
struct macroblock {
struct macroblock_plane plane[MAX_MB_PLANE];
@@ -251,6 +265,9 @@ struct macroblock {
int *ex_search_count_ptr;
unsigned int txb_split_count;
+#if CONFIG_SPEED_STATS
+ unsigned int tx_search_count;
+#endif // CONFIG_SPEED_STATS
// These are set to their default values at the beginning, and then adjusted
// further in the encoding process.
@@ -259,6 +276,7 @@ struct macroblock {
unsigned int max_mv_context[REF_FRAMES];
unsigned int source_variance;
+ unsigned int simple_motion_pred_sse;
unsigned int pred_sse[REF_FRAMES];
int pred_mv_sad[REF_FRAMES];
@@ -277,7 +295,7 @@ struct macroblock {
CONV_BUF_TYPE *tmp_conv_dst;
uint8_t *tmp_obmc_bufs[2];
- FRAME_CONTEXT *backup_tile_ctx;
+ FRAME_CONTEXT *row_ctx;
// This context will be used to update color_map_cdf pointer which would be
// used during pack bitstream. For single thread and tile-multithreading case
// this pointer will be the same as xd->tile_ctx, but for the case of row-mt:
@@ -285,9 +303,7 @@ struct macroblock {
// to the accurate tile context.
FRAME_CONTEXT *tile_pb_ctx;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
struct inter_modes_info *inter_modes_info;
-#endif
// buffer for hash value calculation of a block
// used only in av1_get_block_hash_value()
@@ -340,7 +356,7 @@ struct macroblock {
// BWDREF_FRAME) in bidir-comp mode.
int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2];
int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
- int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1];
+ int compound_type_cost[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
int wedge_idx_cost[BLOCK_SIZES_ALL][16];
int interintra_cost[BLOCK_SIZE_GROUPS][2];
int wedge_interintra_cost[BLOCK_SIZES_ALL][2];
@@ -385,6 +401,11 @@ struct macroblock {
// Store the fractional best motion vector during sub/Qpel-pixel motion search
int_mv fractional_best_mv[3];
+ // Ref frames that are selected by square partition blocks within a super-
+ // block, in MI resolution. They can be used to prune ref frames for
+ // rectangular blocks.
+ int picked_ref_frames_mask[32 * 32];
+
// use default transform and skip transform type search for intra modes
int use_default_intra_tx_type;
// use default transform and skip transform type search for inter modes
@@ -405,6 +426,13 @@ struct macroblock {
// detection). For reference, 556 is the value returned for a solid
// vertical black/white edge.
uint16_t edge_strength;
+ // The strongest edge strength seen along the x/y axis.
+ uint16_t edge_strength_x;
+ uint16_t edge_strength_y;
+
+ // [Saved stat index]
+ COMP_RD_STATS comp_rd_stats[MAX_COMP_RD_STATS];
+ int comp_rd_stats_idx;
};
static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
diff --git a/libaom/av1/encoder/context_tree.h b/libaom/av1/encoder/context_tree.h
index cde3f2b..205ac8a 100644
--- a/libaom/av1/encoder/context_tree.h
+++ b/libaom/av1/encoder/context_tree.h
@@ -23,7 +23,7 @@ struct AV1_COMP;
struct AV1Common;
struct ThreadData;
-typedef enum {
+enum {
// Search all the partition types in this plane.
SEARCH_FULL_PLANE = 0,
// Only search none_partition coding block.
@@ -32,12 +32,14 @@ typedef enum {
SEARCH_SAME_PLANE = 2,
// Skip search partition on this plane. Go split directly.
SPLIT_PLANE = 3,
-} CB_TREE_SEARCH;
+} UENUM1BYTE(CB_TREE_SEARCH);
// Structure to hold snapshot of coding context during the mode picking process
typedef struct {
MB_MODE_INFO mic;
MB_MODE_INFO_EXT mbmi_ext;
+ int64_t dist;
+ int64_t rdcost;
uint8_t *color_index_map[2];
uint8_t *blk_skip;
@@ -56,51 +58,32 @@ typedef struct {
int hybrid_pred_diff;
int comp_pred_diff;
int single_pred_diff;
- // Skip certain ref frames during RD search of rectangular partitions.
- int skip_ref_frame_mask;
// TODO(jingning) Use RD_COST struct here instead. This involves a broader
// scope of refactoring.
int rate;
- int64_t dist;
- int64_t rdcost;
+
int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has
// been made.
-#if CONFIG_ONE_PASS_SVM
- // Features for one pass svm early term
- int seg_feat;
-#endif
-
// motion vector cache for adaptive motion search control in partition
// search loop
MV pred_mv[REF_FRAMES];
InterpFilter pred_interp_filter;
PARTITION_TYPE partition;
-
- // Reference and prediction mode cache for ref/mode speedup
- // TODO(zoeliu@gmail.com): The values of ref_selected and mode_selected will
- // be explored for further encoder speedup, to differentiate this approach for
- // setting skip_ref_frame_mask from others. For instance, it is possible that
- // the underlying square block(s) share the same SIMPLE_TRANSLATION motion
- // mode as well as the mode of GLOBALMV, more ref/mode combos could be
- // skipped.
- MV_REFERENCE_FRAME ref_selected[2];
- int mode_selected;
} PICK_MODE_CONTEXT;
typedef struct {
+ int64_t rdcost;
+ int64_t sub_block_rdcost[4];
int valid;
int split;
- int skip;
- int64_t rdcost;
int sub_block_split[4];
int sub_block_skip[4];
- int64_t sub_block_rdcost[4];
+ int skip;
} PC_TREE_STATS;
typedef struct PC_TREE {
- int index;
PARTITION_TYPE partitioning;
BLOCK_SIZE block_size;
PICK_MODE_CONTEXT none;
@@ -112,9 +95,11 @@ typedef struct PC_TREE {
PICK_MODE_CONTEXT verticalb[3];
PICK_MODE_CONTEXT horizontal4[4];
PICK_MODE_CONTEXT vertical4[4];
- CB_TREE_SEARCH cb_search_range;
struct PC_TREE *split[4];
PC_TREE_STATS pc_tree_stats;
+ CB_TREE_SEARCH cb_search_range;
+ int index;
+ MV mv_ref_fulls[REF_FRAMES];
} PC_TREE;
void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td);
diff --git a/libaom/av1/encoder/cost.h b/libaom/av1/encoder/cost.h
index af5b098..be0241a 100644
--- a/libaom/av1/encoder/cost.h
+++ b/libaom/av1/encoder/cost.h
@@ -30,6 +30,10 @@ extern const uint16_t av1_prob_cost[128];
// Calculate the cost of a symbol with probability p15 / 2^15
static INLINE int av1_cost_symbol(aom_cdf_prob p15) {
+ // p15 can be out of range [1, CDF_PROB_TOP - 1]. Clamping it, so that the
+ // following cost calculation works correctly. Otherwise, if p15 =
+ // CDF_PROB_TOP, shift would be -1, and "p15 << shift" would be wrong.
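+ // (With CDF_PROB_BITS == 15, p15 == CDF_PROB_TOP == 1 << 15 would give
+ // get_msb(p15) == 15 and hence shift == -1.)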
+ p15 = (aom_cdf_prob)clamp(p15, 1, CDF_PROB_TOP - 1);
assert(0 < p15 && p15 < CDF_PROB_TOP);
const int shift = CDF_PROB_BITS - 1 - get_msb(p15);
const int prob = get_prob(p15 << shift, CDF_PROB_TOP);
diff --git a/libaom/av1/encoder/encode_strategy.c b/libaom/av1/encoder/encode_strategy.c
new file mode 100644
index 0000000..e9d6ee7
--- /dev/null
+++ b/libaom/av1/encoder/encode_strategy.c
@@ -0,0 +1,1173 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/onyxc_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/pass2_strategy.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/tpl_model.h"
+
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+ EncodeFrameParams *const frame_params,
+ const FRAME_UPDATE_TYPE type,
+ int force_refresh_all) {
+ // NOTE(weitinglin): Should we define another function to take care of
+ // cpi->rc.is_$Source_Type to make this function as it is in the comment?
+
+ cpi->rc.is_src_frame_alt_ref = 0;
+ cpi->rc.is_src_frame_internal_arf = 0;
+
+ switch (type) {
+ case KF_UPDATE:
+ frame_params->refresh_last_frame = 1;
+ frame_params->refresh_golden_frame = 1;
+ frame_params->refresh_bwd_ref_frame = 1;
+ frame_params->refresh_alt2_ref_frame = 1;
+ frame_params->refresh_alt_ref_frame = 1;
+ break;
+
+ case LF_UPDATE:
+ frame_params->refresh_last_frame = 1;
+ frame_params->refresh_golden_frame = 0;
+ frame_params->refresh_bwd_ref_frame = 0;
+ frame_params->refresh_alt2_ref_frame = 0;
+ frame_params->refresh_alt_ref_frame = 0;
+ break;
+
+ case GF_UPDATE:
+ // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is
+ // needed.
+ frame_params->refresh_last_frame = 1;
+ frame_params->refresh_golden_frame = 1;
+ frame_params->refresh_bwd_ref_frame = 0;
+ frame_params->refresh_alt2_ref_frame = 0;
+ frame_params->refresh_alt_ref_frame = 0;
+ break;
+
+ case OVERLAY_UPDATE:
+ frame_params->refresh_last_frame = 0;
+ frame_params->refresh_golden_frame = 1;
+ frame_params->refresh_bwd_ref_frame = 0;
+ frame_params->refresh_alt2_ref_frame = 0;
+ frame_params->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_src_frame_alt_ref = 1;
+ break;
+
+ case ARF_UPDATE:
+ frame_params->refresh_last_frame = 0;
+ frame_params->refresh_golden_frame = 0;
+ // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
+ frame_params->refresh_bwd_ref_frame = 0;
+ frame_params->refresh_alt2_ref_frame = 0;
+ frame_params->refresh_alt_ref_frame = 1;
+ break;
+
+ case INTNL_OVERLAY_UPDATE:
+ frame_params->refresh_last_frame = 1;
+ frame_params->refresh_golden_frame = 0;
+ frame_params->refresh_bwd_ref_frame = 0;
+ frame_params->refresh_alt2_ref_frame = 0;
+ frame_params->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_src_frame_alt_ref = 1;
+ cpi->rc.is_src_frame_internal_arf = 1;
+ break;
+
+ case INTNL_ARF_UPDATE:
+ frame_params->refresh_last_frame = 0;
+ frame_params->refresh_golden_frame = 0;
+ if (cpi->oxcf.pass == 2) {
+ frame_params->refresh_bwd_ref_frame = 1;
+ frame_params->refresh_alt2_ref_frame = 0;
+ } else {
+ frame_params->refresh_bwd_ref_frame = 0;
+ frame_params->refresh_alt2_ref_frame = 1;
+ }
+ frame_params->refresh_alt_ref_frame = 0;
+ break;
+
+ default: assert(0); break;
+ }
+
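+ // At this point the refresh flags reflect the FRAME_UPDATE_TYPE alone:
+ // KF_UPDATE refreshes everything, LF_UPDATE only LAST, GF_UPDATE LAST and
+ // GOLDEN, OVERLAY_UPDATE only GOLDEN, ARF_UPDATE only ALTREF,
+ // INTNL_OVERLAY_UPDATE only LAST, and INTNL_ARF_UPDATE either BWDREF
+ // (two-pass) or ALTREF2. The external overrides and force_refresh_all
+ // checks below may still change them.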
+ if (cpi->ext_refresh_frame_flags_pending &&
+ (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2)) {
+ frame_params->refresh_last_frame = cpi->ext_refresh_last_frame;
+ frame_params->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+ frame_params->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+ frame_params->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame;
+ frame_params->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame;
+ }
+
+ if (force_refresh_all) {
+ frame_params->refresh_last_frame = 1;
+ frame_params->refresh_golden_frame = 1;
+ frame_params->refresh_bwd_ref_frame = 1;
+ frame_params->refresh_alt2_ref_frame = 1;
+ frame_params->refresh_alt_ref_frame = 1;
+ }
+}
+
+static void set_additional_frame_flags(const AV1_COMMON *const cm,
+ unsigned int *const frame_flags) {
+ if (frame_is_intra_only(cm)) *frame_flags |= FRAMEFLAGS_INTRAONLY;
+ if (frame_is_sframe(cm)) *frame_flags |= FRAMEFLAGS_SWITCH;
+ if (cm->error_resilient_mode) *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
+}
+
+static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+ if (cpi->common.show_frame) {
+ if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+ cpi->common.current_frame.frame_type == KEY_FRAME) {
+ // If this is a show_existing_frame with a source other than altref,
+ // or if it is not a displayed forward keyframe, the keyframe update
+ // counters were incremented when it was originally encoded.
+ cpi->rc.frames_since_key++;
+ cpi->rc.frames_to_key--;
+ }
+ }
+}
+
+static INLINE int is_frame_droppable(const AV1_COMP *const cpi) {
+ return !(cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
+ cpi->refresh_bwd_ref_frame || cpi->refresh_golden_frame ||
+ cpi->refresh_last_frame);
+}
+
+static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+ // TODO(weitinglin): Updating this counter for is_frame_droppable
+ // is a work-around to handle the condition when a frame is dropped.
+ // We should fix the cpi->common.show_frame flag
+ // instead of checking the other condition to update the counter properly.
+ if (cpi->common.show_frame || is_frame_droppable(cpi)) {
+ // Decrement count down till next gf
+ if (cpi->rc.frames_till_gf_update_due > 0)
+ cpi->rc.frames_till_gf_update_due--;
+ }
+}
+
+static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) {
+ // Increment the gf group index ready for the next frame. If this is
+ // a show_existing_frame with a source other than altref, or if it is not
+ // a displayed forward keyframe, the index was incremented when it was
+ // originally encoded.
+ if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
+ cpi->common.current_frame.frame_type == KEY_FRAME) {
+ ++cpi->twopass.gf_group.index;
+ }
+}
+
+static void update_rc_counts(AV1_COMP *cpi) {
+ update_keyframe_counters(cpi);
+ update_frames_till_gf_update(cpi);
+ if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi);
+}
+
+static void check_show_existing_frame(AV1_COMP *const cpi,
+ EncodeFrameParams *const frame_params) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ AV1_COMMON *const cm = &cpi->common;
+ const FRAME_UPDATE_TYPE frame_update_type =
+ gf_group->update_type[gf_group->index];
+ const int which_arf = (gf_group->arf_update_idx[gf_group->index] > 0);
+
+ if (cm->show_existing_frame == 1) {
+ frame_params->show_existing_frame = 0;
+ } else if (cpi->is_arf_filter_off[which_arf] &&
+ (frame_update_type == OVERLAY_UPDATE ||
+ frame_update_type == INTNL_OVERLAY_UPDATE)) {
+ // Other parameters related to OVERLAY_UPDATE will be taken care of
+ // in av1_get_second_pass_params(cpi)
+ frame_params->show_existing_frame = 1;
+ frame_params->existing_fb_idx_to_show =
+ (frame_update_type == OVERLAY_UPDATE)
+ ? get_ref_frame_map_idx(cm, ALTREF_FRAME)
+ : get_ref_frame_map_idx(cm, BWDREF_FRAME);
+ }
+}
+
+static void set_ext_overrides(AV1_COMP *const cpi,
+ EncodeFrameParams *const frame_params) {
+ // Override the defaults with any values supplied externally via the
+ // av1_update_reference() and av1_update_entropy() calls.
+ // Note: The overrides are valid only for the next frame passed
+ // to av1_encode_lowlevel()
+
+ AV1_COMMON *const cm = &cpi->common;
+
+ if (cpi->ext_use_s_frame) {
+ frame_params->frame_type = S_FRAME;
+ }
+
+ if (cpi->ext_refresh_frame_context_pending) {
+ cm->refresh_frame_context = cpi->ext_refresh_frame_context;
+ cpi->ext_refresh_frame_context_pending = 0;
+ }
+ cm->allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs;
+
+ frame_params->error_resilient_mode = cpi->ext_use_error_resilient;
+ // A keyframe is already error resilient and keyframes with
+ // error_resilient_mode interfere with the use of show_existing_frame
+ // when forward reference keyframes are enabled.
+ frame_params->error_resilient_mode &= frame_params->frame_type != KEY_FRAME;
+ // For bitstream conformance, s-frames must be error-resilient
+ frame_params->error_resilient_mode |= frame_params->frame_type == S_FRAME;
+}
+
+static int get_ref_frame_flags(const AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const RefCntBuffer *last_buf = get_ref_frame_buf(cm, LAST_FRAME);
+ const RefCntBuffer *last2_buf = get_ref_frame_buf(cm, LAST2_FRAME);
+ const RefCntBuffer *last3_buf = get_ref_frame_buf(cm, LAST3_FRAME);
+ const RefCntBuffer *golden_buf = get_ref_frame_buf(cm, GOLDEN_FRAME);
+ const RefCntBuffer *bwd_buf = get_ref_frame_buf(cm, BWDREF_FRAME);
+ const RefCntBuffer *alt2_buf = get_ref_frame_buf(cm, ALTREF2_FRAME);
+ const RefCntBuffer *alt_buf = get_ref_frame_buf(cm, ALTREF_FRAME);
+
+ // No.1 Priority: LAST_FRAME
+ const int last2_is_last = (last2_buf == last_buf);
+ const int last3_is_last = (last3_buf == last_buf);
+ const int gld_is_last = (golden_buf == last_buf);
+ const int bwd_is_last = (bwd_buf == last_buf);
+ const int alt2_is_last = (alt2_buf == last_buf);
+ const int alt_is_last = (alt_buf == last_buf);
+
+ // No.2 Priority: ALTREF_FRAME
+ const int last2_is_alt = (last2_buf == alt_buf);
+ const int last3_is_alt = (last3_buf == alt_buf);
+ const int gld_is_alt = (golden_buf == alt_buf);
+ const int bwd_is_alt = (bwd_buf == alt_buf);
+ const int alt2_is_alt = (alt2_buf == alt_buf);
+
+ // No.3 Priority: LAST2_FRAME
+ const int last3_is_last2 = (last3_buf == last2_buf);
+ const int gld_is_last2 = (golden_buf == last2_buf);
+ const int bwd_is_last2 = (bwd_buf == last2_buf);
+ const int alt2_is_last2 = (alt2_buf == last2_buf);
+
+ // No.4 Priority: LAST3_FRAME
+ const int gld_is_last3 = (golden_buf == last3_buf);
+ const int bwd_is_last3 = (bwd_buf == last3_buf);
+ const int alt2_is_last3 = (alt2_buf == last3_buf);
+
+ // No.5 Priority: GOLDEN_FRAME
+ const int bwd_is_gld = (bwd_buf == golden_buf);
+ const int alt2_is_gld = (alt2_buf == golden_buf);
+
+ // No.6 Priority: BWDREF_FRAME
+ const int alt2_is_bwd = (alt2_buf == bwd_buf);
+
+ // No.7 Priority: ALTREF2_FRAME
+
+ // cpi->ext_ref_frame_flags allows certain reference types to be disabled
+ // by the external interface. These are set by av1_apply_encoding_flags().
+ // Start with what the external interface allows, then suppress any reference
+ // types which we have found to be duplicates.
+
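+ // For example, if GOLDEN currently points at the same buffer as LAST, then
+ // gld_is_last is set above and AOM_GOLD_FLAG is cleared below: searching
+ // GOLDEN would only duplicate the LAST prediction.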
+ int flags = cpi->ext_ref_frame_flags;
+
+ if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG;
+
+ if (alt_is_last) flags &= ~AOM_ALT_FLAG;
+
+ if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG;
+
+ if (last3_is_last || last3_is_alt || last3_is_last2) flags &= ~AOM_LAST3_FLAG;
+
+ if (gld_is_last || gld_is_alt || gld_is_last2 || gld_is_last3)
+ flags &= ~AOM_GOLD_FLAG;
+
+ if ((bwd_is_last || bwd_is_alt || bwd_is_last2 || bwd_is_last3 || bwd_is_gld))
+ flags &= ~AOM_BWD_FLAG;
+
+ if ((alt2_is_last || alt2_is_alt || alt2_is_last2 || alt2_is_last3 ||
+ alt2_is_gld || alt2_is_bwd))
+ flags &= ~AOM_ALT2_FLAG;
+
+ return flags;
+}
+
+static int get_current_frame_ref_type(
+ const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ // We choose the reference "type" of this frame from the flags which indicate
+ // which reference frames will be refreshed by it. More than one of these
+ // flags may be set, so the order here implies an order of precedence.
+ // This is just used to choose the primary_ref_frame (as the most recent
+ // reference buffer of the same reference-type as the current frame)
+
+ const int intra_only = frame_params->frame_type == KEY_FRAME ||
+ frame_params->frame_type == INTRA_ONLY_FRAME;
+ if (intra_only || frame_params->error_resilient_mode ||
+ cpi->ext_use_primary_ref_none)
+ return REGULAR_FRAME;
+ else if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE)
+ return INTERNAL_ARF_FRAME;
+ else if (frame_params->refresh_alt_ref_frame)
+ return ARF_FRAME;
+ else if (cpi->rc.is_src_frame_alt_ref)
+ return OVERLAY_FRAME;
+ else if (frame_params->refresh_golden_frame)
+ return GLD_FRAME;
+ else if (frame_params->refresh_bwd_ref_frame)
+ return BRF_FRAME;
+ else
+ return REGULAR_FRAME;
+}
+
+static int choose_primary_ref_frame(
+ const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ const int intra_only = frame_params->frame_type == KEY_FRAME ||
+ frame_params->frame_type == INTRA_ONLY_FRAME;
+ if (intra_only || frame_params->error_resilient_mode ||
+ cpi->ext_use_primary_ref_none) {
+ return PRIMARY_REF_NONE;
+ }
+
+ // Find the most recent reference frame with the same reference type as the
+ // current frame
+ const FRAME_CONTEXT_INDEX current_ref_type =
+ get_current_frame_ref_type(cpi, frame_params);
+ int wanted_fb = cpi->fb_of_context_type[current_ref_type];
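+ // wanted_fb is a physical buffer index, while primary_ref_frame is
+ // signalled as an offset from LAST_FRAME, hence the search over the named
+ // references below.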
+
+ int primary_ref_frame = PRIMARY_REF_NONE;
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (get_ref_frame_map_idx(cm, ref_frame) == wanted_fb) {
+ primary_ref_frame = ref_frame - LAST_FRAME;
+ }
+ }
+ return primary_ref_frame;
+}
+
+static void update_fb_of_context_type(
+ const AV1_COMP *const cpi, const EncodeFrameParams *const frame_params,
+ int *const fb_of_context_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+ cpi->ext_use_primary_ref_none) {
+ for (int i = 0; i < REF_FRAMES; i++) {
+ fb_of_context_type[i] = -1;
+ }
+ fb_of_context_type[REGULAR_FRAME] =
+ cm->show_frame ? get_ref_frame_map_idx(cm, GOLDEN_FRAME)
+ : get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ }
+
+ if (!encode_show_existing_frame(cm)) {
+ // Refresh fb_of_context_type[]: see encoder.h for explanation
+ if (cm->current_frame.frame_type == KEY_FRAME) {
+ // All ref frames are refreshed, pick one that will live long enough
+ fb_of_context_type[REGULAR_FRAME] = 0;
+ } else {
+ // If more than one frame is refreshed, it doesn't matter which one we
+ // pick, so pick the first. Large-scale tile (LST) encoding sometimes
+ // doesn't refresh any: this is OK.
+ const int current_frame_ref_type =
+ get_current_frame_ref_type(cpi, frame_params);
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (cm->current_frame.refresh_frame_flags & (1 << i)) {
+ fb_of_context_type[current_frame_ref_type] = i;
+ break;
+ }
+ }
+ }
+ }
+}
+
+static int get_order_offset(const GF_GROUP *const gf_group,
+ const EncodeFrameParams *const frame_params) {
+ // A shown frame has order offset 0 by definition.
+ // show_existing_frame ignores order_offset and simply takes the order_hint
+ // from the reference frame being shown.
+ if (frame_params->show_frame || frame_params->show_existing_frame) return 0;
+
+ const int arf_offset =
+ AOMMIN((MAX_GF_INTERVAL - 1), gf_group->arf_src_offset[gf_group->index]);
+ return AOMMIN((MAX_GF_INTERVAL - 1), arf_offset);
+}
+
+static void adjust_frame_rate(AV1_COMP *cpi,
+ const struct lookahead_entry *source) {
+ int64_t this_duration;
+ int step = 0;
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ if (source->ts_start == cpi->first_time_stamp_ever) {
+ this_duration = source->ts_end - source->ts_start;
+ step = 1;
+ } else {
+ int64_t last_duration =
+ cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+
+ this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
+
+ // do a step update if the duration changes by 10%
+ if (last_duration)
+ step = (int)((this_duration - last_duration) * 10 / last_duration);
+ }
+
+ if (this_duration) {
+ if (step) {
+ av1_new_framerate(cpi, 10000000.0 / this_duration);
+ } else {
+ // Average this frame's rate into the last second's average
+ // frame rate. If we haven't seen 1 second yet, then average
+ // over the whole interval seen.
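+ // That is, avg_duration += avg_duration * (this_duration - avg_duration)
+ // / interval: a running-average update whose weight shrinks as more of
+ // the one-second window has been seen.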
+ const double interval = AOMMIN(
+ (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
+ double avg_duration = 10000000.0 / cpi->framerate;
+ avg_duration *= (interval - avg_duration + this_duration);
+ avg_duration /= interval;
+
+ av1_new_framerate(cpi, 10000000.0 / avg_duration);
+ }
+ }
+ cpi->last_time_stamp_seen = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_end;
+}
+
+// If this is an alt-ref, returns the offset of the source frame used
+// as the arf midpoint. Otherwise, returns 0.
+static int get_arf_src_index(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ int arf_src_index = 0;
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ assert(is_altref_enabled(cpi));
+ arf_src_index = gf_group->arf_src_offset[gf_group->index];
+ }
+ } else if (rc->source_alt_ref_pending) {
+ arf_src_index = rc->frames_till_gf_update_due;
+ }
+ return arf_src_index;
+}
+
+// If this is an internal alt-ref, returns the offset of the source frame used
+// as the internal arf midpoint. Otherwise, returns 0.
+static int get_internal_arf_src_index(AV1_COMP *cpi) {
+ int internal_arf_src_index = 0;
+ if (cpi->oxcf.pass == 2) {
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+ assert(is_altref_enabled(cpi) && cpi->internal_altref_allowed);
+ internal_arf_src_index = gf_group->arf_src_offset[gf_group->index];
+ }
+ }
+ return internal_arf_src_index;
+}
+
+// Called if this frame is an ARF or ARF2. Also handles forward keyframes.
+// For an ARF set arf2=0; for an ARF2 set arf2=1.
+// temporal_filtered is set to 1 if we temporally filter the ARF frame, so that
+// the correct post-filter buffer can be used.
+static struct lookahead_entry *setup_arf_or_arf2(
+ AV1_COMP *const cpi, const int arf_src_index, const int arf2,
+ int *temporal_filtered, EncodeFrameParams *const frame_params) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ assert(arf_src_index <= rc->frames_to_key);
+ *temporal_filtered = 0;
+
+ struct lookahead_entry *source =
+ av1_lookahead_peek(cpi->lookahead, arf_src_index);
+
+ if (source != NULL) {
+ cm->showable_frame = 1;
+ cpi->alt_ref_source = source;
+
+ // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
+ if (!arf2 && arf_src_index == rc->frames_to_key) {
+ // Skip temporal filtering and mark as intra_only if we have a fwd_kf
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int which_arf = gf_group->arf_update_idx[gf_group->index];
+ cpi->is_arf_filter_off[which_arf] = 1;
+ cpi->no_show_kf = 1;
+ } else {
+ if (oxcf->arnr_max_frames > 0) {
+ // Produce the filtered ARF frame.
+ av1_temporal_filter(cpi, arf_src_index);
+ aom_extend_frame_borders(&cpi->alt_ref_buffer, av1_num_planes(cm));
+ *temporal_filtered = 1;
+ }
+ }
+ frame_params->show_frame = 0;
+ }
+ rc->source_alt_ref_pending = 0;
+ return source;
+}
+
+// Determine whether there is a forced keyframe pending in the lookahead buffer
+static int is_forced_keyframe_pending(struct lookahead_ctx *lookahead,
+ const int up_to_index) {
+ for (int i = 0; i <= up_to_index; i++) {
+ const struct lookahead_entry *e = av1_lookahead_peek(lookahead, i);
+ if (e == NULL) {
+ // We have reached the end of the lookahead buffer and not early-returned
+ // so there isn't a forced key-frame pending.
+ return 0;
+ } else if (e->flags == AOM_EFLAG_FORCE_KF) {
+ return 1;
+ } else {
+ continue;
+ }
+ }
+ return 0; // No forced key-frame found up to up_to_index.
+}
+
+// Check if we should encode an ARF or internal ARF. If not, try a LAST.
+// Do some setup associated with the chosen source.
+// temporal_filtered, flush, and frame_update_type are outputs.
+// Returns the frame source, or NULL if we couldn't find one.
+struct lookahead_entry *choose_frame_source(
+ AV1_COMP *const cpi, int *const temporal_filtered, int *const flush,
+ struct lookahead_entry **last_source, FRAME_UPDATE_TYPE *frame_update_type,
+ EncodeFrameParams *const frame_params) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct lookahead_entry *source = NULL;
+ *temporal_filtered = 0;
+
+ // Should we encode an alt-ref frame?
+ int arf_src_index = get_arf_src_index(cpi);
+ if (arf_src_index &&
+ is_forced_keyframe_pending(cpi->lookahead, arf_src_index)) {
+ arf_src_index = 0;
+ *flush = 1;
+ }
+
+ if (arf_src_index) {
+ source = setup_arf_or_arf2(cpi, arf_src_index, 0, temporal_filtered,
+ frame_params);
+ *frame_update_type = ARF_UPDATE;
+ }
+
+ // Should we encode an internal alt-ref frame (mutually exclusive with ARF)?
+ arf_src_index = get_internal_arf_src_index(cpi);
+ if (arf_src_index &&
+ is_forced_keyframe_pending(cpi->lookahead, arf_src_index)) {
+ arf_src_index = 0;
+ *flush = 1;
+ }
+
+ if (arf_src_index) {
+ source = setup_arf_or_arf2(cpi, arf_src_index, 1, temporal_filtered,
+ frame_params);
+ *frame_update_type = INTNL_ARF_UPDATE;
+ }
+
+ if (!source) {
+ // Get last frame source.
+ if (cm->current_frame.frame_number > 0) {
+ *last_source = av1_lookahead_peek(cpi->lookahead, -1);
+ }
+ // Read in the source frame.
+ source = av1_lookahead_pop(cpi->lookahead, *flush);
+ if (source == NULL) return NULL;
+ *frame_update_type = LF_UPDATE; // Default update type
+ frame_params->show_frame = 1;
+
+ // Check to see if the frame should be encoded as an arf overlay.
+ if (cpi->alt_ref_source == source) {
+ *frame_update_type = OVERLAY_UPDATE;
+ cpi->alt_ref_source = NULL;
+ }
+ }
+ return source;
+}
+
+// Don't allow a show_existing_frame to coincide with an error resilient frame
+// or an S-Frame. An exception can be made for a keyframe, since it does
+// not depend on any previous frames.
+static int allow_show_existing(const AV1_COMP *const cpi,
+ unsigned int frame_flags) {
+ if (cpi->common.current_frame.frame_number == 0) return 0;
+
+ const struct lookahead_entry *lookahead_src =
+ av1_lookahead_peek(cpi->lookahead, 0);
+ if (lookahead_src == NULL) return 1;
+
+ const int is_error_resilient =
+ cpi->oxcf.error_resilient_mode ||
+ (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
+ const int is_s_frame =
+ cpi->oxcf.s_frame_mode || (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
+ const int is_key_frame =
+ (cpi->rc.frames_to_key == 0) || (frame_flags & FRAMEFLAGS_KEY);
+ return !(is_error_resilient || is_s_frame) || is_key_frame;
+}
+
+// Update frame_flags to tell the encoder's caller what sort of frame was
+// encoded.
+static void update_frame_flags(AV1_COMP *cpi, unsigned int *frame_flags) {
+ if (encode_show_existing_frame(&cpi->common)) {
+ *frame_flags &= ~FRAMEFLAGS_GOLDEN;
+ *frame_flags &= ~FRAMEFLAGS_BWDREF;
+ *frame_flags &= ~FRAMEFLAGS_ALTREF;
+ *frame_flags &= ~FRAMEFLAGS_KEY;
+ return;
+ }
+
+ if (cpi->refresh_golden_frame == 1) {
+ *frame_flags |= FRAMEFLAGS_GOLDEN;
+ } else {
+ *frame_flags &= ~FRAMEFLAGS_GOLDEN;
+ }
+
+ if (cpi->refresh_alt_ref_frame == 1) {
+ *frame_flags |= FRAMEFLAGS_ALTREF;
+ } else {
+ *frame_flags &= ~FRAMEFLAGS_ALTREF;
+ }
+
+ if (cpi->refresh_bwd_ref_frame == 1) {
+ *frame_flags |= FRAMEFLAGS_BWDREF;
+ } else {
+ *frame_flags &= ~FRAMEFLAGS_BWDREF;
+ }
+
+ if (cpi->common.current_frame.frame_type == KEY_FRAME) {
+ *frame_flags |= FRAMEFLAGS_KEY;
+ } else {
+ *frame_flags &= ~FRAMEFLAGS_KEY;
+ }
+}
+
+#define DUMP_REF_FRAME_IMAGES 0
+
+#if DUMP_REF_FRAME_IMAGES == 1
+static int dump_one_image(AV1_COMMON *cm,
+ const YV12_BUFFER_CONFIG *const ref_buf,
+ char *file_name) {
+ int h;
+ FILE *f_ref = NULL;
+
+ if (ref_buf == NULL) {
+ printf("Frame data buffer is NULL.\n");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ if ((f_ref = fopen(file_name, "wb")) == NULL) {
+ printf("Unable to open file %s to write.\n", file_name);
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ // --- Y ---
+ for (h = 0; h < cm->height; ++h) {
+ fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref);
+ }
+ // --- U ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+ // --- V ---
+ for (h = 0; h < (cm->height >> 1); ++h) {
+ fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
+ f_ref);
+ }
+
+ fclose(f_ref);
+
+ return AOM_CODEC_OK;
+}
+
+static void dump_ref_frame_images(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ char file_name[256] = "";
+ snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv",
+ cm->current_frame.frame_number, ref_frame);
+ dump_one_image(cm, get_ref_frame_yv12_buf(cpi, ref_frame), file_name);
+ }
+}
+#endif // DUMP_REF_FRAME_IMAGES == 1
+
+// Assign new_ref in the new mapping to point at the reference buffer pointed at
+// by old_ref in the old_map. The new mapping is stored in *new_map, while the
+// old map comes from cm->remapped_ref_idx[].
+static void assign_new_map(AV1_COMMON *const cm, int *new_map, int new_ref,
+ int old_ref) {
+ new_map[new_ref - LAST_FRAME] = cm->remapped_ref_idx[old_ref - LAST_FRAME];
+}
+
+// Generate a new reference frame mapping. This function updates
+// cm->remapped_ref_idx[] depending on the frame_update_type of this frame.
+// This determines which references (e.g. LAST_FRAME, ALTREF_FRAME) point at the
+// 8 underlying buffers and, together with get_refresh_frame_flags(), implements
+// our reference frame management strategy.
+static void update_ref_frame_map(AV1_COMP *cpi,
+ FRAME_UPDATE_TYPE frame_update_type) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ // If check_frame_refs_short_signaling() decided to set
+ // frame_refs_short_signaling=1 then we update remapped_ref_idx[] here. Every
+ // reference will still map to the same RefCntBuffer (through ref_frame_map[])
+ // after this, but that does not necessarily mean that remapped_ref_idx[] is
+ // unchanged.
+ if (cm->current_frame.frame_refs_short_signaling) {
+ const int lst_map_idx = get_ref_frame_map_idx(cm, LAST_FRAME);
+ const int gld_map_idx = get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ av1_set_frame_refs(cm, cm->remapped_ref_idx, lst_map_idx, gld_map_idx);
+ }
+
+ // For shown keyframes and S-frames all buffers are refreshed, but we don't
+ // change any of the mapping.
+ if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
+ frame_is_sframe(cm)) {
+ return;
+ }
+
+ // Initialize the new reference map as a copy of the old one.
+ int new_map[REF_FRAMES];
+ memcpy(new_map, cm->remapped_ref_idx, sizeof(new_map));
+
+ // The reference management strategy is currently as follows. See
+ // gop_structure.c for more details of the structure and DOI
+ // 10.1109/DCC.2018.00045 for a higher-level explanation
+ //
+ // * ALTREF_FRAME and GOLDEN_FRAME are kept separate from the other
+ // references. When we code an ALTREF it refreshes the ALTREF buffer. When
+ // we code an OVERLAY the old GOLDEN becomes the new ALTREF and the old
+ // ALTREF (possibly refreshed by the OVERLAY) becomes the new GOLDEN.
+ // * LAST_FRAME, LAST2_FRAME, and LAST3_FRAME work like a FIFO. When we code
+ // a frame which does a last-frame update we pick a buffer to refresh and
+ // then point the LAST_FRAME reference at it. The old LAST_FRAME becomes
+ // LAST2_FRAME and the old LAST2_FRAME becomes LAST3_FRAME. The old
+ // LAST3_FRAME is re-used somewhere else.
+ // * BWDREF, ALTREF2, and EXTREF act like a stack structure, so we can
+ // "push" and "pop" internal alt-ref frames through the three references.
+ // * When we code a BRF or internal-ARF (they work the same in this
+ // structure) we push it onto the bwdref stack. Because we have a finite
+ // number of buffers, we actually refresh EXTREF, the bottom of the stack,
+ // and rotate the three references to make EXTREF the top.
+ // * When we code an INTNL_OVERLAY we refresh BWDREF, then pop it off of the
+ // bwdref stack and push it into the last-frame FIFO. The old LAST3
+ // buffer gets pushed out of the last-frame FIFO and becomes the new
+ // EXTREF, bottom of the bwdref stack.
+ // * LAST_BIPRED just acts like a LAST_FRAME. The BWDREF will have an
+ // INTNL_OVERLAY and so can do its own ref map update.
+ //
+ // Note that this function runs *after* a frame has been coded, so it does not
+ // affect reference assignment of the current frame, it only affects future
+ // frames. This is why we refresh buffers using the old reference map before
+ // remapping them.
+ //
+ // show_existing_frames don't refresh any buffers or send the reference map to
+ // the decoder, but we can still update our reference map if we want to: the
+ // decoder will update its map next time we code a non-show-existing frame.
+
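+ // A worked example with hypothetical buffer indices: if LAST, LAST2 and
+ // LAST3 currently map to buffers {4, 3, 2} and this frame was an LF_UPDATE
+ // that refreshed LAST3's buffer (2), the rotation below remaps them to
+ // LAST -> 2, LAST2 -> 4, LAST3 -> 3.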
+ if (frame_update_type == OVERLAY_UPDATE) {
+ // We want the old golden-frame to become our new ARF so swap the
+ // references. If cpi->preserve_arf_as_gld == 0 then we will refresh the
+ // old ARF before it becomes our new GF
+ assign_new_map(cm, new_map, ALTREF_FRAME, GOLDEN_FRAME);
+ assign_new_map(cm, new_map, GOLDEN_FRAME, ALTREF_FRAME);
+ } else if (frame_update_type == INTNL_OVERLAY_UPDATE &&
+ encode_show_existing_frame(cm)) {
+ // Note that because encode_show_existing_frame(cm) is true here, we don't
+ // refresh any buffers.
+ // Pop BWDREF (shown as current frame) from the bwdref stack and make it
+ // the new LAST_FRAME.
+ assign_new_map(cm, new_map, LAST_FRAME, BWDREF_FRAME);
+
+ // Progress the last-frame FIFO and the bwdref stack
+ assign_new_map(cm, new_map, LAST2_FRAME, LAST_FRAME);
+ assign_new_map(cm, new_map, LAST3_FRAME, LAST2_FRAME);
+ assign_new_map(cm, new_map, BWDREF_FRAME, ALTREF2_FRAME);
+ assign_new_map(cm, new_map, ALTREF2_FRAME, EXTREF_FRAME);
+ assign_new_map(cm, new_map, EXTREF_FRAME, LAST3_FRAME);
+ } else if (frame_update_type == INTNL_ARF_UPDATE &&
+ !cm->show_existing_frame) {
+ // We want to push the current frame onto the bwdref stack. We refresh
+ // EXTREF (the old bottom of the stack) and rotate the references so it
+ // becomes BWDREF, the top of the stack.
+ assign_new_map(cm, new_map, BWDREF_FRAME, EXTREF_FRAME);
+ assign_new_map(cm, new_map, ALTREF2_FRAME, BWDREF_FRAME);
+ assign_new_map(cm, new_map, EXTREF_FRAME, ALTREF2_FRAME);
+ }
+
+ if ((frame_update_type == LF_UPDATE || frame_update_type == GF_UPDATE ||
+ frame_update_type == INTNL_OVERLAY_UPDATE) &&
+ !encode_show_existing_frame(cm) &&
+ (!cm->show_existing_frame || frame_update_type == INTNL_OVERLAY_UPDATE)) {
+ // A standard last-frame: we refresh the LAST3_FRAME buffer and then push it
+ // into the last-frame FIFO.
+ assign_new_map(cm, new_map, LAST3_FRAME, LAST2_FRAME);
+ assign_new_map(cm, new_map, LAST2_FRAME, LAST_FRAME);
+ assign_new_map(cm, new_map, LAST_FRAME, LAST3_FRAME);
+ }
+
+ memcpy(cm->remapped_ref_idx, new_map, sizeof(new_map));
+
+#if DUMP_REF_FRAME_IMAGES == 1
+ // Dump out all reference frame images.
+ dump_ref_frame_images(cpi);
+#endif // DUMP_REF_FRAME_IMAGES
+}
+
+static int get_refresh_frame_flags(const AV1_COMP *const cpi,
+ const EncodeFrameParams *const frame_params,
+ FRAME_UPDATE_TYPE frame_update_type) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ // Switch frames and shown key-frames overwrite all reference slots
+ if ((frame_params->frame_type == KEY_FRAME && frame_params->show_frame) ||
+ frame_params->frame_type == S_FRAME)
+ return 0xFF;
+
+ // show_existing_frames don't actually send refresh_frame_flags, so set the
+ // flags to 0 to keep things consistent.
+ if (frame_params->show_existing_frame &&
+ (!frame_params->error_resilient_mode ||
+ frame_params->frame_type == KEY_FRAME)) {
+ return 0;
+ }
+
+ int refresh_mask = 0;
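+ // refresh_mask is a REF_FRAMES-wide bitmask: bit i set means the buffer in
+ // slot i of ref_frame_map[] is overwritten by the current frame.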
+
+ if (cpi->ext_refresh_frame_flags_pending) {
+ // Unfortunately the encoder interface reflects the old refresh_*_frame
+ // flags so we have to replicate the old refresh_frame_flags logic here in
+ // order to preserve the behaviour of the flag overrides.
+ refresh_mask |= cpi->ext_refresh_last_frame
+ << get_ref_frame_map_idx(cm, LAST3_FRAME);
+ refresh_mask |= cpi->ext_refresh_bwd_ref_frame
+ << get_ref_frame_map_idx(cm, EXTREF_FRAME);
+ refresh_mask |= cpi->ext_refresh_alt2_ref_frame
+ << get_ref_frame_map_idx(cm, ALTREF2_FRAME);
+ if (frame_update_type == OVERLAY_UPDATE) {
+ if (!cpi->preserve_arf_as_gld) {
+ refresh_mask |= cpi->ext_refresh_golden_frame
+ << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ }
+ } else {
+ refresh_mask |= cpi->ext_refresh_golden_frame
+ << get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ refresh_mask |= cpi->ext_refresh_alt_ref_frame
+ << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ }
+ return refresh_mask;
+ }
+
+ // See update_ref_frame_map() for a thorough description of the reference
+ // buffer management strategy currently in use. This function just decides
+ // which buffers should be refreshed.
+
+ switch (frame_update_type) {
+ case KF_UPDATE:
+ // Note that a real shown key-frame or S-frame refreshes every buffer;
+ // this is handled in a special case above. This case is for frames which
+ // aren't really shown key-frames or S-frames but which still want to
+ // refresh all the important buffers.
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, EXTREF_FRAME);
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF2_FRAME);
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ break;
+ case LF_UPDATE:
+ // Refresh LAST3, which becomes the new LAST while LAST becomes LAST2
+ // and LAST2 becomes the new LAST3 (like a FIFO but circular)
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
+ break;
+ case GF_UPDATE:
+ // In addition to refreshing the GF buffer, we refresh LAST3 and push it
+ // into the last-frame FIFO.
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, GOLDEN_FRAME);
+ break;
+ case OVERLAY_UPDATE:
+ if (!cpi->preserve_arf_as_gld) {
+ // The result of our OVERLAY should become the GOLDEN_FRAME but we'd
+ // like to keep the old GOLDEN as our new ALTREF. So we refresh the
+ // ALTREF and swap around the ALTREF and GOLDEN references.
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ }
+ break;
+ case ARF_UPDATE:
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF_FRAME);
+ break;
+ case INTNL_OVERLAY_UPDATE:
+ // INTNL_OVERLAY may be a show_existing_frame, in which case we don't
+ // refresh anything and the BWDREF or ALTREF2 being shown becomes the new
+ // LAST_FRAME. But if it's not a show_existing_frame, then we update as
+ // though it's a normal LF_UPDATE: we refresh LAST3 and
+ // update_ref_frame_map() makes that the new LAST_FRAME.
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, LAST3_FRAME);
+ break;
+ case INTNL_ARF_UPDATE:
+ if (cpi->oxcf.pass == 2) {
+ // Push the new ARF2 onto the bwdref stack. We refresh EXTREF, which is
+ // at the bottom of the stack, and then move it to the top.
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, EXTREF_FRAME);
+ } else {
+ // ARF2 just gets stored in the ARF2 slot, no reference map change.
+ refresh_mask |= 1 << get_ref_frame_map_idx(cm, ALTREF2_FRAME);
+ }
+ break;
+ default: assert(0); break;
+ }
+ return refresh_mask;
+}
+
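// A minimal sketch (not libaom code) of how a refresh bitmask is built from
// remapped indices and what it means downstream. map_idx() stands in for
// get_ref_frame_map_idx(); the remapped[] values below are made up.
#include <stdio.h>

static int map_idx(const int *remapped, int named_ref) {
  return remapped[named_ref];  // named reference -> physical buffer slot
}

int main(void) {
  enum { kLast3 = 2, kGolden = 3 };  // illustrative positions only
  const int remapped[8] = { 4, 0, 6, 1, 2, 3, 5, 7 };

  // The GF_UPDATE case above: refresh LAST3 and GOLDEN.
  int refresh_mask = 0;
  refresh_mask |= 1 << map_idx(remapped, kLast3);   // physical slot 6
  refresh_mask |= 1 << map_idx(remapped, kGolden);  // physical slot 1

  // Each set bit names one of the eight physical buffers to be overwritten.
  for (int slot = 0; slot < 8; ++slot)
    if (refresh_mask & (1 << slot)) printf("refresh slot %d\n", slot);
  return 0;
}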
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+ uint8_t *const dest, unsigned int *frame_flags,
+ int64_t *const time_stamp, int64_t *const time_end,
+ const aom_rational_t *const timebase, int flush) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+
+ EncodeFrameInput frame_input;
+ EncodeFrameParams frame_params;
+ EncodeFrameResults frame_results;
+ memset(&frame_input, 0, sizeof(frame_input));
+ memset(&frame_params, 0, sizeof(frame_params));
+ memset(&frame_results, 0, sizeof(frame_results));
+
+ if (oxcf->pass == 0 || oxcf->pass == 2) {
+ check_show_existing_frame(cpi, &frame_params);
+ frame_params.show_existing_frame &= allow_show_existing(cpi, *frame_flags);
+ } else {
+ frame_params.show_existing_frame = 0;
+ }
+
+ int temporal_filtered = 0;
+ struct lookahead_entry *source = NULL;
+ struct lookahead_entry *last_source = NULL;
+ FRAME_UPDATE_TYPE frame_update_type;
+ if (frame_params.show_existing_frame) {
+ source = av1_lookahead_pop(cpi->lookahead, flush);
+ frame_update_type = LF_UPDATE;
+ } else {
+ source = choose_frame_source(cpi, &temporal_filtered, &flush, &last_source,
+ &frame_update_type, &frame_params);
+ }
+
+ // In pass 2 we get the frame_update_type from the gf_group.
+ if (oxcf->pass == 2) {
+ frame_update_type =
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+ }
+
+ if (source == NULL) { // If no source was found, we can't encode a frame.
+ if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+ av1_end_first_pass(cpi); /* get last stats packet */
+ cpi->twopass.first_pass_done = 1;
+ }
+ return -1;
+ }
+
+ frame_input.source = temporal_filtered ? &cpi->alt_ref_buffer : &source->img;
+ frame_input.last_source = last_source != NULL ? &last_source->img : NULL;
+ frame_input.ts_duration = source->ts_end - source->ts_start;
+
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+ if (source->ts_start < cpi->first_time_stamp_ever) {
+ cpi->first_time_stamp_ever = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_start;
+ }
+
+ av1_apply_encoding_flags(cpi, source->flags);
+ if (!frame_params.show_existing_frame)
+ *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+ const int is_overlay = frame_params.show_existing_frame &&
+ (frame_update_type == OVERLAY_UPDATE ||
+ frame_update_type == INTNL_OVERLAY_UPDATE);
+ if (frame_params.show_frame || is_overlay) {
+ // Shown frames and arf-overlay frames need to be taken into account for
+ // frame-rate tracking.
+ adjust_frame_rate(cpi, source);
+ }
+
+ if (frame_params.show_existing_frame) {
+ // show_existing_frame implies this frame is shown!
+ frame_params.show_frame = 1;
+ } else {
+ if (cpi->film_grain_table) {
+ cm->cur_frame->film_grain_params_present = aom_film_grain_table_lookup(
+ cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
+ &cm->film_grain_params);
+ } else {
+ cm->cur_frame->film_grain_params_present =
+ cm->seq_params.film_grain_params_present;
+ }
+ // Only one operating point is supported for now.
+ const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp);
+ if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
+ cpi->common.frame_presentation_time = (uint32_t)pts64;
+ }
+
+ if (oxcf->pass == 2 && (!frame_params.show_existing_frame || is_overlay)) {
+ // The GF_GROUP needs updating for arf overlays as well as for
+ // non-show-existing frames.
+ av1_get_second_pass_params(cpi, &frame_params, *frame_flags);
+ frame_update_type =
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index];
+ }
+
+ if (frame_params.show_existing_frame &&
+ frame_params.frame_type != KEY_FRAME) {
+ // Force show-existing frames to be INTER, except forward keyframes
+ frame_params.frame_type = INTER_FRAME;
+ }
+
+ // TODO(david.turner@argondesign.com): Move all the encode strategy
+ // (largely near av1_get_compressed_data) in here
+
+ // TODO(david.turner@argondesign.com): Change all the encode strategy to
+ // modify frame_params instead of cm or cpi.
+
+ // Per-frame encode speed. In theory this can vary, but things may have been
+ // written assuming speed-level will not change within a sequence, so this
+ // parameter should be used with caution.
+ frame_params.speed = oxcf->speed;
+
+ if (!frame_params.show_existing_frame) {
+ cm->using_qmatrix = cpi->oxcf.using_qm;
+ cm->min_qmlevel = cpi->oxcf.qm_minlevel;
+ cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
+ if (cpi->twopass.gf_group.index == 1 && cpi->oxcf.enable_tpl_model) {
+ av1_configure_buffer_updates(cpi, &frame_params, frame_update_type, 0);
+ av1_set_frame_size(cpi, cm->width, cm->height);
+ av1_tpl_setup_stats(cpi, &frame_input);
+ }
+ }
+
+ // Work out some encoding parameters specific to the pass:
+ if (oxcf->pass == 0) {
+ if (cpi->oxcf.rc_mode == AOM_CBR) {
+ av1_rc_get_one_pass_cbr_params(cpi, &frame_update_type, &frame_params,
+ *frame_flags);
+ } else {
+ av1_rc_get_one_pass_vbr_params(cpi, &frame_update_type, &frame_params,
+ *frame_flags);
+ }
+ } else if (oxcf->pass == 1) {
+ cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(&cpi->oxcf);
+ const int kf_requested = (cm->current_frame.frame_number == 0 ||
+ (*frame_flags & FRAMEFLAGS_KEY));
+ if (kf_requested && frame_update_type != OVERLAY_UPDATE &&
+ frame_update_type != INTNL_OVERLAY_UPDATE) {
+ frame_params.frame_type = KEY_FRAME;
+ } else {
+ frame_params.frame_type = INTER_FRAME;
+ }
+ } else if (oxcf->pass == 2) {
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_move_frame_idx_w();
+#endif
+#if TXCOEFF_COST_TIMER
+ cm->txcoeff_cost_timer = 0;
+ cm->txcoeff_cost_count = 0;
+#endif
+ }
+
+ if (oxcf->pass == 0 || oxcf->pass == 2) set_ext_overrides(cpi, &frame_params);
+
+ // Shown keyframes and S frames refresh all reference buffers
+ const int force_refresh_all =
+ ((frame_params.frame_type == KEY_FRAME && frame_params.show_frame) ||
+ frame_params.frame_type == S_FRAME) &&
+ !frame_params.show_existing_frame;
+
+ av1_configure_buffer_updates(cpi, &frame_params, frame_update_type,
+ force_refresh_all);
+
+ if (oxcf->pass == 0 || oxcf->pass == 2) {
+ // Work out which reference frame slots may be used.
+ frame_params.ref_frame_flags = get_ref_frame_flags(cpi);
+
+ frame_params.primary_ref_frame =
+ choose_primary_ref_frame(cpi, &frame_params);
+ frame_params.order_offset =
+ get_order_offset(&cpi->twopass.gf_group, &frame_params);
+
+ frame_params.refresh_frame_flags =
+ get_refresh_frame_flags(cpi, &frame_params, frame_update_type);
+ }
+
+ // The way frame_params->remapped_ref_idx is set up is a placeholder.
+ // Currently, reference buffer assignment is done by update_ref_frame_map()
+ // which is called by the high-level strategy AFTER encoding a frame. It
+ // modifies cm->remapped_ref_idx. If you want to use an alternative method
+ // to determine reference buffer assignment, just put your assignments into
+ // frame_params->remapped_ref_idx here and they will be used when encoding
+ // this frame. If frame_params->remapped_ref_idx is set up independently of
+ // cm->remapped_ref_idx then update_ref_frame_map() will have no effect.
+ memcpy(frame_params.remapped_ref_idx, cm->remapped_ref_idx,
+ REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+ if (av1_encode(cpi, dest, &frame_input, &frame_params, &frame_results) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ if (oxcf->pass == 0 || oxcf->pass == 2) {
+ // The first pass doesn't modify reference buffer assignment or produce
+ // frame flags.
+ update_frame_flags(cpi, frame_flags);
+ update_ref_frame_map(cpi, frame_update_type);
+ }
+
+ if (oxcf->pass == 2) {
+#if TXCOEFF_COST_TIMER
+ cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
+ fprintf(stderr,
+ "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
+ "in us\n",
+ cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
+ cm->cum_txcoeff_cost_timer);
+#endif
+ av1_twopass_postencode_update(cpi);
+ }
+
+ if (oxcf->pass == 0 || oxcf->pass == 2) {
+ update_fb_of_context_type(cpi, &frame_params, cpi->fb_of_context_type);
+ set_additional_frame_flags(cm, frame_flags);
+ update_rc_counts(cpi);
+ }
+
+ // Unpack frame_results:
+ *size = frame_results.size;
+
+ // Leave a signal for a higher-level caller about whether this frame is
+ // droppable.
+ if (*size > 0) {
+ cpi->droppable = is_frame_droppable(cpi);
+ }
+
+ return AOM_CODEC_OK;
+}
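// A standalone sketch (not libaom code) of the calling convention suggested
// by the body above, where av1_encode_strategy() returns -1 once no source
// frame is available. encode_strategy_stub() is a hypothetical stand-in that
// only mimics that return contract (0 == AOM_CODEC_OK while frames remain).
#include <stddef.h>
#include <stdio.h>

static int encode_strategy_stub(int *frames_left, size_t *size) {
  if (*frames_left == 0) return -1;  // lookahead drained
  --*frames_left;
  *size = 1000;  // pretend a 1000-byte packet was produced
  return 0;
}

int main(void) {
  int frames_left = 3;
  size_t size = 0;
  // Drive the stub the way a flush loop is assumed to drive the encoder:
  // keep calling until it reports that no more source frames exist.
  while (encode_strategy_stub(&frames_left, &size) == 0)
    printf("packet of %zu bytes\n", size);
  return 0;
}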
diff --git a/libaom/av1/encoder/encode_strategy.h b/libaom/av1/encoder/encode_strategy.h
new file mode 100644
index 0000000..6830e44
--- /dev/null
+++ b/libaom/av1/encoder/encode_strategy.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+#define AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+#include "aom/aom_encoder.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
+// This function implements the high-level encode strategy: it chooses the
+// frame type and placement, populates an EncodeFrameParams struct with the
+// results of these decisions, and then calls av1_encode().
+int av1_encode_strategy(AV1_COMP *const cpi, size_t *const size,
+ uint8_t *const dest, unsigned int *frame_flags,
+ int64_t *const time_stamp, int64_t *const time_end,
+ const aom_rational_t *const timebase, int flush);
+
+// Set individual buffer update flags based on frame reference type.
+// force_refresh_all is used when we have a KEY_FRAME or S_FRAME. It forces all
+// refresh_*_frame flags to be set, because we refresh all buffers in this case.
+void av1_configure_buffer_updates(AV1_COMP *const cpi,
+ EncodeFrameParams *const frame_params,
+ const FRAME_UPDATE_TYPE type,
+ int force_refresh_all);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODE_STRATEGY_H_
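// A minimal sketch (not libaom code) of the force_refresh_all semantics
// described in the comment above. RefreshFlags and configure_updates() are
// invented miniatures of the real encoder state for illustration only.
#include <stdbool.h>
#include <stdio.h>

typedef struct {
  bool refresh_last, refresh_golden, refresh_bwd_ref, refresh_alt2_ref,
      refresh_alt_ref;
} RefreshFlags;

static void configure_updates(RefreshFlags *f, int force_refresh_all) {
  *f = (RefreshFlags){ false, false, false, false, false };
  // Normally the flags would depend on the frame update type; a shown
  // KEY_FRAME or an S_FRAME overrides that and refreshes every buffer.
  if (force_refresh_all) {
    f->refresh_last = f->refresh_golden = f->refresh_bwd_ref =
        f->refresh_alt2_ref = f->refresh_alt_ref = true;
  }
}

int main(void) {
  RefreshFlags f;
  configure_updates(&f, /*force_refresh_all=*/1);
  printf("all refreshed: %d\n",
         f.refresh_last && f.refresh_golden && f.refresh_alt_ref);
  return 0;
}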
diff --git a/libaom/av1/encoder/encodeframe.c b/libaom/av1/encoder/encodeframe.c
index ebfc8c2..2952184 100644
--- a/libaom/av1/encoder/encodeframe.c
+++ b/libaom/av1/encoder/encodeframe.c
@@ -10,6 +10,7 @@
*/
#include <limits.h>
+#include <float.h>
#include <math.h>
#include <stdbool.h>
#include <stdio.h>
@@ -54,12 +55,14 @@
#include "av1/encoder/ethread.h"
#include "av1/encoder/extend.h"
#include "av1/encoder/ml.h"
+#include "av1/encoder/partition_strategy.h"
#include "av1/encoder/partition_model_weights.h"
#include "av1/encoder/rd.h"
#include "av1/encoder/rdopt.h"
#include "av1/encoder/reconinter_enc.h"
#include "av1/encoder/segmentation.h"
#include "av1/encoder/tokenize.h"
+#include "av1/encoder/var_based_part.h"
static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
@@ -74,7 +77,7 @@ static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
// purposes of activity masking.
// Eventually this should be replaced by custom no-reference routines,
// which will be faster.
-static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
+const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
@@ -139,15 +142,6 @@ static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
128 * 16, 128 * 16
};
-#if CONFIG_FP_MB_STATS
-static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES_ALL] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 1, 1, 1, 2, 2, 4
-};
-static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES_ALL] = {
- 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 1, 1, 2, 1, 4, 2
-};
-#endif // CONFIG_FP_MB_STATS
-
unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
const struct buf_2d *ref,
BLOCK_SIZE bs) {
@@ -188,7 +182,8 @@ static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi,
BLOCK_SIZE bs) {
unsigned int sse, var;
uint8_t *last_y;
- const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
+ const YV12_BUFFER_CONFIG *last =
+ get_ref_frame_yv12_buf(&cpi->common, LAST_FRAME);
assert(last != NULL);
last_y =
@@ -211,18 +206,6 @@ static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x,
return BLOCK_8X8;
}
-// Lighter version of set_offsets that only sets the mode info
-// pointers.
-static void set_mode_info_offsets(const AV1_COMP *const cpi,
- MACROBLOCK *const x, MACROBLOCKD *const xd,
- int mi_row, int mi_col) {
- const AV1_COMMON *const cm = &cpi->common;
- const int idx_str = xd->mi_stride * mi_row + mi_col;
- xd->mi = cm->mi_grid_visible + idx_str;
- xd->mi[0] = cm->mi + idx_str;
- x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
-}
-
static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
const TileInfo *const tile,
MACROBLOCK *const x, int mi_row,
@@ -267,25 +250,24 @@ static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
// required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
xd->tile = *tile;
+
+ xd->cfl.mi_row = mi_row;
+ xd->cfl.mi_col = mi_col;
}
static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
MACROBLOCK *const x, int mi_row, int mi_col,
BLOCK_SIZE bsize) {
const AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi;
- const struct segmentation *const seg = &cm->seg;
set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+ // Set up the segment ID.
mbmi = xd->mi[0];
- xd->cfl.mi_row = mi_row;
- xd->cfl.mi_col = mi_col;
-
mbmi->segment_id = 0;
-
- // Setup segment ID.
if (seg->enabled) {
if (seg->enabled && !cpi->vaq_refresh) {
const uint8_t *const map =
@@ -297,15 +279,6 @@ static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
}
}
-static void reset_intmv_filter_type(MB_MODE_INFO *mbmi) {
- InterpFilter filters[2];
-
- for (int dir = 0; dir < 2; ++dir) {
- filters[dir] = av1_extract_interp_filter(mbmi->interp_filters, dir);
- }
- mbmi->interp_filters = av1_make_interp_filters(filters[0], filters[1]);
-}
-
static void update_filter_type_count(uint8_t allow_update_cdf,
FRAME_COUNTS *counts,
const MACROBLOCKD *xd,
@@ -380,8 +353,6 @@ static void update_state(const AV1_COMP *const cpi,
*mi_addr = *mi;
*x->mbmi_ext = ctx->mbmi_ext;
- reset_intmv_filter_type(mi_addr);
-
memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
x->skip = ctx->skip;
@@ -401,7 +372,6 @@ static void update_state(const AV1_COMP *const cpi,
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize,
ctx->rate, ctx->dist, x->skip);
- reset_tx_size(x, mi_addr, cm->tx_mode);
}
if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
mi_addr->uv_mode = UV_DC_PRED;
@@ -512,24 +482,32 @@ static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
cpi, cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q);
}
-static uint16_t edge_strength(const struct buf_2d *ref, const BLOCK_SIZE bsize,
- const bool high_bd, const int bd) {
+static EdgeInfo edge_info(const struct buf_2d *ref, const BLOCK_SIZE bsize,
+ const bool high_bd, const int bd) {
const int width = block_size_wide[bsize];
const int height = block_size_high[bsize];
// Implementation requires width to be a multiple of 8. It also requires
// height to be a multiple of 4, but this is always the case.
assert(height % 4 == 0);
if (width % 8 != 0) {
- return 0;
+ EdgeInfo ei = { .magnitude = 0, .x = 0, .y = 0 };
+ return ei;
}
return av1_edge_exists(ref->buf, ref->stride, width, height, high_bd, bd);
}
-static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
- MACROBLOCK *const x, int mi_row, int mi_col,
- RD_STATS *rd_cost, PARTITION_TYPE partition,
- BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
- int64_t best_rd) {
+static int use_pb_simple_motion_pred_sse(const AV1_COMP *const cpi) {
+ // TODO(debargha, yuec): Not in use, need to implement a speed feature
+ // utilizing this data point, and replace '0' by the corresponding speed
+ // feature flag.
+ return 0 && !frame_is_intra_only(&cpi->common);
+}
+
+static void pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, PARTITION_TYPE partition,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd, int use_nonrd_pick_mode) {
AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
TileInfo *const tile_info = &tile_data->tile_info;
@@ -542,6 +520,10 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode;
int i, orig_rdmult;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_sb_modes_time);
+#endif
+
if (best_rd < 0) {
ctx->rdcost = INT64_MAX;
ctx->skip = 0;
@@ -602,21 +584,32 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
return;
}
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
x->source_variance = av1_high_get_sby_perpixel_variance(
cpi, &x->plane[0].src, bsize, xd->bd);
} else {
x->source_variance =
av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
}
+ if (use_pb_simple_motion_pred_sse(cpi)) {
+ const MV ref_mv_full = { .row = 0, .col = 0 };
+ unsigned int var = 0;
+ av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full, 0,
+ &x->simple_motion_pred_sse, &var);
+ }
+
// If the threshold for disabling wedge search is zero, it means the feature
// should not be used. Use a value that will always succeed in the check.
if (cpi->sf.disable_wedge_search_edge_thresh == 0) {
x->edge_strength = UINT16_MAX;
+ x->edge_strength_x = UINT16_MAX;
+ x->edge_strength_y = UINT16_MAX;
} else {
- x->edge_strength =
- edge_strength(&x->plane[0].src, bsize,
- xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd);
+ EdgeInfo ei =
+ edge_info(&x->plane[0].src, bsize, is_cur_buf_hbd(xd), xd->bd);
+ x->edge_strength = ei.magnitude;
+ x->edge_strength_x = ei.x;
+ x->edge_strength_y = ei.y;
}
// Save rdmult before it might be changed, so it can be restored later.
orig_rdmult = x->rdmult;
@@ -644,22 +637,35 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
// Find best coding mode & reconstruct the MB so it is available
// as a predictor for MBs that follow in the SB
if (frame_is_intra_only(cm)) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
av1_rd_pick_intra_mode_sb(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx,
best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_rd_pick_intra_mode_sb_time);
+#endif
} else {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
rd_cost, bsize, ctx, best_rd);
-#if CONFIG_ONE_PASS_SVM
- ctx->seg_feat = 1;
-#endif
} else {
- av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
- bsize, ctx, best_rd);
-#if CONFIG_ONE_PASS_SVM
- ctx->seg_feat = 0;
-#endif
+ // TODO(kyslov): do the same for pick_intra_mode and
+ // pick_inter_mode_sb_seg_skip
+ if (use_nonrd_pick_mode) {
+ av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ bsize, ctx, best_rd);
+ } else {
+ av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ bsize, ctx, best_rd);
+ }
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_rd_pick_inter_mode_sb_time);
+#endif
}
// Examine the resulting rate and for AQ mode 2 make a segment choice.
@@ -680,6 +686,10 @@ static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
ctx->rate = rd_cost->rate;
ctx->dist = rd_cost->dist;
ctx->rdcost = rd_cost->rdcost;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_sb_modes_time);
+#endif
}
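// A minimal sketch (not libaom code) of the bracketed-timing pattern used by
// the CONFIG_COLLECT_COMPONENT_TIMING blocks above. libaom's start_timing()
// and end_timing() take (cpi, component) arguments; this sketch uses a
// simplified signature and clock() purely to show the shape of the pattern.
#include <stdio.h>
#include <time.h>

#define COLLECT_COMPONENT_TIMING 1

#if COLLECT_COMPONENT_TIMING
static clock_t timing_begin;
static double total_seconds;
static void start_timing(void) { timing_begin = clock(); }
static void end_timing(void) {
  total_seconds += (double)(clock() - timing_begin) / CLOCKS_PER_SEC;
}
#else
// Compiles away to nothing when the instrumentation is disabled.
static void start_timing(void) {}
static void end_timing(void) {}
#endif

static void component_under_test(void) {
  volatile long sink = 0;
  for (long i = 0; i < 1000000; ++i) sink += i;  // stand-in workload
}

int main(void) {
  start_timing();
  component_under_test();
  end_timing();
#if COLLECT_COMPONENT_TIMING
  printf("component time: %f s\n", total_seconds);
#endif
  return 0;
}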
static void update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
@@ -1287,11 +1297,13 @@ static void update_stats(const AV1_COMMON *const cm, TileDataEnc *tile_data,
assert(masked_compound_used);
if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
#if CONFIG_ENTROPY_STATS
- ++counts->compound_type[bsize][mbmi->interinter_comp.type - 1];
+ ++counts->compound_type[bsize][mbmi->interinter_comp.type -
+ COMPOUND_WEDGE];
#endif
if (allow_update_cdf) {
update_cdf(fc->compound_type_cdf[bsize],
- mbmi->interinter_comp.type - 1, COMPOUND_TYPES - 1);
+ mbmi->interinter_comp.type - COMPOUND_WEDGE,
+ MASKED_COMPOUND_TYPES);
}
}
}
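// A minimal sketch (not the libaom definitions) of why the hunk above
// replaces "type - 1" with "type - COMPOUND_WEDGE": if the masked compound
// types are the tail of the enum starting at COMPOUND_WEDGE, subtracting
// COMPOUND_WEDGE zero-bases them for counter and CDF tables of size
// MASKED_COMPOUND_TYPES, and "- 1" only works while COMPOUND_WEDGE == 1.
#include <stdio.h>

typedef enum {
  COMPOUND_AVERAGE,
  COMPOUND_WEDGE,
  COMPOUND_DIFFWTD,
  COMPOUND_TYPES,
} CompoundTypeSketch;  // assumed shape, for illustration only
#define MASKED_COMPOUND_TYPES (COMPOUND_TYPES - COMPOUND_WEDGE)

int main(void) {
  int counts[MASKED_COMPOUND_TYPES] = { 0 };
  CompoundTypeSketch t = COMPOUND_DIFFWTD;
  ++counts[t - COMPOUND_WEDGE];  // element 1 of a 2-entry table
  printf("wedge=%d diffwtd=%d\n", counts[0], counts[1]);
  return 0;
}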
@@ -1474,10 +1486,8 @@ static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
encode_superblock(cpi, tile_data, td, tp, dry_run, mi_row, mi_col, bsize,
rate);
- if (dry_run == 0)
- x->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
-
if (!dry_run) {
+ x->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
if (bsize == cpi->common.seq_params.sb_size && mbmi->skip == 1 &&
cpi->common.delta_q_info.delta_lf_present_flag) {
const int frame_lf_count = av1_num_planes(&cpi->common) > 1
@@ -1624,25 +1634,6 @@ static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
}
-// Check to see if the given partition size is allowed for a specified number
-// of mi block rows and columns remaining in the image.
-// If not then return the largest allowed partition size
-static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
- int cols_left, int *bh, int *bw) {
- if (rows_left <= 0 || cols_left <= 0) {
- return AOMMIN(bsize, BLOCK_8X8);
- } else {
- for (; bsize > 0; bsize -= 3) {
- *bh = mi_size_high[bsize];
- *bw = mi_size_wide[bsize];
- if ((*bh <= rows_left) && (*bw <= cols_left)) {
- break;
- }
- }
- }
- return bsize;
-}
-
static void set_partial_sb_partition(const AV1_COMMON *const cm,
MB_MODE_INFO *mi, int bh_in, int bw_in,
int mi_rows_remaining,
@@ -1766,8 +1757,8 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
if (partition != PARTITION_NONE && !splits_below &&
mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
pc_tree->partitioning = PARTITION_NONE;
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
- PARTITION_NONE, bsize, ctx_none, INT64_MAX);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+ PARTITION_NONE, bsize, ctx_none, INT64_MAX, 0);
if (none_rdc.rate < INT_MAX) {
none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
@@ -1779,29 +1770,16 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
pc_tree->partitioning = partition;
}
}
- for (int b = 0; b < 2; ++b) {
- pc_tree->horizontal[b].skip_ref_frame_mask = 0;
- pc_tree->vertical[b].skip_ref_frame_mask = 0;
- }
- for (int b = 0; b < 3; ++b) {
- pc_tree->horizontala[b].skip_ref_frame_mask = 0;
- pc_tree->horizontalb[b].skip_ref_frame_mask = 0;
- pc_tree->verticala[b].skip_ref_frame_mask = 0;
- pc_tree->verticalb[b].skip_ref_frame_mask = 0;
- }
- for (int b = 0; b < 4; ++b) {
- pc_tree->horizontal4[b].skip_ref_frame_mask = 0;
- pc_tree->vertical4[b].skip_ref_frame_mask = 0;
- }
+
switch (partition) {
case PARTITION_NONE:
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
- PARTITION_NONE, bsize, ctx_none, INT64_MAX);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_NONE, bsize, ctx_none, INT64_MAX, 0);
break;
case PARTITION_HORZ:
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
- PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
- INT64_MAX);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[0], INT64_MAX,
+ 0);
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
mi_row + hbs < cm->mi_rows) {
RD_STATS tmp_rdc;
@@ -1810,9 +1788,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
mi_col, subsize, NULL);
- rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
- PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
- INT64_MAX);
+ pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+ INT64_MAX, 0);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
av1_invalid_rd_stats(&last_part_rdc);
break;
@@ -1823,9 +1801,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
}
break;
case PARTITION_VERT:
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
- PARTITION_VERT, subsize, &pc_tree->vertical[0],
- INT64_MAX);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_VERT, subsize, &pc_tree->vertical[0], INT64_MAX,
+ 0);
if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
mi_col + hbs < cm->mi_cols) {
RD_STATS tmp_rdc;
@@ -1834,9 +1812,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
mi_col, subsize, NULL);
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
- PARTITION_VERT, subsize,
- &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+ PARTITION_VERT, subsize,
+ &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX, 0);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
av1_invalid_rd_stats(&last_part_rdc);
break;
@@ -1910,9 +1888,9 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
pc_tree->split[i]->partitioning = PARTITION_NONE;
- rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
- &tmp_rdc, PARTITION_SPLIT, split_subsize,
- &pc_tree->split[i]->none, INT64_MAX);
+ pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+ PARTITION_SPLIT, split_subsize, &pc_tree->split[i]->none,
+ INT64_MAX, 0);
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
@@ -1973,67 +1951,170 @@ static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
*dist = chosen_rdc.dist;
}
-/* clang-format off */
-static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = {
- BLOCK_4X4, // 4x4
- BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 4x8, 8x4, 8x8
- BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 8x16, 16x8, 16x16
- BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32
- BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64
- BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 64x128, 128x64, 128x128
- BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32
- BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16
-};
+// TODO(kyslov): this is now very similar to rd_use_partition (except that it
+// doesn't do the extra search around the suggested partitioning); consider
+// passing a flag to select the non-rd path (similar to encode_sb_row)
+// encode_sb_row)
+static void nonrd_use_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate, int64_t *dist,
+ int do_recon, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ int i;
+ const int pl = (bsize >= BLOCK_8X8)
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+ const PARTITION_TYPE partition =
+ (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS last_part_rdc;
+ PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
-static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = {
- BLOCK_8X8, // 4x4
- BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 4x8, 8x4, 8x8
- BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, // 8x16, 16x8, 16x16
- BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32
- BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 32x64, 64x32, 64x64
- BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 64x128, 128x64, 128x128
- BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 4x16, 16x4, 8x32
- BLOCK_32X32, BLOCK_LARGEST, BLOCK_LARGEST, // 32x8, 16x64, 64x16
-};
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
-// Next square block size less or equal than current block size.
-static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = {
- BLOCK_4X4, // 4x4
- BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x8, 8x4, 8x8
- BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 8x16, 16x8, 16x16
- BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32
- BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64
- BLOCK_64X64, BLOCK_64X64, BLOCK_128X128, // 64x128, 128x64, 128x128
- BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32
- BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16
-};
-/* clang-format on */
-
-// Look at all the mode_info entries for blocks that are part of this
-// partition and find the min and max values for sb_type.
-// At the moment this is designed to work on a superblock but could be
-// adjusted to use a size parameter.
-//
-// The min and max are assumed to have been initialized prior to calling this
-// function so repeat calls can accumulate a min and max of more than one
-// superblock.
-static void get_sb_partition_size_range(const AV1_COMMON *const cm,
- MACROBLOCKD *xd, MB_MODE_INFO **mib,
- BLOCK_SIZE *min_block_size,
- BLOCK_SIZE *max_block_size) {
- int i, j;
- int index = 0;
-
- // Check the sb_type for each block that belongs to this region.
- for (i = 0; i < cm->seq_params.mib_size; ++i) {
- for (j = 0; j < cm->seq_params.mib_size; ++j) {
- MB_MODE_INFO *mi = mib[index + j];
- BLOCK_SIZE sb_type = mi ? mi->sb_type : BLOCK_4X4;
- *min_block_size = AOMMIN(*min_block_size, sb_type);
- *max_block_size = AOMMAX(*max_block_size, sb_type);
- }
- index += xd->mi_stride;
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ av1_invalid_rd_stats(&last_part_rdc);
+
+ pc_tree->partitioning = partition;
+
+ xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
}
+
+ switch (partition) {
+ case PARTITION_NONE:
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_NONE, bsize, ctx_none, INT64_MAX, 1);
+ break;
+ case PARTITION_HORZ:
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[0], INT64_MAX,
+ 1);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_row + hbs < cm->mi_rows) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+ av1_init_rd_stats(&tmp_rdc);
+ update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+ mi_col, subsize, NULL);
+ pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+ INT64_MAX, 1);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_VERT:
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_VERT, subsize, &pc_tree->vertical[0], INT64_MAX,
+ 1);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_col + hbs < cm->mi_cols) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
+ av1_init_rd_stats(&tmp_rdc);
+ update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+ mi_col, subsize, NULL);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+ PARTITION_VERT, subsize,
+ &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX, 1);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_SPLIT:
+ last_part_rdc.rate = 0;
+ last_part_rdc.dist = 0;
+ last_part_rdc.rdcost = 0;
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ int jj = i >> 1, ii = i & 0x01;
+ RD_STATS tmp_rdc;
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ av1_init_rd_stats(&tmp_rdc);
+ nonrd_use_partition(
+ cpi, td, tile_data, mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, i != 3, pc_tree->split[i]);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ }
+ break;
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+ case PARTITION_HORZ_B:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ assert(0 && "Cannot handle extended partition types");
+ default: assert(0); break;
+ }
+
+ if (last_part_rdc.rate < INT_MAX) {
+ last_part_rdc.rate += x->partition_cost[pl][partition];
+ last_part_rdc.rdcost =
+ RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ // We must have chosen a partitioning and encoding or we'll fail later on.
+ // No other opportunities for success.
+ if (bsize == cm->seq_params.sb_size)
+ assert(last_part_rdc.rate < INT_MAX && last_part_rdc.dist < INT64_MAX);
+
+ if (do_recon) {
+ if (bsize == cm->seq_params.sb_size) {
+ // NOTE: To get an estimate of the rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ x->cb_offset = 0;
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ *rate = last_part_rdc.rate;
+ *dist = last_part_rdc.dist;
}
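// A minimal sketch (not libaom code) of the PARTITION_SPLIT index arithmetic
// used in the function above: for quadrant i of a block whose half-size is
// hbs mi units, (i & 1) selects the column half and (i >> 1) the row half.
#include <stdio.h>

int main(void) {
  const int mi_row = 16, mi_col = 32;  // assumed superblock origin
  const int hbs = 8;                   // half of a 16-mi-wide block
  for (int i = 0; i < 4; ++i) {
    const int x_idx = (i & 1) * hbs;
    const int y_idx = (i >> 1) * hbs;
    printf("quadrant %d -> mi_row %d, mi_col %d\n", i, mi_row + y_idx,
           mi_col + x_idx);
  }
  return 0;
}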
// Checks to see if a super block is on a horizontal image edge.
@@ -2090,234 +2171,6 @@ static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
return is_active_v_edge;
}
-// Checks to see if a super block is at the edge of the active image.
-// In most cases this is the "real" edge unless there are formatting
-// bars embedded in the stream.
-static int active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
- return active_h_edge(cpi, mi_row, cpi->common.seq_params.mib_size) ||
- active_v_edge(cpi, mi_col, cpi->common.seq_params.mib_size);
-}
-
-// Performs a motion search in SIMPLE_TRANSLATION mode using
-// reference frame ref. Returns the sad of the result
-static void simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
- int mi_col, BLOCK_SIZE bsize, int ref,
- int num_planes, int use_subpixel) {
- AV1_COMMON *const cm = &cpi->common;
- MACROBLOCKD *xd = &x->e_mbd;
- MB_MODE_INFO *mbmi = xd->mi[0];
-
- mbmi->ref_frame[0] = ref;
- mbmi->ref_frame[1] = NONE_FRAME;
- mbmi->sb_type = bsize;
- mbmi->motion_mode = SIMPLE_TRANSLATION;
-
- YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref);
- const YV12_BUFFER_CONFIG *scaled_ref_frame =
- av1_get_scaled_ref_frame(cpi, ref);
- struct buf_2d backup_yv12;
- // ref_mv is in units of 1/8-pel whereas ref_mv_full is in units of pel
- MV ref_mv = { 0, 0 };
- MV ref_mv_full = { 0, 0 };
- const int step_param = cpi->mv_step_param;
- const MvLimits tmp_mv_limits = x->mv_limits;
- const SEARCH_METHODS search_methods = NSTEP;
- const int do_mesh_search = 0;
- const int sadpb = x->sadperbit16;
- int cost_list[5];
- const int ref_idx = 0;
- int var;
-
- av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
-
- if (scaled_ref_frame) {
- backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
- av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
- num_planes);
- } else {
- av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
- &cm->current_frame.frame_refs[ref - LAST_FRAME].sf,
- num_planes);
- }
-
- // This overwrites the mv_limits so we will need to restore it later.
- av1_set_mv_search_range(&x->mv_limits, &ref_mv);
- var = av1_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param,
- search_methods, do_mesh_search, sadpb,
- cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX,
- 1, mi_col * MI_SIZE, mi_row * MI_SIZE, 0);
- // Restore
- x->mv_limits = tmp_mv_limits;
-
- const int use_subpel_search =
- var < INT_MAX && !cpi->common.cur_frame_force_integer_mv && use_subpixel;
- if (use_subpel_search) {
- int not_used = 0;
- if (cpi->sf.use_accurate_subpel_search) {
- const int pw = block_size_wide[bsize];
- const int ph = block_size_high[bsize];
- cpi->find_fractional_mv_step(
- x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
- x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
- cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
- x->nmv_vec_cost, x->mv_cost_stack, &not_used, &x->pred_sse[ref], NULL,
- NULL, 0, 0, pw, ph, cpi->sf.use_accurate_subpel_search, 1);
- } else {
- cpi->find_fractional_mv_step(
- x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
- x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
- cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
- x->nmv_vec_cost, x->mv_cost_stack, &not_used, &x->pred_sse[ref], NULL,
- NULL, 0, 0, 0, 0, 0, 1);
- }
- } else {
- // Manually convert from units of pixel to 1/8-pixels if we are not doing
- // subpel search
- x->best_mv.as_mv.row *= 8;
- x->best_mv.as_mv.col *= 8;
- }
-
- mbmi->mv[0].as_mv = x->best_mv.as_mv;
-
- // Get a copy of the prediction output
- set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
-
- aom_clear_system_state();
-
- if (scaled_ref_frame) {
- xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
- }
-}
-
-// Look at neighboring blocks and set a min and max partition size based on
-// what they chose.
-static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
- MACROBLOCKD *const xd, int mi_row,
- int mi_col, BLOCK_SIZE *min_block_size,
- BLOCK_SIZE *max_block_size) {
- AV1_COMMON *const cm = &cpi->common;
- MB_MODE_INFO **mi = xd->mi;
- const int left_in_image = xd->left_available && mi[-1];
- const int above_in_image = xd->up_available && mi[-xd->mi_stride];
- const int mi_rows_remaining = tile->mi_row_end - mi_row;
- const int mi_cols_remaining = tile->mi_col_end - mi_col;
- int bh, bw;
- BLOCK_SIZE min_size = BLOCK_4X4;
- BLOCK_SIZE max_size = BLOCK_LARGEST;
-
- // Trap case where we do not have a prediction.
- if (left_in_image || above_in_image ||
- cm->current_frame.frame_type != KEY_FRAME) {
- // Default "min to max" and "max to min"
- min_size = BLOCK_LARGEST;
- max_size = BLOCK_4X4;
-
- // NOTE: each call to get_sb_partition_size_range() uses the previous
- // passed in values for min and max as a starting point.
- // Find the min and max partition used in previous frame at this location
- if (cm->current_frame.frame_type != KEY_FRAME) {
- MB_MODE_INFO **prev_mi =
- &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
- get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size);
- }
- // Find the min and max partition sizes used in the left superblock
- if (left_in_image) {
- MB_MODE_INFO **left_sb_mi = &mi[-cm->seq_params.mib_size];
- get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size);
- }
- // Find the min and max partition sizes used in the above suprblock.
- if (above_in_image) {
- MB_MODE_INFO **above_sb_mi =
- &mi[-xd->mi_stride * cm->seq_params.mib_size];
- get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size);
- }
-
- // Adjust observed min and max for "relaxed" auto partition case.
- if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
- min_size = min_partition_size[min_size];
- max_size = max_partition_size[max_size];
- }
- }
-
- // Check border cases where max and min from neighbors may not be legal.
- max_size = find_partition_size(max_size, mi_rows_remaining, mi_cols_remaining,
- &bh, &bw);
- min_size = AOMMIN(min_size, max_size);
-
- // Test for blocks at the edge of the active image.
- // This may be the actual edge of the image or where there are formatting
- // bars.
- if (active_edge_sb(cpi, mi_row, mi_col)) {
- min_size = BLOCK_4X4;
- } else {
- min_size = AOMMIN(cpi->sf.rd_auto_partition_min_limit, min_size);
- }
-
- // When use_square_partition_only is true, make sure at least one square
- // partition is allowed by selecting the next smaller square size as
- // *min_block_size.
- if (min_size >= cpi->sf.use_square_partition_only_threshold) {
- min_size = AOMMIN(min_size, next_square_size[max_size]);
- }
-
- *min_block_size = AOMMIN(min_size, cm->seq_params.sb_size);
- *max_block_size = AOMMIN(max_size, cm->seq_params.sb_size);
-}
-
-// TODO(jingning) refactor functions setting partition search range
-static void set_partition_range(const AV1_COMMON *const cm,
- const MACROBLOCKD *const xd, int mi_row,
- int mi_col, BLOCK_SIZE bsize,
- BLOCK_SIZE *const min_bs,
- BLOCK_SIZE *const max_bs) {
- const int mi_width = mi_size_wide[bsize];
- const int mi_height = mi_size_high[bsize];
- int idx, idy;
-
- const int idx_str = cm->mi_stride * mi_row + mi_col;
- MB_MODE_INFO **const prev_mi = &cm->prev_mi_grid_visible[idx_str];
- BLOCK_SIZE min_size = cm->seq_params.sb_size; // default values
- BLOCK_SIZE max_size = BLOCK_4X4;
-
- if (prev_mi) {
- for (idy = 0; idy < mi_height; ++idy) {
- for (idx = 0; idx < mi_width; ++idx) {
- const MB_MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx];
- const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
- min_size = AOMMIN(min_size, bs);
- max_size = AOMMAX(max_size, bs);
- }
- }
- }
-
- if (xd->left_available) {
- for (idy = 0; idy < mi_height; ++idy) {
- const MB_MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1];
- const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
- min_size = AOMMIN(min_size, bs);
- max_size = AOMMAX(max_size, bs);
- }
- }
-
- if (xd->up_available) {
- for (idx = 0; idx < mi_width; ++idx) {
- const MB_MODE_INFO *const mi = xd->mi[idx - cm->mi_stride];
- const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
- min_size = AOMMIN(min_size, bs);
- max_size = AOMMAX(max_size, bs);
- }
- }
-
- if (min_size == max_size) {
- min_size = min_partition_size[min_size];
- max_size = max_partition_size[max_size];
- }
-
- *min_bs = AOMMIN(min_size, cm->seq_params.sb_size);
- *max_bs = AOMMIN(max_size, cm->seq_params.sb_size);
-}
-
static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
}
@@ -2327,56 +2180,6 @@ static INLINE void load_pred_mv(MACROBLOCK *x,
memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
}
-#if CONFIG_FP_MB_STATS
-const int qindex_skip_threshold_lookup[BLOCK_SIZES] = {
- 0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120,
- // TODO(debargha): What are the correct numbers here?
- 130, 130, 150
-};
-const int qindex_split_threshold_lookup[BLOCK_SIZES] = {
- 0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120,
- // TODO(debargha): What are the correct numbers here?
- 160, 160, 240
-};
-const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = {
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6,
- // TODO(debargha): What are the correct numbers here?
- 8, 8, 10
-};
-
-typedef enum {
- MV_ZERO = 0,
- MV_LEFT = 1,
- MV_UP = 2,
- MV_RIGHT = 3,
- MV_DOWN = 4,
- MV_INVALID
-} MOTION_DIRECTION;
-
-static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
- if (fp_byte & FPMB_MOTION_ZERO_MASK) {
- return MV_ZERO;
- } else if (fp_byte & FPMB_MOTION_LEFT_MASK) {
- return MV_LEFT;
- } else if (fp_byte & FPMB_MOTION_RIGHT_MASK) {
- return MV_RIGHT;
- } else if (fp_byte & FPMB_MOTION_UP_MASK) {
- return MV_UP;
- } else {
- return MV_DOWN;
- }
-}
-
-static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
- MOTION_DIRECTION that_mv) {
- if (this_mv == that_mv) {
- return 0;
- } else {
- return abs(this_mv - that_mv) == 2 ? 2 : 1;
- }
-}
-#endif
-
// Try searching for an encoding for the given subblock. Returns zero if the
// rdcost is already too high (to tell the caller not to bother searching for
// encodings of further subblocks)
@@ -2398,9 +2201,9 @@ static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
? INT64_MAX
: (best_rdc->rdcost - sum_rdc->rdcost);
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
- RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
- rdcost_remaining);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
+ RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
+ rdcost_remaining, 0);
if (this_rdc->rate == INT_MAX) {
sum_rdc->rdcost = INT64_MAX;
@@ -2616,8 +2419,8 @@ static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
const int64_t best_remain_rdcost =
best_rdc.rdcost == INT64_MAX ? INT64_MAX
: (best_rdc.rdcost - partition_rd_cost);
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
- PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
+ bsize, ctx_none, best_remain_rdcost, 0);
pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost;
pc_tree->pc_tree_stats.skip = ctx_none->skip;
@@ -2669,6 +2472,17 @@ static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
do_square_split = 0;
}
}
+
+ if (cpi->sf.firstpass_simple_motion_search_early_term &&
+ cm->show_frame && bsize <= BLOCK_32X32 && bsize >= BLOCK_8X8 &&
+ !frame_is_intra_only(cm) && mi_row + mi_step < cm->mi_rows &&
+ mi_col + mi_step < cm->mi_cols && this_rdc.rdcost < INT64_MAX &&
+ this_rdc.rdcost >= 0 && this_rdc.rate < INT_MAX &&
+ this_rdc.rate >= 0 && do_square_split) {
+ av1_firstpass_simple_motion_search_early_term(
+ cpi, x, pc_tree, mi_row, mi_col, bsize, &this_rdc,
+ &do_square_split);
+ }
}
}
@@ -2788,79 +2602,9 @@ static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
}
}
-#define FEATURE_SIZE 19
-static const float two_pass_split_partition_weights_128[FEATURE_SIZE + 1] = {
- 2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f,
- 0.125296f, -1.134961f, 0.862757f, -0.418799f, -0.637666f,
- 0.016232f, 0.345013f, 0.018823f, -0.393394f, -1.130700f,
- 0.695357f, 0.112569f, -0.341975f, -0.513882f, 5.7488966f,
-};
-
-static const float two_pass_split_partition_weights_64[FEATURE_SIZE + 1] = {
- 2.990993f, 0.423273f, -0.926544f, 0.454646f, -0.292698f,
- -1.311632f, -0.284432f, 0.717141f, -0.419257f, -0.574760f,
- -0.674444f, 0.669047f, -0.374255f, 0.380624f, -0.804036f,
- 0.264021f, 0.004163f, 1.896802f, 0.924287f, 0.13490619f,
-};
-
-static const float two_pass_split_partition_weights_32[FEATURE_SIZE + 1] = {
- 2.795181f, -0.136943f, -0.924842f, 0.405330f, -0.463505f,
- -0.584076f, -0.831472f, 0.382985f, -0.597544f, -0.138915f,
- -1.354350f, 0.466035f, -0.553961f, 0.213202f, -1.166429f,
- 0.010776f, -0.096236f, 2.335084f, 1.699857f, -0.58178353f,
-};
-
-static const float two_pass_split_partition_weights_16[FEATURE_SIZE + 1] = {
- 1.987888f, -0.431100f, -1.687703f, 0.262602f, -0.425298f,
- -0.463870f, -1.493457f, 0.470917f, -0.528457f, -0.087700f,
- -1.815092f, 0.152883f, -0.337908f, 0.093679f, -1.548267f,
- -0.042387f, -0.000861f, 2.556746f, 1.619192f, 0.03643292f,
-};
-
-static const float two_pass_split_partition_weights_8[FEATURE_SIZE + 1] = {
- 2.188344f, -0.817528f, -2.119219f, 0.000000f, -0.348167f,
- -0.658074f, -1.960362f, 0.000000f, -0.403080f, 0.282699f,
- -2.061088f, 0.000000f, -0.431919f, -0.127960f, -1.099550f,
- 0.000000f, 0.121622f, 2.017455f, 2.058228f, -0.15475988f,
-};
-
-static const float two_pass_none_partition_weights_128[FEATURE_SIZE + 1] = {
- -1.006689f, 0.777908f, 4.461072f, -0.395782f, -0.014610f,
- -0.853863f, 0.729997f, -0.420477f, 0.282429f, -1.194595f,
- 3.181220f, -0.511416f, 0.117084f, -1.149348f, 1.507990f,
- -0.477212f, 0.202963f, -1.469581f, 0.624461f, -0.89081228f,
-};
-
-static const float two_pass_none_partition_weights_64[FEATURE_SIZE + 1] = {
- -1.241117f, 0.844878f, 5.638803f, -0.489780f, -0.108796f,
- -4.576821f, 1.540624f, -0.477519f, 0.227791f, -1.443968f,
- 1.586911f, -0.505125f, 0.140764f, -0.464194f, 1.466658f,
- -0.641166f, 0.195412f, 1.427905f, 2.080007f, -1.98272777f,
-};
-
-static const float two_pass_none_partition_weights_32[FEATURE_SIZE + 1] = {
- -2.130825f, 0.476023f, 5.907343f, -0.516002f, -0.097471f,
- -2.662754f, 0.614858f, -0.576728f, 0.085261f, -0.031901f,
- 0.727842f, -0.600034f, 0.079326f, 0.324328f, 0.504502f,
- -0.547105f, -0.037670f, 0.304995f, 0.369018f, -2.66299987f,
-};
-
-static const float two_pass_none_partition_weights_16[FEATURE_SIZE + 1] = {
- -1.626410f, 0.872047f, 5.414965f, -0.554781f, -0.084514f,
- -3.020550f, 0.467632f, -0.382280f, 0.199568f, 0.426220f,
- 0.829426f, -0.467100f, 0.153098f, 0.662994f, 0.327545f,
- -0.560106f, -0.141610f, 0.403372f, 0.523991f, -3.02891231f,
-};
-
-static const float two_pass_none_partition_weights_8[FEATURE_SIZE + 1] = {
- -1.463349f, 0.375376f, 4.751430f, 0.000000f, -0.184451f,
- -1.655447f, 0.443214f, 0.000000f, 0.127961f, 0.152435f,
- 0.083288f, 0.000000f, 0.143105f, 0.438012f, 0.073238f,
- 0.000000f, -0.278137f, 0.186134f, 0.073737f, -1.6494962f,
-};
-
// split_score indicates confidence of picking split partition;
// none_score indicates confidence of picking none partition;
+#define FEATURE_SIZE 19
static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
BLOCK_SIZE bsize, int *split_score,
int *none_score) {
@@ -2870,24 +2614,24 @@ static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
switch (bsize) {
case BLOCK_4X4: break;
case BLOCK_8X8:
- split_weights = two_pass_split_partition_weights_8;
- none_weights = two_pass_none_partition_weights_8;
+ split_weights = av1_2pass_split_partition_weights_8;
+ none_weights = av1_2pass_none_partition_weights_8;
break;
case BLOCK_16X16:
- split_weights = two_pass_split_partition_weights_16;
- none_weights = two_pass_none_partition_weights_16;
+ split_weights = av1_2pass_split_partition_weights_16;
+ none_weights = av1_2pass_none_partition_weights_16;
break;
case BLOCK_32X32:
- split_weights = two_pass_split_partition_weights_32;
- none_weights = two_pass_none_partition_weights_32;
+ split_weights = av1_2pass_split_partition_weights_32;
+ none_weights = av1_2pass_none_partition_weights_32;
break;
case BLOCK_64X64:
- split_weights = two_pass_split_partition_weights_64;
- none_weights = two_pass_none_partition_weights_64;
+ split_weights = av1_2pass_split_partition_weights_64;
+ none_weights = av1_2pass_none_partition_weights_64;
break;
case BLOCK_128X128:
- split_weights = two_pass_split_partition_weights_128;
- none_weights = two_pass_none_partition_weights_128;
+ split_weights = av1_2pass_split_partition_weights_128;
+ none_weights = av1_2pass_none_partition_weights_128;
break;
default: assert(0 && "Unexpected bsize.");
}
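// A minimal sketch (not libaom code) of the scoring pattern these weight
// tables feed. The standardized dot product below mirrors the loop visible
// in the removed ml_op_svm_early_term() further down in this diff; the
// two-pass tables likewise carry a trailing bias term (hence the
// FEATURE_SIZE + 1 array lengths), though whether they standardize features
// with a mean/std the same way is not shown in this hunk.
#include <stdio.h>

#define N_FEATURES 3  // illustrative; the real tables use FEATURE_SIZE == 19

static float linear_score(const float *f, const float *w, const float *mean,
                          const float *std) {
  float score = w[N_FEATURES];  // bias stored after the per-feature weights
  for (int i = 0; i < N_FEATURES; ++i)
    score += w[i] * (f[i] - mean[i]) / std[i];
  return score;
}

int main(void) {
  const float f[N_FEATURES] = { 0.5f, 2.0f, -1.0f };  // made-up features
  const float w[N_FEATURES + 1] = { 0.3f, -0.7f, 0.1f, 0.05f };
  const float mean[N_FEATURES] = { 0.0f, 1.0f, 0.0f };
  const float std[N_FEATURES] = { 1.0f, 2.0f, 1.0f };
  printf("score = %f\n", linear_score(f, w, mean, std));
  return 0;
}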
@@ -2981,7 +2725,7 @@ static void ml_prune_rect_partition(const AV1_COMP *const cpi,
// Variance ratios
const MACROBLOCKD *const xd = &x->e_mbd;
int whole_block_variance;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
whole_block_variance = av1_high_get_sby_perpixel_variance(
cpi, &x->plane[0].src, bsize, xd->bd);
} else {
@@ -2999,7 +2743,7 @@ static void ml_prune_rect_partition(const AV1_COMP *const cpi,
const int x_idx = (i & 1) * bw / 2;
const int y_idx = (i >> 1) * bw / 2;
buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
split_variance[i] =
av1_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd);
} else {
@@ -3181,7 +2925,7 @@ static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
src + i * block_size_high[horz_4_bs] * src_stride;
const uint8_t *vert_src = src + i * block_size_wide[vert_4_bs];
unsigned int horz_var, vert_var, sse;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
switch (xd->bd) {
case 10:
horz_var = cpi->fn_ptr[horz_4_bs].vf(
@@ -3340,204 +3084,32 @@ static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
}
#undef FEATURES
-#if CONFIG_ONE_PASS_SVM
-#define FEATURES 24
-static void ml_op_svm_early_term(const AV1_COMP *const cpi,
- const MACROBLOCK *const x,
- const MACROBLOCKD *const xd,
- const PICK_MODE_CONTEXT *ctx_none,
- const RD_STATS *none_rdc, int pb_source_var,
- BLOCK_SIZE bsize, float *const score) {
- const float *ml_weights = NULL, *ml_mean = NULL, *ml_std = NULL;
- if (bsize == BLOCK_128X128) {
- ml_weights = av1_op_svm_early_term_weights_128;
- ml_mean = av1_op_svm_early_term_mean_128;
- ml_std = av1_op_svm_early_term_std_128;
- } else if (bsize == BLOCK_64X64) {
- ml_weights = av1_op_svm_early_term_weights_64;
- ml_mean = av1_op_svm_early_term_mean_64;
- ml_std = av1_op_svm_early_term_std_64;
- } else if (bsize == BLOCK_32X32) {
- ml_weights = av1_op_svm_early_term_weights_32;
- ml_mean = av1_op_svm_early_term_mean_32;
- ml_std = av1_op_svm_early_term_std_32;
- } else if (bsize == BLOCK_16X16) {
- ml_weights = av1_op_svm_early_term_weights_16;
- ml_mean = av1_op_svm_early_term_mean_16;
- ml_std = av1_op_svm_early_term_std_16;
- } else {
- assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 ||
- bsize == BLOCK_32X32 || bsize == BLOCK_8X8);
- }
- if (ml_weights != NULL) {
- // Compute some features
-
- float features[FEATURES] = { 0 };
- int f_idx = 0;
- int r_idx = 0;
-
- // None features
- // Get none stats
- features[f_idx++] = none_rdc->rate;
- features[f_idx++] = none_rdc->dist;
- features[f_idx++] = none_rdc->rdcost;
- features[f_idx++] = ctx_none->skip;
-
- // EOBS
- features[f_idx++] = none_rdc->eob;
- int scaled_eob = none_rdc->eob * 32 * 32;
- features[f_idx++] = (1.0f + none_rdc->eob_0) / (4.0f + scaled_eob);
- features[f_idx++] = (1.0f + none_rdc->eob_1) / (4.0f + scaled_eob);
- features[f_idx++] = (1.0f + none_rdc->eob_2) / (4.0f + scaled_eob);
- features[f_idx++] = (1.0f + none_rdc->eob_3) / (4.0f + scaled_eob);
-
- // Y_RD
- features[f_idx++] = none_rdc->rd;
- int64_t scaled_rd = none_rdc->rd * 32 * 32;
- features[f_idx++] = (1.0f + none_rdc->rd_0) / (4.0f + scaled_rd);
- features[f_idx++] = (1.0f + none_rdc->rd_1) / (4.0f + scaled_rd);
- features[f_idx++] = (1.0f + none_rdc->rd_2) / (4.0f + scaled_rd);
- features[f_idx++] = (1.0f + none_rdc->rd_3) / (4.0f + scaled_rd);
-
- // Q_SQUARED
- features[f_idx++] =
- (x->plane[0].dequant_QTX[0]) * (x->plane[0].dequant_QTX[0]);
-
- // SIZE
- // Get size of surrounding blocks
- int above_size = 18, left_size = 18;
- const MB_MODE_INFO *above_block = xd->above_mbmi;
- const MB_MODE_INFO *left_block = xd->left_mbmi;
-
- if (above_block) {
- above_size = above_block->sb_type;
- }
- if (left_block) {
- left_size = left_block->sb_type;
- }
-
- features[f_idx++] = left_size;
- features[f_idx++] = left_size != 18;
-
- features[f_idx++] = above_size;
- features[f_idx++] = above_size != 18;
-
- // Variance
- // Get variance
- int var = pb_source_var, var_reg[4] = { 0 };
- const int bw = block_size_wide[bsize];
- const int bh = block_size_high[bsize];
- const BLOCK_SIZE split_size = get_partition_subsize(bsize, PARTITION_SPLIT);
- struct buf_2d buf;
- buf.stride = x->plane[0].src.stride;
- for (int i = 0; i < 4; ++i) {
- const int x_idx = (i & 1) * bw / 2;
- const int y_idx = (i >> 1) * bh / 2;
- buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- var_reg[i] =
- av1_high_get_sby_perpixel_variance(cpi, &buf, split_size, xd->bd);
- } else {
- var_reg[i] = av1_get_sby_perpixel_variance(cpi, &buf, split_size);
- }
- }
-
- features[f_idx++] = var;
- for (r_idx = 0; r_idx < 4; r_idx++) {
- features[f_idx] = (var_reg[r_idx] + 1.0f) / (var + 4.0f);
- f_idx++;
- }
-
- assert(f_idx == FEATURES);
-
- // Calculate the score
- *score = 0.0f;
- for (f_idx = 0; f_idx < FEATURES; f_idx++) {
- *score += ml_weights[f_idx] * (features[f_idx] - ml_mean[f_idx]) /
- ml_std[f_idx];
- }
- // Dont forget the bias
- *score += ml_weights[FEATURES];
- }
-}
-#undef FEATURES
-#endif
-
-// Performs a full_pixel_motion_search with a single reference frame and extract
-// the variance of residues. Here features is assumed to be a length 6 array.
-// After this function is called, we will store the following in to features:
-// features[0] = log(1 + dc_q**2/256)
-// features[1] = log(1 + variance_of_residue)
-// for i in [2, 3, 4, 5]:
-// features[i] = log(1 + variance_of_residue_in_block[i]/variance_of_residue)
-static void get_res_var_features(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
- int mi_col, BLOCK_SIZE bsize,
- float *features) {
- // TODO(chiyotsai@google.com): The data this model trained on did not also use
- // SIMPLE_TRANSLATION to build the inter_predictor. Retraining and tuning the
- // model with the correct data should give better performance.
+// Record the ref frames that have been selected by square partition blocks.
+static void update_picked_ref_frames_mask(MACROBLOCK *const x, int ref_type,
+ BLOCK_SIZE bsize, int mib_size,
+ int mi_row, int mi_col) {
assert(mi_size_wide[bsize] == mi_size_high[bsize]);
-
- MACROBLOCKD *xd = &x->e_mbd;
- DECLARE_ALIGNED(16, uint16_t, pred_buffer[MAX_SB_SQUARE]);
- int pred_stride = 128;
-
- // Perform a single motion search in Y_PLANE to make a prediction
- const MV_REFERENCE_FRAME ref =
- cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
- const int use_subpixel = 0;
- const int num_planes = 1;
-
- uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- ? CONVERT_TO_BYTEPTR(pred_buffer)
- : (uint8_t *)pred_buffer;
- xd->plane[0].dst.buf = pred_buf;
- xd->plane[0].dst.stride = pred_stride;
-
- simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, num_planes,
- use_subpixel);
-
- // Start getting the features
- int f_idx = 0;
-
- // Q_INDEX
- const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
- features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
-
- // VARIANCE
- const uint8_t *src = x->plane[0].src.buf;
- const int src_stride = x->plane[0].src.stride;
- unsigned int sse = 0;
-
- // Whole block
- const unsigned int var =
- cpi->fn_ptr[bsize].vf(src, src_stride, pred_buf, pred_stride, &sse);
- features[f_idx++] = logf(1.0f + (float)var);
-
- // Regional
- const int bw = block_size_wide[bsize];
- const int bh = block_size_high[bsize];
- const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
- int r_idx = 0;
- for (r_idx = 0; r_idx < 4; r_idx++) {
- const int x_idx = (r_idx & 1) * bw / 2;
- const int y_idx = (r_idx >> 1) * bh / 2;
- const int src_offset = y_idx * src_stride + x_idx;
- const int pred_offset = y_idx * pred_stride + x_idx;
- const unsigned int sub_var =
- cpi->fn_ptr[subsize].vf(src + src_offset, src_stride,
- pred_buf + pred_offset, pred_stride, &sse);
- const float var_ratio = (1.0f + (float)sub_var) / (4.0f + (float)var);
- features[f_idx++] = var_ratio;
+ const int sb_size_mask = mib_size - 1;
+ const int mi_row_in_sb = mi_row & sb_size_mask;
+ const int mi_col_in_sb = mi_col & sb_size_mask;
+ const int mi_size = mi_size_wide[bsize];
+ for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_size; ++i) {
+ for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_size; ++j) {
+ x->picked_ref_frames_mask[i * 32 + j] |= 1 << ref_type;
+ }
}
}
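// A minimal, self-contained sketch (illustrative only, with a hypothetical
// helper name) of how a consumer could query the mask written above,
// assuming the same fixed row stride of 32 mi units within the superblock:
static int example_sb_ref_was_picked(const int mask[32 * 32],
                                     int mi_row_in_sb, int mi_col_in_sb,
                                     int ref_type) {
  // Each mi unit holds a bitmask of the ref frame types picked so far.
  return (mask[mi_row_in_sb * 32 + mi_col_in_sb] >> ref_type) & 1;
}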
-// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
+// TODO(chiyotsai@google.com): Move these ML-related variables to a separate
+// file to separate the low-level ML logic from the partition logic.
+#define NUM_SIMPLE_MOTION_FEATURES 28
static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
TileDataEnc *tile_data, TOKENEXTRA **tp,
int mi_row, int mi_col, BLOCK_SIZE bsize,
+ BLOCK_SIZE max_sq_part, BLOCK_SIZE min_sq_part,
RD_STATS *rd_cost, int64_t best_rd,
PC_TREE *pc_tree, int64_t *none_rd) {
const AV1_COMMON *const cm = &cpi->common;
@@ -3560,11 +3132,14 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
const int *partition_cost =
pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
- int do_rectangular_split = 1;
+ int do_rectangular_split = cpi->oxcf.enable_rect_partitions;
int64_t cur_none_rd = 0;
int64_t split_rd[4] = { 0, 0, 0, 0 };
int64_t horz_rd[2] = { 0, 0 };
int64_t vert_rd[2] = { 0, 0 };
+ int prune_horz = 0;
+ int prune_vert = 0;
+ int terminate_partition_search = 0;
int split_ctx_is_ready[2] = { 0, 0 };
int horz_ctx_is_ready = 0;
@@ -3585,22 +3160,26 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
const int xss = x->e_mbd.plane[1].subsampling_x;
const int yss = x->e_mbd.plane[1].subsampling_y;
- BLOCK_SIZE min_size = x->min_partition_size;
- BLOCK_SIZE max_size = x->max_partition_size;
-
if (none_rd) *none_rd = 0;
-
-#if CONFIG_FP_MB_STATS
- unsigned int src_diff_var = UINT_MAX;
- int none_complexity = 0;
-#endif
-
int partition_none_allowed = has_rows && has_cols;
- int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
- int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+ int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 &&
+ cpi->oxcf.enable_rect_partitions;
+ int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 &&
+ cpi->oxcf.enable_rect_partitions;
(void)*tp_orig;
+#if CONFIG_COLLECT_PARTITION_STATS
+ int partition_decisions[EXT_PARTITION_TYPES] = { 0 };
+ int partition_attempts[EXT_PARTITION_TYPES] = { 0 };
+ int64_t partition_times[EXT_PARTITION_TYPES] = { 0 };
+ struct aom_usec_timer partition_timer = { 0 };
+ int partition_timer_on = 0;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+ PartitionStats *part_stats = &cpi->partition_stats;
+#endif
+#endif
+
// Override partition costs at the edges of the frame in the same
// way as in read_partition (see decodeframe.c)
if (!(has_rows && has_cols)) {
@@ -3625,6 +3204,7 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
}
partition_cost = tmp_partition_cost;
+ do_square_split &= partition_cost[PARTITION_SPLIT] != INT_MAX;
}
#ifndef NDEBUG
@@ -3647,35 +3227,12 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
x->mb_energy = av1_log_block_var(cpi, x, bsize);
- if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
- const int cb_partition_search_ctrl =
- ((pc_tree->index == 0 || pc_tree->index == 3) +
- get_chessboard_index(cm->current_frame.frame_number)) &
- 0x1;
-
- if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
- set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
- }
-
- // Determine partition types in search according to the speed features.
- // The threshold set here has to be of square block size.
- if (cpi->sf.auto_min_max_partition_size) {
- const int no_partition_allowed = (bsize <= max_size && bsize >= min_size);
- // Note: Further partitioning is NOT allowed when bsize == min_size already.
- const int partition_allowed = (bsize <= max_size && bsize > min_size);
- partition_none_allowed &= no_partition_allowed;
- partition_horz_allowed &= partition_allowed || !has_rows;
- partition_vert_allowed &= partition_allowed || !has_cols;
- do_square_split &= bsize > min_size;
- }
-
if (bsize > cpi->sf.use_square_partition_only_threshold) {
partition_horz_allowed &= !has_rows;
partition_vert_allowed &= !has_cols;
}
- if (bsize > BLOCK_4X4 && x->use_cb_search_range &&
- cpi->sf.auto_min_max_partition_size == 0) {
+ if (bsize > BLOCK_4X4 && x->use_cb_search_range) {
int split_score = 0;
int none_score = 0;
const int score_valid = ml_prune_2pass_split_partition(
@@ -3720,8 +3277,10 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
partition_horz_allowed == 0 && partition_vert_allowed == 0) {
do_square_split = bsize_at_least_8x8;
partition_none_allowed = has_rows && has_cols;
- partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
- partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+ partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 &&
+ cpi->oxcf.enable_rect_partitions;
+ partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 &&
+ cpi->oxcf.enable_rect_partitions;
}
}
@@ -3730,127 +3289,91 @@ static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
-#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
- src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row,
- mi_col, bsize);
- }
-
- // Decide whether we shall split directly and skip searching NONE by using
- // the first pass block statistics
- if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_square_split &&
- partition_none_allowed && src_diff_var > 4 &&
- cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
- int mb_row = mi_row >> 1;
- int mb_col = mi_col >> 1;
- int mb_row_end =
- AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
- int mb_col_end =
- AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
- int r, c;
-
- // compute a complexity measure, basically measure inconsistency of motion
- // vectors obtained from the first pass in the current block
- for (r = mb_row; r < mb_row_end; r++) {
- for (c = mb_col; c < mb_col_end; c++) {
- const int mb_index = r * cm->mb_cols + c;
-
- MOTION_DIRECTION this_mv;
- MOTION_DIRECTION right_mv;
- MOTION_DIRECTION bottom_mv;
-
- this_mv =
- get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
-
- // to its right
- if (c != mb_col_end - 1) {
- right_mv = get_motion_direction_fp(
- cpi->twopass.this_frame_mb_stats[mb_index + 1]);
- none_complexity += get_motion_inconsistency(this_mv, right_mv);
- }
-
- // to its bottom
- if (r != mb_row_end - 1) {
- bottom_mv = get_motion_direction_fp(
- cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
- none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
- }
-
- // do not count its left and top neighbors to avoid double counting
- }
- }
-
- if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
- partition_none_allowed = 0;
- }
- }
-#endif
-
- // Ref frames picked in the [i_th] quarter subblock during square partition
- // RD search. It may be used to prune ref frame selection of rect partitions.
- int ref_frames_used[4] = {
- 0,
- };
-
- MB_MODE_INFO *split_mbmi[4] = { 0 };
-
- // Perform a full_pixel_search and use the residue to estimate whether we
- // should split directly.
- // TODO(chiyotsai@google.com): Try the algorithm on hbd and speed 0.
- // Also try pruning PARTITION_SPLIT
- if (cpi->sf.full_pixel_motion_search_based_split && bsize >= BLOCK_8X8 &&
+  // Use simple_motion_search to prune partitions. This must be done prior to
+  // PARTITION_SPLIT to propagate the initial MVs to a smaller block size.
+ const int try_split_only =
+ cpi->sf.simple_motion_search_split_only && bsize >= BLOCK_8X8 &&
do_square_split && mi_row + mi_size_high[bsize] <= cm->mi_rows &&
mi_col + mi_size_wide[bsize] <= cm->mi_cols && !frame_is_intra_only(cm) &&
- !cm->seq_params.enable_superres) {
- const NN_CONFIG *nn_config = NULL;
- float split_only_thresh = 0.0f;
- if (bsize == BLOCK_128X128) {
- nn_config = &full_pixel_motion_search_based_split_nn_config_128;
- split_only_thresh = full_pixel_motion_search_based_split_thresh_128;
- } else if (bsize == BLOCK_64X64) {
- nn_config = &full_pixel_motion_search_based_split_nn_config_64;
- split_only_thresh = full_pixel_motion_search_based_split_thresh_64;
- } else if (bsize == BLOCK_32X32) {
- nn_config = &full_pixel_motion_search_based_split_nn_config_32;
- split_only_thresh = full_pixel_motion_search_based_split_thresh_32;
- } else if (bsize == BLOCK_16X16) {
- nn_config = &full_pixel_motion_search_based_split_nn_config_16;
- split_only_thresh = full_pixel_motion_search_based_split_thresh_16;
- } else if (bsize == BLOCK_8X8) {
-#if !CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8
- // Disable BLOCK_8X8 for now
- nn_config = &full_pixel_motion_search_based_split_nn_config_8;
- split_only_thresh = full_pixel_motion_search_based_split_thresh_8;
-#endif
- } else {
- assert(0 && "Unexpected block size in full_pixel_motion_based_split");
- }
- if (nn_config) {
- float features[6] = { 0 };
- float score = 0;
- get_res_var_features(cpi, x, mi_row, mi_col, bsize, features);
- av1_nn_predict(features, nn_config, &score);
-
- if (score > split_only_thresh) {
- partition_none_allowed = 0;
- partition_horz_allowed = 0;
- partition_vert_allowed = 0;
- do_rectangular_split = 0;
- }
- }
- }
+ !av1_superres_scaled(cm);
+
+ if (try_split_only) {
+ av1_simple_motion_search_based_split(
+ cpi, x, mi_row, mi_col, bsize, &partition_none_allowed,
+ &partition_horz_allowed, &partition_vert_allowed, &do_rectangular_split,
+ &do_square_split);
+ }
+
+ const int try_prune_rect =
+ cpi->sf.simple_motion_search_prune_rect && !frame_is_intra_only(cm) &&
+ do_rectangular_split &&
+ (do_square_split || partition_none_allowed ||
+ (prune_horz && prune_vert)) &&
+ (partition_horz_allowed || partition_vert_allowed) && bsize >= BLOCK_8X8;
+
+ float simple_motion_features[NUM_SIMPLE_MOTION_FEATURES] = { 0.0f };
+ int simple_motion_features_are_valid = 0;
+
+ if (try_prune_rect) {
+ av1_simple_motion_search_prune_part(
+ cpi, x, pc_tree, mi_row, mi_col, bsize, &partition_none_allowed,
+ &partition_horz_allowed, &partition_vert_allowed, &do_square_split,
+ &do_rectangular_split, &prune_horz, &prune_vert, simple_motion_features,
+ &simple_motion_features_are_valid);
+ }
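+  // The prune_horz / prune_vert flags set above carry forward: they veto the
+  // PARTITION_HORZ and PARTITION_VERT searches below and skip a redundant
+  // ml_prune_rect_partition() call.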
+
+  // Max and min square partition levels are defined as the partition nodes
+  // that the recursive function rd_pick_partition() can reach. To implement
+  // this, only PARTITION_NONE is allowed if the current node equals
+  // min_sq_part, and only PARTITION_SPLIT is allowed if it exceeds
+  // max_sq_part.
+ assert(block_size_wide[min_sq_part] == block_size_high[min_sq_part]);
+ assert(block_size_wide[max_sq_part] == block_size_high[max_sq_part]);
+ assert(min_sq_part <= max_sq_part);
+ assert(block_size_wide[bsize] == block_size_high[bsize]);
+ const int max_partition_size = block_size_wide[max_sq_part];
+ const int min_partition_size = block_size_wide[min_sq_part];
+ const int blksize = block_size_wide[bsize];
+ assert(min_partition_size <= max_partition_size);
+ const int is_le_min_sq_part = blksize <= min_partition_size;
+ const int is_gt_max_sq_part = blksize > max_partition_size;
+ if (is_gt_max_sq_part) {
+    // If the current block size is larger than the max, allow only split.
+ partition_none_allowed = 0;
+ partition_horz_allowed = 0;
+ partition_vert_allowed = 0;
+ do_square_split = 1;
+ } else if (is_le_min_sq_part) {
+    // If the current block size is less than or equal to the min, allow only
+    // the none partition when the block is valid (large enough); otherwise
+    // allow only split.
+ partition_horz_allowed = 0;
+ partition_vert_allowed = 0;
+    // Only disable square split when the current block is not at the picture
+    // boundary; otherwise, inherit the square split flag from the logic above.
+ if (has_rows && has_cols) do_square_split = 0;
+ partition_none_allowed = !do_square_split;
+ }
+ do_square_split &= partition_cost[PARTITION_SPLIT] != INT_MAX;
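+  // Worked example of the gating above (example sizes, not from this change):
+  // with max_sq_part = BLOCK_64X64 and min_sq_part = BLOCK_16X16, a 128x128
+  // node is forced to PARTITION_SPLIT, 64x64 and 32x32 nodes search all
+  // allowed types, and a 16x16 node fully inside the frame is pinned to
+  // PARTITION_NONE.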
BEGIN_PARTITION_SEARCH:
if (x->must_find_valid_partition) {
+ do_square_split =
+ bsize_at_least_8x8 && partition_cost[PARTITION_SPLIT] != INT_MAX;
partition_none_allowed = has_rows && has_cols;
- partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
- partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+ partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8 &&
+ cpi->oxcf.enable_rect_partitions;
+ partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8 &&
+ cpi->oxcf.enable_rect_partitions;
+ terminate_partition_search = 0;
}
// Partition block source pixel variance.
unsigned int pb_source_variance = UINT_MAX;
+  // Partition block SSE after simple motion compensation. Not in use now,
+  // but it will be used by upcoming speed features.
+ unsigned int pb_simple_motion_pred_sse = UINT_MAX;
+ (void)pb_simple_motion_pred_sse;
+
#if CONFIG_DIST_8X8
if (x->using_dist_8x8) {
if (block_size_high[bsize] <= 8) partition_horz_allowed = 0;
@@ -3861,7 +3384,9 @@ BEGIN_PARTITION_SEARCH:
#endif
// PARTITION_NONE
- if (partition_none_allowed) {
+ if (is_le_min_sq_part && has_rows && has_cols) partition_none_allowed = 1;
+ if (!terminate_partition_search && partition_none_allowed &&
+ !is_gt_max_sq_part) {
int pt_cost = 0;
if (bsize_at_least_8x8) {
pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
@@ -3872,17 +3397,32 @@ BEGIN_PARTITION_SEARCH:
const int64_t best_remain_rdcost =
(best_rdc.rdcost == INT64_MAX) ? INT64_MAX
: (best_rdc.rdcost - partition_rd_cost);
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
- PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (best_remain_rdcost >= 0) {
+ partition_attempts[PARTITION_NONE] += 1;
+ aom_usec_timer_start(&partition_timer);
+ partition_timer_on = 1;
+ }
+#endif
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_NONE,
+ bsize, ctx_none, best_remain_rdcost, 0);
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (partition_timer_on) {
+ aom_usec_timer_mark(&partition_timer);
+ int64_t time = aom_usec_timer_elapsed(&partition_timer);
+ partition_times[PARTITION_NONE] += time;
+ partition_timer_on = 0;
+ }
+#endif
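+  // The same timing pattern repeats for every partition type below; as a
+  // minimal sketch of the aom_usec_timer usage in this file:
+  //   struct aom_usec_timer t;
+  //   aom_usec_timer_start(&t);
+  //   ... run the RD search for partition p ...
+  //   aom_usec_timer_mark(&t);
+  //   partition_times[p] += aom_usec_timer_elapsed(&t);  // in microseconds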
pb_source_variance = x->source_variance;
+ pb_simple_motion_pred_sse = x->simple_motion_pred_sse;
if (none_rd) *none_rd = this_rdc.rdcost;
cur_none_rd = this_rdc.rdcost;
if (this_rdc.rate != INT_MAX) {
if (cpi->sf.prune_ref_frame_for_rect_partitions) {
const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
- for (int i = 0; i < 4; ++i) {
- ref_frames_used[i] |= (1 << ref_type);
- }
+ update_picked_ref_frames_mask(x, ref_type, bsize,
+ cm->seq_params.mib_size, mi_row, mi_col);
}
if (bsize_at_least_8x8) {
this_rdc.rate += pt_cost;
@@ -3902,25 +3442,6 @@ BEGIN_PARTITION_SEARCH:
best_rdc = this_rdc;
if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
-#if CONFIG_ONE_PASS_SVM
- // Use ML if the block size is square and >= 16X16
- if (bsize >= BLOCK_16X16 && !frame_is_intra_only(cm) &&
- this_rdc.rate < INT_MAX && this_rdc.rate >= 0 &&
- !ctx_none->seg_feat) {
- // Model Prediction
- float score = 0.0f;
- ml_op_svm_early_term(cpi, x, xd, ctx_none, &this_rdc,
- pb_source_variance, bsize, &score);
-
- // Decide if we want to terminate early
- if (score >= 0) {
- do_square_split = 0;
- do_rectangular_split = 0;
- partition_horz_allowed = 0;
- partition_vert_allowed = 0;
- }
- }
-#endif
if ((do_square_split || do_rectangular_split) &&
!x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
const int use_ml_based_breakout =
@@ -3946,51 +3467,17 @@ BEGIN_PARTITION_SEARCH:
}
}
-#if CONFIG_FP_MB_STATS
- // Check if every 16x16 first pass block statistics has zero
- // motion and the corresponding first pass residue is small enough.
- // If that is the case, check the difference variance between the
- // current frame and the last frame. If the variance is small enough,
- // stop further splitting in RD optimization
- if (cpi->use_fp_mb_stats && do_square_split &&
- cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
- int mb_row = mi_row >> 1;
- int mb_col = mi_col >> 1;
- int mb_row_end =
- AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
- int mb_col_end =
- AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
- int r, c;
-
- int skip = 1;
- for (r = mb_row; r < mb_row_end; r++) {
- for (c = mb_col; c < mb_col_end; c++) {
- const int mb_index = r * cm->mb_cols + c;
- if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
- FPMB_MOTION_ZERO_MASK) ||
- !(cpi->twopass.this_frame_mb_stats[mb_index] &
- FPMB_ERROR_SMALL_MASK)) {
- skip = 0;
- break;
- }
- }
- if (skip == 0) {
- break;
- }
- }
- if (skip) {
- if (src_diff_var == UINT_MAX) {
- set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
- src_diff_var = get_sby_perpixel_diff_variance(
- cpi, &x->plane[0].src, mi_row, mi_col, bsize);
- }
- if (src_diff_var < 8) {
- do_square_split = 0;
- do_rectangular_split = 0;
- }
- }
+ if (cpi->sf.simple_motion_search_early_term_none && cm->show_frame &&
+ !frame_is_intra_only(cm) && bsize >= BLOCK_16X16 &&
+ mi_row + mi_step < cm->mi_rows && mi_col + mi_step < cm->mi_cols &&
+ this_rdc.rdcost < INT64_MAX && this_rdc.rdcost >= 0 &&
+ this_rdc.rate < INT_MAX && this_rdc.rate >= 0 &&
+ (do_square_split || do_rectangular_split)) {
+ av1_simple_motion_search_early_term_none(
+ cpi, x, pc_tree, mi_row, mi_col, bsize, &this_rdc,
+ &terminate_partition_search, simple_motion_features,
+ &simple_motion_features_are_valid);
}
-#endif
}
}
@@ -4001,13 +3488,20 @@ BEGIN_PARTITION_SEARCH:
if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
// PARTITION_SPLIT
- if (do_square_split) {
+ if ((!terminate_partition_search && do_square_split) || is_gt_max_sq_part) {
av1_init_rd_stats(&sum_rdc);
subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
sum_rdc.rate = partition_cost[PARTITION_SPLIT];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
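  // RDCOST forms the Lagrangian cost lambda * rate + dist in fixed point,
  // with lambda derived from x->rdmult. Here sum_rdc starts as the pure
  // signaling cost of PARTITION_SPLIT (distortion 0); the sub-block loop
  // below bails out as soon as the running cost can no longer beat
  // best_rdc.rdcost.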
int idx;
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+ partition_attempts[PARTITION_SPLIT] += 1;
+ aom_usec_timer_start(&partition_timer);
+ partition_timer_on = 1;
+ }
+#endif
for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
const int x_idx = (idx & 1) * mi_step;
const int y_idx = (idx >> 1) * mi_step;
@@ -4022,11 +3516,9 @@ BEGIN_PARTITION_SEARCH:
const int64_t best_remain_rdcost =
best_rdc.rdcost == INT64_MAX ? INT64_MAX
: (best_rdc.rdcost - sum_rdc.rdcost);
- if (cpi->sf.prune_ref_frame_for_rect_partitions)
- pc_tree->split[idx]->none.rate = INT_MAX;
rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
- subsize, &this_rdc, best_remain_rdcost,
- pc_tree->split[idx], p_split_rd);
+ subsize, max_sq_part, min_sq_part, &this_rdc,
+ best_remain_rdcost, pc_tree->split[idx], p_split_rd);
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
@@ -4035,16 +3527,6 @@ BEGIN_PARTITION_SEARCH:
sum_rdc.rate += this_rdc.rate;
sum_rdc.dist += this_rdc.dist;
sum_rdc.rdcost += this_rdc.rdcost;
- if (cpi->sf.prune_ref_frame_for_rect_partitions &&
- pc_tree->split[idx]->none.rate != INT_MAX) {
- const int ref_type =
- av1_ref_frame_type(pc_tree->split[idx]->none.mic.ref_frame);
- ref_frames_used[idx] |= (1 << ref_type);
-
- if (cpi->sf.prune_ref_mode_for_partitions) {
- split_mbmi[idx] = &pc_tree->split[idx]->none.mic;
- }
- }
if (idx <= 1 && (bsize <= BLOCK_8X8 ||
pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic;
@@ -4056,6 +3538,14 @@ BEGIN_PARTITION_SEARCH:
}
}
}
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (partition_timer_on) {
+ aom_usec_timer_mark(&partition_timer);
+ int64_t time = aom_usec_timer_elapsed(&partition_timer);
+ partition_times[PARTITION_SPLIT] += time;
+ partition_timer_on = 0;
+ }
+#endif
const int reached_last_index = (idx == 4);
if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
@@ -4075,108 +3565,19 @@ BEGIN_PARTITION_SEARCH:
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
} // if (do_split)
- pc_tree->horizontal[0].skip_ref_frame_mask = 0;
- pc_tree->horizontal[1].skip_ref_frame_mask = 0;
- pc_tree->vertical[0].skip_ref_frame_mask = 0;
- pc_tree->vertical[1].skip_ref_frame_mask = 0;
- if (cpi->sf.prune_ref_frame_for_rect_partitions) {
- int used_frames;
- used_frames = ref_frames_used[0] | ref_frames_used[1];
- if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames;
- used_frames = ref_frames_used[2] | ref_frames_used[3];
- if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames;
- used_frames = ref_frames_used[0] | ref_frames_used[2];
- if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames;
- used_frames = ref_frames_used[1] | ref_frames_used[3];
- if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames;
- }
-
- for (int i = 0; i < 2; ++i) {
- pc_tree->horizontal[i].ref_selected[0] =
- pc_tree->horizontal[i].ref_selected[1] = NONE_FRAME;
- pc_tree->horizontal[i].mode_selected = -1;
- pc_tree->vertical[i].ref_selected[0] =
- pc_tree->vertical[i].ref_selected[1] = NONE_FRAME;
- pc_tree->vertical[i].mode_selected = -1;
- }
-
- if (cpi->sf.prune_ref_mode_for_partitions) {
- // horizontal partition
- for (int idx = 0; idx < 4; idx += 2) {
- const int horz_idx = idx / 2;
- if (split_mbmi[idx] && split_mbmi[idx + 1] &&
- split_mbmi[idx]->ref_frame[0] > INTRA_FRAME) {
- if (!has_second_ref(split_mbmi[idx])) {
- // Single ref
- if (split_mbmi[idx]->ref_frame[0] ==
- split_mbmi[idx + 1]->ref_frame[0] &&
- !has_second_ref(split_mbmi[idx + 1])) {
- const int ref_type = av1_ref_frame_type(split_mbmi[idx]->ref_frame);
- // Overwrite skip_ref_frame_mask for the current block
- const int used_frames = (1 << ref_type);
- pc_tree->horizontal[horz_idx].skip_ref_frame_mask = ~used_frames;
- pc_tree->horizontal[horz_idx].ref_selected[0] =
- split_mbmi[idx]->ref_frame[0];
-#if 0
- // TODO(zoeliu@gmail.com): To consider the scenario of obmc
- if (split_mbmi[idx]->motion_mode ==
- split_mbmi[idx + 1]->motion_mode &&
- split_mbmi[idx]->motion_mode == SIMPLE_TRANSLATION &&
- split_mbmi[idx]->use_wedge_interintra == 0) {
- pc_tree->horizontal[horz_idx].mode_selected = SIMPLE_TRANSLATION;
- }
-#endif // 0
- }
- } else {
- // TODO(zoeliu@gmail.com): To handle comp ref
- }
- }
- }
- // vertical partition
- for (int idx = 0; idx < 2; ++idx) {
- const int vert_idx = idx;
- if (split_mbmi[idx] && split_mbmi[idx + 2] &&
- split_mbmi[idx]->ref_frame[0] > INTRA_FRAME) {
- if (!has_second_ref(split_mbmi[idx])) {
- // Single ref
- if (split_mbmi[idx]->ref_frame[0] ==
- split_mbmi[idx + 2]->ref_frame[0] &&
- !has_second_ref(split_mbmi[idx + 2])) {
- const int ref_type = av1_ref_frame_type(split_mbmi[idx]->ref_frame);
- // Overwrite skip_ref_frame_mask for the current block
- const int used_frames = (1 << ref_type);
- pc_tree->vertical[vert_idx].skip_ref_frame_mask = ~used_frames;
- pc_tree->vertical[vert_idx].ref_selected[0] =
- split_mbmi[idx]->ref_frame[0];
-#if 0
- // TODO(zoeliu@gmail.com): To consider the scenario of obmc
- if (split_mbmi[idx]->motion_mode ==
- split_mbmi[idx + 2]->motion_mode &&
- split_mbmi[idx]->motion_mode == SIMPLE_TRANSLATION &&
- split_mbmi[idx]->use_wedge_interintra == 0) {
- pc_tree->vertical[vert_idx].mode_selected = SIMPLE_TRANSLATION;
- }
-#endif // 0
- }
- } else {
- // TODO(zoeliu@gmail.com): To handle comp ref
- }
- }
- }
- }
-
- int prune_horz = 0;
- int prune_vert = 0;
if (cpi->sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
- (partition_horz_allowed || partition_vert_allowed)) {
+ (partition_horz_allowed || partition_vert_allowed) &&
+ !(prune_horz || prune_vert)) {
av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
split_rd, &prune_horz, &prune_vert);
}
// PARTITION_HORZ
- if (partition_horz_allowed && !prune_horz &&
- (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
+ assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz_allowed));
+ if (!terminate_partition_search && partition_horz_allowed && !prune_horz &&
+ (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
+ !is_gt_max_sq_part) {
av1_init_rd_stats(&sum_rdc);
subsize = get_partition_subsize(bsize, PARTITION_HORZ);
if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
@@ -4185,14 +3586,20 @@ BEGIN_PARTITION_SEARCH:
pc_tree->horizontal[0].pred_interp_filter =
av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
}
+ sum_rdc.rate = partition_cost[PARTITION_HORZ];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
? INT64_MAX
: (best_rdc.rdcost - sum_rdc.rdcost);
- sum_rdc.rate = partition_cost[PARTITION_HORZ];
- sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
- PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
- best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (best_remain_rdcost >= 0) {
+ partition_attempts[PARTITION_HORZ] += 1;
+ aom_usec_timer_start(&partition_timer);
+ partition_timer_on = 1;
+ }
+#endif
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_HORZ,
+ subsize, &pc_tree->horizontal[0], best_remain_rdcost, 0);
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
@@ -4222,9 +3629,9 @@ BEGIN_PARTITION_SEARCH:
pc_tree->horizontal[1].pred_interp_filter =
av1_extract_interp_filter(ctx_h->mic.interp_filters, 0);
}
- rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
- PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
- best_rdc.rdcost - sum_rdc.rdcost);
+ pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+ best_rdc.rdcost - sum_rdc.rdcost, 0);
horz_rd[1] = this_rdc.rdcost;
if (this_rdc.rate == INT_MAX) {
@@ -4235,6 +3642,14 @@ BEGIN_PARTITION_SEARCH:
sum_rdc.rdcost += this_rdc.rdcost;
}
}
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (partition_timer_on) {
+ aom_usec_timer_mark(&partition_timer);
+ int64_t time = aom_usec_timer_elapsed(&partition_timer);
+ partition_times[PARTITION_HORZ] += time;
+ partition_timer_on = 0;
+ }
+#endif
if (sum_rdc.rdcost < best_rdc.rdcost) {
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -4248,8 +3663,10 @@ BEGIN_PARTITION_SEARCH:
}
// PARTITION_VERT
- if (partition_vert_allowed && !prune_vert &&
- (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) {
+ assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert_allowed));
+ if (!terminate_partition_search && partition_vert_allowed && !prune_vert &&
+ (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step)) &&
+ !is_gt_max_sq_part) {
av1_init_rd_stats(&sum_rdc);
subsize = get_partition_subsize(bsize, PARTITION_VERT);
@@ -4265,9 +3682,15 @@ BEGIN_PARTITION_SEARCH:
const int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
? INT64_MAX
: (best_rdc.rdcost - sum_rdc.rdcost);
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
- PARTITION_VERT, subsize, &pc_tree->vertical[0],
- best_remain_rdcost);
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (best_remain_rdcost >= 0) {
+ partition_attempts[PARTITION_VERT] += 1;
+ aom_usec_timer_start(&partition_timer);
+ partition_timer_on = 1;
+ }
+#endif
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, PARTITION_VERT,
+ subsize, &pc_tree->vertical[0], best_remain_rdcost, 0);
if (this_rdc.rate == INT_MAX) {
sum_rdc.rdcost = INT64_MAX;
@@ -4296,9 +3719,9 @@ BEGIN_PARTITION_SEARCH:
pc_tree->vertical[1].pred_interp_filter =
av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
}
- rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
- PARTITION_VERT, subsize, &pc_tree->vertical[1],
- best_rdc.rdcost - sum_rdc.rdcost);
+ pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+ PARTITION_VERT, subsize, &pc_tree->vertical[1],
+ best_rdc.rdcost - sum_rdc.rdcost, 0);
vert_rd[1] = this_rdc.rdcost;
if (this_rdc.rate == INT_MAX) {
@@ -4309,6 +3732,14 @@ BEGIN_PARTITION_SEARCH:
sum_rdc.rdcost += this_rdc.rdcost;
}
}
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (partition_timer_on) {
+ aom_usec_timer_mark(&partition_timer);
+ int64_t time = aom_usec_timer_elapsed(&partition_timer);
+ partition_times[PARTITION_VERT] += time;
+ partition_timer_on = 0;
+ }
+#endif
if (sum_rdc.rdcost < best_rdc.rdcost) {
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
@@ -4323,7 +3754,7 @@ BEGIN_PARTITION_SEARCH:
if (pb_source_variance == UINT_MAX) {
av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
pb_source_variance = av1_high_get_sby_perpixel_variance(
cpi, &x->plane[0].src, bsize, xd->bd);
} else {
@@ -4332,13 +3763,26 @@ BEGIN_PARTITION_SEARCH:
}
}
+ if (use_pb_simple_motion_pred_sse(cpi) &&
+ pb_simple_motion_pred_sse == UINT_MAX) {
+ const MV ref_mv_full = { .row = 0, .col = 0 };
+ unsigned int var = 0;
+
+ av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full, 0,
+ &pb_simple_motion_pred_sse, &var);
+ }
+
+ assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !do_rectangular_split));
+
const int ext_partition_allowed =
do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
// The standard AB partitions are allowed whenever ext-partition-types are
// allowed
- int horzab_partition_allowed = ext_partition_allowed;
- int vertab_partition_allowed = ext_partition_allowed;
+ int horzab_partition_allowed =
+ ext_partition_allowed & cpi->oxcf.enable_ab_partitions;
+ int vertab_partition_allowed =
+ ext_partition_allowed & cpi->oxcf.enable_ab_partitions;
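+  // AB geometry exercised below, per the rd_test_partition3() arguments:
+  // HORZ_A = two bsize2 blocks over one subsize block, HORZ_B = one subsize
+  // block over two bsize2 blocks, and VERT_A / VERT_B are the transposed
+  // layouts.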
#if CONFIG_DIST_8X8
if (x->using_dist_8x8) {
@@ -4414,9 +3858,9 @@ BEGIN_PARTITION_SEARCH:
if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed &&
partition_horz_allowed && partition_vert_allowed) {
- // TODO(huisu@google.com): x->source_variance may not be the current block's
- // variance. The correct one to use is pb_source_variance.
- // Need to re-train the model to fix it.
+ // TODO(huisu@google.com): x->source_variance may not be the current
+ // block's variance. The correct one to use is pb_source_variance. Need to
+ // re-train the model to fix it.
ml_prune_ab_partition(bsize, pc_tree->partitioning,
get_unsigned_bits(x->source_variance),
best_rdc.rdcost, horz_rd, vert_rd, split_rd,
@@ -4424,8 +3868,14 @@ BEGIN_PARTITION_SEARCH:
&verta_partition_allowed, &vertb_partition_allowed);
}
+ horza_partition_allowed &= cpi->oxcf.enable_ab_partitions;
+ horzb_partition_allowed &= cpi->oxcf.enable_ab_partitions;
+ verta_partition_allowed &= cpi->oxcf.enable_ab_partitions;
+ vertb_partition_allowed &= cpi->oxcf.enable_ab_partitions;
+
// PARTITION_HORZ_A
- if (partition_horz_allowed && horza_partition_allowed) {
+ if (!terminate_partition_search && partition_horz_allowed &&
+ horza_partition_allowed && !is_gt_max_sq_part) {
subsize = get_partition_subsize(bsize, PARTITION_HORZ_A);
pc_tree->horizontala[0].rd_mode_is_ready = 0;
pc_tree->horizontala[1].rd_mode_is_ready = 0;
@@ -4441,56 +3891,37 @@ BEGIN_PARTITION_SEARCH:
pc_tree->horizontala[1].rd_mode_is_ready = 1;
}
}
- for (int i = 0; i < 3; ++i) {
- pc_tree->horizontala[i].skip_ref_frame_mask = 0;
- pc_tree->horizontala[i].ref_selected[0] =
- pc_tree->horizontala[i].ref_selected[1] = NONE_FRAME;
- }
- if (cpi->sf.prune_ref_frame_for_rect_partitions) {
- int used_frames;
- used_frames = ref_frames_used[0];
- if (used_frames)
- pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames;
- used_frames = ref_frames_used[1];
- if (used_frames)
- pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames;
- used_frames = ref_frames_used[2] | ref_frames_used[3];
- if (used_frames)
- pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames;
- }
- if (cpi->sf.prune_ref_mode_for_partitions) {
- // Overwrite skip_ref_frame_mask for the current block
- if (split_mbmi[0] && split_mbmi[0]->ref_frame[0] > INTRA_FRAME &&
- !has_second_ref(split_mbmi[0])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0];
- pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames;
- pc_tree->horizontala[0].ref_selected[0] = split_mbmi[0]->ref_frame[0];
- }
- if (split_mbmi[1] && split_mbmi[1]->ref_frame[0] > INTRA_FRAME &&
- !has_second_ref(split_mbmi[1])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[1]->ref_frame[0];
- pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames;
- pc_tree->horizontala[1].ref_selected[0] = split_mbmi[1]->ref_frame[0];
- }
- if (split_mbmi[2] && split_mbmi[3] &&
- split_mbmi[2]->ref_frame[0] > INTRA_FRAME &&
- split_mbmi[2]->ref_frame[0] == split_mbmi[3]->ref_frame[0] &&
- !has_second_ref(split_mbmi[2]) &&
- !has_second_ref(split_mbmi[3])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[2]->ref_frame[0];
- pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames;
- pc_tree->horizontala[2].ref_selected[0] = split_mbmi[2]->ref_frame[0];
+#if CONFIG_COLLECT_PARTITION_STATS
+ {
+ RD_STATS tmp_sum_rdc;
+ av1_init_rd_stats(&tmp_sum_rdc);
+ tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_A];
+ tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+ if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+ partition_attempts[PARTITION_HORZ_A] += 1;
+ aom_usec_timer_start(&partition_timer);
+ partition_timer_on = 1;
}
}
+#endif
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row,
mi_col + mi_step, bsize2, mi_row + mi_step, mi_col,
subsize);
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (partition_timer_on) {
+ aom_usec_timer_mark(&partition_timer);
+ int64_t time = aom_usec_timer_elapsed(&partition_timer);
+ partition_times[PARTITION_HORZ_A] += time;
+ partition_timer_on = 0;
+ }
+#endif
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
}
// PARTITION_HORZ_B
- if (partition_horz_allowed && horzb_partition_allowed) {
+ if (!terminate_partition_search && partition_horz_allowed &&
+ horzb_partition_allowed && !is_gt_max_sq_part) {
subsize = get_partition_subsize(bsize, PARTITION_HORZ_B);
pc_tree->horizontalb[0].rd_mode_is_ready = 0;
pc_tree->horizontalb[1].rd_mode_is_ready = 0;
@@ -4500,57 +3931,39 @@ BEGIN_PARTITION_SEARCH:
pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B;
pc_tree->horizontalb[0].rd_mode_is_ready = 1;
}
- for (int i = 0; i < 3; ++i) {
- pc_tree->horizontalb[i].skip_ref_frame_mask = 0;
- pc_tree->horizontalb[i].ref_selected[0] =
- pc_tree->horizontalb[i].ref_selected[1] = NONE_FRAME;
- }
- if (cpi->sf.prune_ref_frame_for_rect_partitions) {
- int used_frames;
- used_frames = ref_frames_used[0] | ref_frames_used[1];
- if (used_frames)
- pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames;
- used_frames = ref_frames_used[2];
- if (used_frames)
- pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames;
- used_frames = ref_frames_used[3];
- if (used_frames)
- pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames;
- }
- if (cpi->sf.prune_ref_mode_for_partitions) {
- // Overwrite skip_ref_frame_mask for the current block
- if (split_mbmi[0] && split_mbmi[1] &&
- split_mbmi[0]->ref_frame[0] > INTRA_FRAME &&
- split_mbmi[0]->ref_frame[0] == split_mbmi[1]->ref_frame[0] &&
- !has_second_ref(split_mbmi[0]) &&
- !has_second_ref(split_mbmi[1])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0];
- pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames;
- pc_tree->horizontalb[0].ref_selected[0] = split_mbmi[0]->ref_frame[0];
- }
- if (split_mbmi[2] && split_mbmi[2]->ref_frame[0] > INTRA_FRAME &&
- !has_second_ref(split_mbmi[2])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[2]->ref_frame[0];
- pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames;
- pc_tree->horizontalb[1].ref_selected[0] = split_mbmi[2]->ref_frame[0];
- }
- if (split_mbmi[3] && split_mbmi[3]->ref_frame[0] > INTRA_FRAME &&
- !has_second_ref(split_mbmi[3])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[3]->ref_frame[0];
- pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames;
- pc_tree->horizontalb[2].ref_selected[0] = split_mbmi[3]->ref_frame[0];
+#if CONFIG_COLLECT_PARTITION_STATS
+ {
+ RD_STATS tmp_sum_rdc;
+ av1_init_rd_stats(&tmp_sum_rdc);
+ tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_HORZ_B];
+ tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+ if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+ partition_attempts[PARTITION_HORZ_B] += 1;
+ aom_usec_timer_start(&partition_timer);
+ partition_timer_on = 1;
}
}
+#endif
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
PARTITION_HORZ_B, mi_row, mi_col, subsize,
mi_row + mi_step, mi_col, bsize2, mi_row + mi_step,
mi_col + mi_step, bsize2);
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (partition_timer_on) {
+ aom_usec_timer_mark(&partition_timer);
+ int64_t time = aom_usec_timer_elapsed(&partition_timer);
+ partition_times[PARTITION_HORZ_B] += time;
+ partition_timer_on = 0;
+ }
+#endif
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
}
// PARTITION_VERT_A
- if (partition_vert_allowed && verta_partition_allowed) {
+ if (!terminate_partition_search && partition_vert_allowed &&
+ verta_partition_allowed && !is_gt_max_sq_part) {
subsize = get_partition_subsize(bsize, PARTITION_VERT_A);
pc_tree->verticala[0].rd_mode_is_ready = 0;
pc_tree->verticala[1].rd_mode_is_ready = 0;
@@ -4560,53 +3973,37 @@ BEGIN_PARTITION_SEARCH:
pc_tree->verticala[0].mic.partition = PARTITION_VERT_A;
pc_tree->verticala[0].rd_mode_is_ready = 1;
}
- for (int i = 0; i < 3; ++i) {
- pc_tree->verticala[i].skip_ref_frame_mask = 0;
- pc_tree->verticala[i].ref_selected[0] =
- pc_tree->verticala[i].ref_selected[1] = NONE_FRAME;
- }
- if (cpi->sf.prune_ref_frame_for_rect_partitions) {
- int used_frames;
- used_frames = ref_frames_used[0];
- if (used_frames) pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames;
- used_frames = ref_frames_used[2];
- if (used_frames) pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames;
- used_frames = ref_frames_used[1] | ref_frames_used[3];
- if (used_frames) pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames;
- }
- if (cpi->sf.prune_ref_mode_for_partitions) {
- // Overwrite skip_ref_frame_mask for the current block
- if (split_mbmi[0] && split_mbmi[0]->ref_frame[0] > INTRA_FRAME &&
- !has_second_ref(split_mbmi[0])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0];
- pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames;
- pc_tree->verticala[0].ref_selected[0] = split_mbmi[0]->ref_frame[0];
- }
- if (split_mbmi[2] && split_mbmi[2]->ref_frame[0] > INTRA_FRAME &&
- !has_second_ref(split_mbmi[2])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[2]->ref_frame[0];
- pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames;
- pc_tree->verticala[1].ref_selected[0] = split_mbmi[2]->ref_frame[0];
- }
- if (split_mbmi[1] && split_mbmi[3] &&
- split_mbmi[1]->ref_frame[0] > INTRA_FRAME &&
- split_mbmi[1]->ref_frame[0] == split_mbmi[3]->ref_frame[0] &&
- !has_second_ref(split_mbmi[1]) &&
- !has_second_ref(split_mbmi[3])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[1]->ref_frame[0];
- pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames;
- pc_tree->verticala[2].ref_selected[0] = split_mbmi[1]->ref_frame[0];
+#if CONFIG_COLLECT_PARTITION_STATS
+ {
+ RD_STATS tmp_sum_rdc;
+ av1_init_rd_stats(&tmp_sum_rdc);
+ tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_A];
+ tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+ if (best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+ partition_attempts[PARTITION_VERT_A] += 1;
+ aom_usec_timer_start(&partition_timer);
+ partition_timer_on = 1;
}
}
+#endif
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
PARTITION_VERT_A, mi_row, mi_col, bsize2,
mi_row + mi_step, mi_col, bsize2, mi_row,
mi_col + mi_step, subsize);
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (partition_timer_on) {
+ aom_usec_timer_mark(&partition_timer);
+ int64_t time = aom_usec_timer_elapsed(&partition_timer);
+ partition_times[PARTITION_VERT_A] += time;
+ partition_timer_on = 0;
+ }
+#endif
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
}
// PARTITION_VERT_B
- if (partition_vert_allowed && vertb_partition_allowed) {
+ if (!terminate_partition_search && partition_vert_allowed &&
+ vertb_partition_allowed && !is_gt_max_sq_part) {
subsize = get_partition_subsize(bsize, PARTITION_VERT_B);
pc_tree->verticalb[0].rd_mode_is_ready = 0;
pc_tree->verticalb[1].rd_mode_is_ready = 0;
@@ -4616,58 +4013,44 @@ BEGIN_PARTITION_SEARCH:
pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B;
pc_tree->verticalb[0].rd_mode_is_ready = 1;
}
- for (int i = 0; i < 3; ++i) {
- pc_tree->verticalb[i].skip_ref_frame_mask = 0;
- pc_tree->verticalb[i].ref_selected[0] =
- pc_tree->verticalb[i].ref_selected[1] = NONE_FRAME;
- }
- if (cpi->sf.prune_ref_frame_for_rect_partitions) {
- int used_frames;
- used_frames = ref_frames_used[0] | ref_frames_used[2];
- if (used_frames) pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames;
- used_frames = ref_frames_used[1];
- if (used_frames) pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames;
- used_frames = ref_frames_used[3];
- if (used_frames) pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames;
- }
- if (cpi->sf.prune_ref_mode_for_partitions) {
- // Overwrite skip_ref_frame_mask for the current block
- if (split_mbmi[0] && split_mbmi[2] &&
- split_mbmi[0]->ref_frame[0] > INTRA_FRAME &&
- split_mbmi[0]->ref_frame[0] == split_mbmi[2]->ref_frame[0] &&
- !has_second_ref(split_mbmi[0]) &&
- !has_second_ref(split_mbmi[2])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[0]->ref_frame[0];
- pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames;
- pc_tree->verticalb[0].ref_selected[0] = split_mbmi[0]->ref_frame[0];
- }
- if (split_mbmi[1] && split_mbmi[1]->ref_frame[0] > INTRA_FRAME &&
- !has_second_ref(split_mbmi[1])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[1]->ref_frame[0];
- pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames;
- pc_tree->verticalb[1].ref_selected[0] = split_mbmi[1]->ref_frame[0];
- }
- if (split_mbmi[3] && split_mbmi[3]->ref_frame[0] > INTRA_FRAME &&
- !has_second_ref(split_mbmi[3])) { // single ref
- const int used_frames = 1 << (int)split_mbmi[3]->ref_frame[0];
- pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames;
- pc_tree->verticalb[2].ref_selected[0] = split_mbmi[3]->ref_frame[0];
+#if CONFIG_COLLECT_PARTITION_STATS
+ {
+ RD_STATS tmp_sum_rdc;
+ av1_init_rd_stats(&tmp_sum_rdc);
+ tmp_sum_rdc.rate = x->partition_cost[pl][PARTITION_VERT_B];
+ tmp_sum_rdc.rdcost = RDCOST(x->rdmult, tmp_sum_rdc.rate, 0);
+ if (!frame_is_intra_only(cm) &&
+ best_rdc.rdcost - tmp_sum_rdc.rdcost >= 0) {
+ partition_attempts[PARTITION_VERT_B] += 1;
+ aom_usec_timer_start(&partition_timer);
+ partition_timer_on = 1;
}
}
+#endif
rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row,
mi_col + mi_step, bsize2, mi_row + mi_step,
mi_col + mi_step, bsize2);
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (partition_timer_on) {
+ aom_usec_timer_mark(&partition_timer);
+ int64_t time = aom_usec_timer_elapsed(&partition_timer);
+ partition_times[PARTITION_VERT_B] += time;
+ partition_timer_on = 0;
+ }
+#endif
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
}
// partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
// PARTITION_VERT_4 for this block. This is almost the same as
- // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks,
- // so we require that bsize is not BLOCK_128X128.
- const int partition4_allowed =
- ext_partition_allowed && bsize != BLOCK_128X128;
+ // ext_partition_allowed, except that we don't allow 128x32 or 32x128
+ // blocks, so we require that bsize is not BLOCK_128X128.
+ const int partition4_allowed = cpi->oxcf.enable_1to4_partitions &&
+ ext_partition_allowed &&
+ bsize != BLOCK_128X128;
+
int partition_horz4_allowed = partition4_allowed && partition_horz_allowed;
int partition_vert4_allowed = partition4_allowed && partition_vert_allowed;
if (cpi->sf.prune_ext_partition_types_search_level == 2) {
@@ -4699,9 +4082,16 @@ BEGIN_PARTITION_SEARCH:
}
#endif
+ if (blksize < (min_partition_size << 2)) {
+ partition_horz4_allowed = 0;
+ partition_vert4_allowed = 0;
+ }
+
// PARTITION_HORZ_4
- if (partition_horz4_allowed && has_rows &&
- (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
+ assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_horz4_allowed));
+ if (!terminate_partition_search && partition_horz4_allowed && has_rows &&
+ (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step)) &&
+ !is_gt_max_sq_part) {
av1_init_rd_stats(&sum_rdc);
const int quarter_step = mi_size_high[bsize] / 4;
PICK_MODE_CONTEXT *ctx_prev = ctx_none;
@@ -4710,6 +4100,13 @@ BEGIN_PARTITION_SEARCH:
sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+ partition_attempts[PARTITION_HORZ_4] += 1;
+ aom_usec_timer_start(&partition_timer);
+ partition_timer_on = 1;
+ }
+#endif
for (int i = 0; i < 4; ++i) {
const int this_mi_row = mi_row + i * quarter_step;
@@ -4718,13 +4115,6 @@ BEGIN_PARTITION_SEARCH:
PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
ctx_this->rd_mode_is_ready = 0;
- ctx_this->skip_ref_frame_mask = 0;
- if (cpi->sf.prune_ref_frame_for_rect_partitions) {
- const int used_frames = i <= 1
- ? (ref_frames_used[0] | ref_frames_used[1])
- : (ref_frames_used[2] | ref_frames_used[3]);
- if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
- }
if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
PARTITION_HORZ_4, ctx_prev, ctx_this))
@@ -4740,12 +4130,23 @@ BEGIN_PARTITION_SEARCH:
pc_tree->partitioning = PARTITION_HORZ_4;
}
}
+
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (partition_timer_on) {
+ aom_usec_timer_mark(&partition_timer);
+ int64_t time = aom_usec_timer_elapsed(&partition_timer);
+ partition_times[PARTITION_HORZ_4] += time;
+ partition_timer_on = 0;
+ }
+#endif
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
}
// PARTITION_VERT_4
- if (partition_vert4_allowed && has_cols &&
- (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step))) {
+ assert(IMPLIES(!cpi->oxcf.enable_rect_partitions, !partition_vert4_allowed));
+ if (!terminate_partition_search && partition_vert4_allowed && has_cols &&
+ (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step)) &&
+ !is_gt_max_sq_part) {
av1_init_rd_stats(&sum_rdc);
const int quarter_step = mi_size_wide[bsize] / 4;
PICK_MODE_CONTEXT *ctx_prev = ctx_none;
@@ -4754,6 +4155,13 @@ BEGIN_PARTITION_SEARCH:
sum_rdc.rate = partition_cost[PARTITION_VERT_4];
sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (best_rdc.rdcost - sum_rdc.rdcost >= 0) {
+ partition_attempts[PARTITION_VERT_4] += 1;
+ aom_usec_timer_start(&partition_timer);
+ partition_timer_on = 1;
+ }
+#endif
for (int i = 0; i < 4; ++i) {
const int this_mi_col = mi_col + i * quarter_step;
@@ -4762,13 +4170,6 @@ BEGIN_PARTITION_SEARCH:
PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
ctx_this->rd_mode_is_ready = 0;
- ctx_this->skip_ref_frame_mask = 0;
- if (cpi->sf.prune_ref_frame_for_rect_partitions) {
- const int used_frames = i <= 1
- ? (ref_frames_used[0] | ref_frames_used[2])
- : (ref_frames_used[1] | ref_frames_used[3]);
- if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
- }
if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
PARTITION_VERT_4, ctx_prev, ctx_this))
@@ -4784,6 +4185,14 @@ BEGIN_PARTITION_SEARCH:
pc_tree->partitioning = PARTITION_VERT_4;
}
}
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (partition_timer_on) {
+ aom_usec_timer_mark(&partition_timer);
+ int64_t time = aom_usec_timer_elapsed(&partition_timer);
+ partition_times[PARTITION_VERT_4] += time;
+ partition_timer_on = 0;
+ }
+#endif
restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
}
@@ -4791,6 +4200,9 @@ BEGIN_PARTITION_SEARCH:
// Did not find a valid partition, go back and search again, with less
// constraint on which partition types to search.
x->must_find_valid_partition = 1;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+ part_stats->partition_redo += 1;
+#endif
goto BEGIN_PARTITION_SEARCH;
}
@@ -4801,6 +4213,44 @@ BEGIN_PARTITION_SEARCH:
(void)best_rd;
*rd_cost = best_rdc;
+#if CONFIG_COLLECT_PARTITION_STATS
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) {
+ partition_decisions[pc_tree->partitioning] += 1;
+ }
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 1
+ // If CONFIG_COLLECT_PARTITION_STATS is 1, then print out the stats for each
+ // prediction block
+ FILE *f = fopen("data.csv", "a");
+ fprintf(f, "%d,%d,%d,", bsize, cm->show_frame, frame_is_intra_only(cm));
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%d,", partition_decisions[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%d,", partition_attempts[idx]);
+ }
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ fprintf(f, "%ld,", partition_times[idx]);
+ }
+ fprintf(f, "\n");
+ fclose(f);
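+  // Each CSV row is: bsize, show_frame, intra_only, then EXT_PARTITION_TYPES
+  // decision counts, the same number of attempt counts, and the per-type
+  // elapsed times.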
+#endif
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+  // If CONFIG_COLLECT_PARTITION_STATS is 2, then we print out the stats for
+  // the whole clip, so we need to pass the information upstream to the encoder.
+ const int bsize_idx = av1_get_bsize_idx_for_part_stats(bsize);
+ int *agg_attempts = part_stats->partition_attempts[bsize_idx];
+ int *agg_decisions = part_stats->partition_decisions[bsize_idx];
+ int64_t *agg_times = part_stats->partition_times[bsize_idx];
+ for (int idx = 0; idx < EXT_PARTITION_TYPES; idx++) {
+ agg_attempts[idx] += partition_attempts[idx];
+ agg_decisions[idx] += partition_decisions[idx];
+ agg_times[idx] += partition_times[idx];
+ }
+#endif
+
if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
pc_tree->index != 3) {
if (bsize == cm->seq_params.sb_size) {
@@ -4820,19 +4270,23 @@ BEGIN_PARTITION_SEARCH:
assert(tp_orig == *tp);
}
}
+#undef NUM_SIMPLE_MOTION_FEATURES
// Set all the counters as max.
static void init_first_partition_pass_stats_tables(
- FIRST_PARTITION_PASS_STATS *stats) {
+ AV1_COMP *cpi, FIRST_PARTITION_PASS_STATS *stats) {
for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
memset(stats[i].ref0_counts, 0xff, sizeof(stats[i].ref0_counts));
memset(stats[i].ref1_counts, 0xff, sizeof(stats[i].ref1_counts));
stats[i].sample_counts = INT_MAX;
+ if (cpi->sf.use_first_partition_pass_interintra_stats)
+ memset(stats[i].interintra_motion_mode_count, 0xff,
+ sizeof(stats[i].interintra_motion_mode_count));
}
}
-// Minimum number of samples to trigger the
-// mode_pruning_based_on_two_pass_partition_search feature.
+// Minimum number of samples needed to trigger the mode pruning in the
+// two_pass_partition_search feature.
#define FIRST_PARTITION_PASS_MIN_SAMPLES 16
static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
@@ -4847,7 +4301,6 @@ static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
int row, col;
int dr = 0;
- int count = 0;
double r0, rk, beta;
if (tpl_frame->is_valid == 0) return orig_rdmult;
@@ -4864,8 +4317,6 @@ static int get_rdmult_delta(AV1_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
intra_cost += this_stats->intra_cost;
mc_dep_cost += this_stats->mc_dep_cost;
-
- ++count;
}
}
@@ -4955,8 +4406,7 @@ static void first_partition_search_pass(AV1_COMP *cpi, ThreadData *td,
const SPEED_FEATURES *const sf = &cpi->sf;
// Reset the stats tables.
- if (sf->mode_pruning_based_on_two_pass_partition_search)
- av1_zero(x->first_partition_pass_stats);
+ av1_zero(x->first_partition_pass_stats);
AV1_COMMON *const cm = &cpi->common;
const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
@@ -4968,6 +4418,7 @@ static void first_partition_search_pass(AV1_COMP *cpi, ThreadData *td,
x->cb_partition_scan = 0;
x->source_variance = UINT_MAX;
+ x->simple_motion_pred_sse = UINT_MAX;
if (sf->adaptive_pred_interp_filter) {
const int leaf_nodes = 256;
for (int i = 0; i < leaf_nodes; ++i) {
@@ -4996,29 +4447,208 @@ static void first_partition_search_pass(AV1_COMP *cpi, ThreadData *td,
x->use_cb_search_range = 1;
- if (sf->mode_pruning_based_on_two_pass_partition_search) {
- for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
- FIRST_PARTITION_PASS_STATS *const stat =
- &x->first_partition_pass_stats[i];
- if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
- // If there are not enough samples collected, make all available.
- memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts));
- memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts));
- } else if (sf->selective_ref_frame < 3) {
- // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the
- // initial partition scan, so we don't eliminate them.
- stat->ref0_counts[ALTREF2_FRAME] = 0xff;
- stat->ref1_counts[ALTREF2_FRAME] = 0xff;
- stat->ref0_counts[BWDREF_FRAME] = 0xff;
- stat->ref1_counts[BWDREF_FRAME] = 0xff;
+ for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
+ FIRST_PARTITION_PASS_STATS *const stat = &x->first_partition_pass_stats[i];
+ if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
+ // If there are not enough samples collected, make all available.
+ memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts));
+ memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts));
+ if (cpi->sf.use_first_partition_pass_interintra_stats)
+ memset(stat->interintra_motion_mode_count, 0xff,
+ sizeof(stat->interintra_motion_mode_count));
+ } else if (sf->selective_ref_frame < 3) {
+ // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the
+ // initial partition scan, so we don't eliminate them.
+ stat->ref0_counts[ALTREF2_FRAME] = 0xff;
+ stat->ref1_counts[ALTREF2_FRAME] = 0xff;
+ stat->ref0_counts[BWDREF_FRAME] = 0xff;
+ stat->ref1_counts[BWDREF_FRAME] = 0xff;
+ if (cpi->sf.use_first_partition_pass_interintra_stats) {
+ stat->interintra_motion_mode_count[ALTREF2_FRAME] = 0xff;
+ stat->interintra_motion_mode_count[BWDREF_FRAME] = 0xff;
}
}
}
}
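// A minimal standalone sketch (semantics inferred from the memset(0xff)
// pattern above; `Stats` and MIN_SAMPLES are illustrative stand-ins, not the
// encoder's types): saturating a reference counter to 0xff marks that
// reference as always searchable, so the second pass never prunes it.
#include <stdint.h>
#include <string.h>
#define MIN_SAMPLES 16
typedef struct {
  uint8_t ref0_counts[8];
  int sample_counts;
} Stats;
static void make_all_refs_available(Stats *s) {
  if (s->sample_counts < MIN_SAMPLES) {
    // Too few samples to trust the statistics: saturate every counter so
    // that no reference frame is eliminated in the second partition pass.
    memset(s->ref0_counts, 0xff, sizeof(s->ref0_counts));
  }
}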
-static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
- TileDataEnc *tile_data, int mi_row,
- TOKENEXTRA **tp) {
+#define AVG_CDF_WEIGHT_LEFT 3
+#define AVG_CDF_WEIGHT_TOP_RIGHT 1
+
+static void avg_cdf_symbol(aom_cdf_prob *cdf_ptr_left, aom_cdf_prob *cdf_ptr_tr,
+ int num_cdfs, int cdf_stride, int nsymbs,
+ int wt_left, int wt_tr) {
+ for (int i = 0; i < num_cdfs; i++) {
+ for (int j = 0; j <= nsymbs; j++) {
+ cdf_ptr_left[i * cdf_stride + j] =
+ (aom_cdf_prob)(((int)cdf_ptr_left[i * cdf_stride + j] * wt_left +
+ (int)cdf_ptr_tr[i * cdf_stride + j] * wt_tr +
+ ((wt_left + wt_tr) / 2)) /
+ (wt_left + wt_tr));
+ assert(cdf_ptr_left[i * cdf_stride + j] >= 0 &&
+ cdf_ptr_left[i * cdf_stride + j] < CDF_PROB_TOP);
+ }
+ }
+}
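// A standalone sketch (hypothetical probability values; the 3:1 weights
// mirror AVG_CDF_WEIGHT_LEFT / AVG_CDF_WEIGHT_TOP_RIGHT defined above) of the
// rounded weighted average computed per CDF entry in avg_cdf_symbol:
#include <assert.h>
#include <stdint.h>
typedef uint16_t aom_cdf_prob;
static aom_cdf_prob weighted_avg(aom_cdf_prob left, aom_cdf_prob tr,
                                 int wt_left, int wt_tr) {
  // Adding half the divisor makes the integer division round to nearest.
  return (aom_cdf_prob)(((int)left * wt_left + (int)tr * wt_tr +
                         (wt_left + wt_tr) / 2) /
                        (wt_left + wt_tr));
}
int main(void) {
  // (16384 * 3 + 8192 * 1 + 2) / 4 == 57346 / 4 == 14336
  assert(weighted_avg(16384, 8192, 3, 1) == 14336);
  return 0;
}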
+
+#define AVERAGE_CDF(cname_left, cname_tr, nsymbs) \
+ AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, CDF_SIZE(nsymbs))
+
+#define AVG_CDF_STRIDE(cname_left, cname_tr, nsymbs, cdf_stride) \
+ do { \
+ aom_cdf_prob *cdf_ptr_left = (aom_cdf_prob *)cname_left; \
+ aom_cdf_prob *cdf_ptr_tr = (aom_cdf_prob *)cname_tr; \
+ int array_size = (int)sizeof(cname_left) / sizeof(aom_cdf_prob); \
+ int num_cdfs = array_size / cdf_stride; \
+ avg_cdf_symbol(cdf_ptr_left, cdf_ptr_tr, num_cdfs, cdf_stride, nsymbs, \
+ wt_left, wt_tr); \
+ } while (0)
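// A minimal sketch (the array shape and DEMO_CONTEXTS are illustrative, not
// the real table sizes) of how AVG_CDF_STRIDE derives the number of CDFs
// from the array declaration itself via sizeof:
#include <stdint.h>
#include <stdio.h>
typedef uint16_t aom_cdf_prob;
#define CDF_SIZE(n) ((n) + 1)
#define DEMO_CONTEXTS 3
int main(void) {
  aom_cdf_prob demo_cdfs[DEMO_CONTEXTS][CDF_SIZE(2)] = { { 0 } };
  const int cdf_stride = CDF_SIZE(2);
  const int array_size = (int)sizeof(demo_cdfs) / (int)sizeof(aom_cdf_prob);
  const int num_cdfs = array_size / cdf_stride;  // == DEMO_CONTEXTS
  printf("%d CDFs of %d entries each\n", num_cdfs, cdf_stride);
  return 0;
}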
+
+static void avg_nmv(nmv_context *nmv_left, nmv_context *nmv_tr, int wt_left,
+ int wt_tr) {
+ AVERAGE_CDF(nmv_left->joints_cdf, nmv_tr->joints_cdf, 4);
+ for (int i = 0; i < 2; i++) {
+ AVERAGE_CDF(nmv_left->comps[i].classes_cdf, nmv_tr->comps[i].classes_cdf,
+ MV_CLASSES);
+ AVERAGE_CDF(nmv_left->comps[i].class0_fp_cdf,
+ nmv_tr->comps[i].class0_fp_cdf, MV_FP_SIZE);
+ AVERAGE_CDF(nmv_left->comps[i].fp_cdf, nmv_tr->comps[i].fp_cdf, MV_FP_SIZE);
+ AVERAGE_CDF(nmv_left->comps[i].sign_cdf, nmv_tr->comps[i].sign_cdf, 2);
+ AVERAGE_CDF(nmv_left->comps[i].class0_hp_cdf,
+ nmv_tr->comps[i].class0_hp_cdf, 2);
+ AVERAGE_CDF(nmv_left->comps[i].hp_cdf, nmv_tr->comps[i].hp_cdf, 2);
+ AVERAGE_CDF(nmv_left->comps[i].class0_cdf, nmv_tr->comps[i].class0_cdf,
+ CLASS0_SIZE);
+ AVERAGE_CDF(nmv_left->comps[i].bits_cdf, nmv_tr->comps[i].bits_cdf, 2);
+ }
+}
+
+// In the case of row-based multi-threading of the encoder, since we always
+// keep a top-right sync, we can average the top-right SB's CDFs and the
+// left SB's CDFs and use them for the current SB's encoding to improve
+// performance. This function facilitates that averaging and is used only
+// when row-mt is enabled in the encoder.
+static void avg_cdf_symbols(FRAME_CONTEXT *ctx_left, FRAME_CONTEXT *ctx_tr,
+ int wt_left, int wt_tr) {
+ AVERAGE_CDF(ctx_left->txb_skip_cdf, ctx_tr->txb_skip_cdf, 2);
+ AVERAGE_CDF(ctx_left->eob_extra_cdf, ctx_tr->eob_extra_cdf, 2);
+ AVERAGE_CDF(ctx_left->dc_sign_cdf, ctx_tr->dc_sign_cdf, 2);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf16, ctx_tr->eob_flag_cdf16, 5);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf32, ctx_tr->eob_flag_cdf32, 6);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf64, ctx_tr->eob_flag_cdf64, 7);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf128, ctx_tr->eob_flag_cdf128, 8);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf256, ctx_tr->eob_flag_cdf256, 9);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf512, ctx_tr->eob_flag_cdf512, 10);
+ AVERAGE_CDF(ctx_left->eob_flag_cdf1024, ctx_tr->eob_flag_cdf1024, 11);
+ AVERAGE_CDF(ctx_left->coeff_base_eob_cdf, ctx_tr->coeff_base_eob_cdf, 3);
+ AVERAGE_CDF(ctx_left->coeff_base_cdf, ctx_tr->coeff_base_cdf, 4);
+ AVERAGE_CDF(ctx_left->coeff_br_cdf, ctx_tr->coeff_br_cdf, BR_CDF_SIZE);
+ AVERAGE_CDF(ctx_left->newmv_cdf, ctx_tr->newmv_cdf, 2);
+ AVERAGE_CDF(ctx_left->zeromv_cdf, ctx_tr->zeromv_cdf, 2);
+ AVERAGE_CDF(ctx_left->refmv_cdf, ctx_tr->refmv_cdf, 2);
+ AVERAGE_CDF(ctx_left->drl_cdf, ctx_tr->drl_cdf, 2);
+ AVERAGE_CDF(ctx_left->inter_compound_mode_cdf,
+ ctx_tr->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
+ AVERAGE_CDF(ctx_left->compound_type_cdf, ctx_tr->compound_type_cdf,
+ MASKED_COMPOUND_TYPES);
+ AVERAGE_CDF(ctx_left->wedge_idx_cdf, ctx_tr->wedge_idx_cdf, 16);
+ AVERAGE_CDF(ctx_left->interintra_cdf, ctx_tr->interintra_cdf, 2);
+ AVERAGE_CDF(ctx_left->wedge_interintra_cdf, ctx_tr->wedge_interintra_cdf, 2);
+ AVERAGE_CDF(ctx_left->interintra_mode_cdf, ctx_tr->interintra_mode_cdf,
+ INTERINTRA_MODES);
+ AVERAGE_CDF(ctx_left->motion_mode_cdf, ctx_tr->motion_mode_cdf, MOTION_MODES);
+ AVERAGE_CDF(ctx_left->obmc_cdf, ctx_tr->obmc_cdf, 2);
+ AVERAGE_CDF(ctx_left->palette_y_size_cdf, ctx_tr->palette_y_size_cdf,
+ PALETTE_SIZES);
+ AVERAGE_CDF(ctx_left->palette_uv_size_cdf, ctx_tr->palette_uv_size_cdf,
+ PALETTE_SIZES);
+ for (int j = 0; j < PALETTE_SIZES; j++) {
+ int nsymbs = j + PALETTE_MIN_SIZE;
+ AVG_CDF_STRIDE(ctx_left->palette_y_color_index_cdf[j],
+ ctx_tr->palette_y_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ AVG_CDF_STRIDE(ctx_left->palette_uv_color_index_cdf[j],
+ ctx_tr->palette_uv_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ }
+ AVERAGE_CDF(ctx_left->palette_y_mode_cdf, ctx_tr->palette_y_mode_cdf, 2);
+ AVERAGE_CDF(ctx_left->palette_uv_mode_cdf, ctx_tr->palette_uv_mode_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_inter_cdf, ctx_tr->comp_inter_cdf, 2);
+ AVERAGE_CDF(ctx_left->single_ref_cdf, ctx_tr->single_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_ref_type_cdf, ctx_tr->comp_ref_type_cdf, 2);
+ AVERAGE_CDF(ctx_left->uni_comp_ref_cdf, ctx_tr->uni_comp_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_ref_cdf, ctx_tr->comp_ref_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_bwdref_cdf, ctx_tr->comp_bwdref_cdf, 2);
+ AVERAGE_CDF(ctx_left->txfm_partition_cdf, ctx_tr->txfm_partition_cdf, 2);
+ AVERAGE_CDF(ctx_left->compound_index_cdf, ctx_tr->compound_index_cdf, 2);
+ AVERAGE_CDF(ctx_left->comp_group_idx_cdf, ctx_tr->comp_group_idx_cdf, 2);
+ AVERAGE_CDF(ctx_left->skip_mode_cdfs, ctx_tr->skip_mode_cdfs, 2);
+ AVERAGE_CDF(ctx_left->skip_cdfs, ctx_tr->skip_cdfs, 2);
+ AVERAGE_CDF(ctx_left->intra_inter_cdf, ctx_tr->intra_inter_cdf, 2);
+ avg_nmv(&ctx_left->nmvc, &ctx_tr->nmvc, wt_left, wt_tr);
+ avg_nmv(&ctx_left->ndvc, &ctx_tr->ndvc, wt_left, wt_tr);
+ AVERAGE_CDF(ctx_left->intrabc_cdf, ctx_tr->intrabc_cdf, 2);
+ AVERAGE_CDF(ctx_left->seg.tree_cdf, ctx_tr->seg.tree_cdf, MAX_SEGMENTS);
+ AVERAGE_CDF(ctx_left->seg.pred_cdf, ctx_tr->seg.pred_cdf, 2);
+ AVERAGE_CDF(ctx_left->seg.spatial_pred_seg_cdf,
+ ctx_tr->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
+ AVERAGE_CDF(ctx_left->filter_intra_cdfs, ctx_tr->filter_intra_cdfs, 2);
+ AVERAGE_CDF(ctx_left->filter_intra_mode_cdf, ctx_tr->filter_intra_mode_cdf,
+ FILTER_INTRA_MODES);
+ AVERAGE_CDF(ctx_left->switchable_restore_cdf, ctx_tr->switchable_restore_cdf,
+ RESTORE_SWITCHABLE_TYPES);
+ AVERAGE_CDF(ctx_left->wiener_restore_cdf, ctx_tr->wiener_restore_cdf, 2);
+ AVERAGE_CDF(ctx_left->sgrproj_restore_cdf, ctx_tr->sgrproj_restore_cdf, 2);
+ AVERAGE_CDF(ctx_left->y_mode_cdf, ctx_tr->y_mode_cdf, INTRA_MODES);
+ AVG_CDF_STRIDE(ctx_left->uv_mode_cdf[0], ctx_tr->uv_mode_cdf[0],
+ UV_INTRA_MODES - 1, CDF_SIZE(UV_INTRA_MODES));
+ AVERAGE_CDF(ctx_left->uv_mode_cdf[1], ctx_tr->uv_mode_cdf[1], UV_INTRA_MODES);
+ for (int i = 0; i < PARTITION_CONTEXTS; i++) {
+ if (i < 4) {
+ AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 4,
+ CDF_SIZE(10));
+ } else if (i < 16) {
+ AVERAGE_CDF(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 10);
+ } else {
+ AVG_CDF_STRIDE(ctx_left->partition_cdf[i], ctx_tr->partition_cdf[i], 8,
+ CDF_SIZE(10));
+ }
+ }
+ AVERAGE_CDF(ctx_left->switchable_interp_cdf, ctx_tr->switchable_interp_cdf,
+ SWITCHABLE_FILTERS);
+ AVERAGE_CDF(ctx_left->kf_y_cdf, ctx_tr->kf_y_cdf, INTRA_MODES);
+ AVERAGE_CDF(ctx_left->angle_delta_cdf, ctx_tr->angle_delta_cdf,
+ 2 * MAX_ANGLE_DELTA + 1);
+ AVG_CDF_STRIDE(ctx_left->tx_size_cdf[0], ctx_tr->tx_size_cdf[0], MAX_TX_DEPTH,
+ CDF_SIZE(MAX_TX_DEPTH + 1));
+ AVERAGE_CDF(ctx_left->tx_size_cdf[1], ctx_tr->tx_size_cdf[1],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->tx_size_cdf[2], ctx_tr->tx_size_cdf[2],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->tx_size_cdf[3], ctx_tr->tx_size_cdf[3],
+ MAX_TX_DEPTH + 1);
+ AVERAGE_CDF(ctx_left->delta_q_cdf, ctx_tr->delta_q_cdf, DELTA_Q_PROBS + 1);
+ AVERAGE_CDF(ctx_left->delta_lf_cdf, ctx_tr->delta_lf_cdf, DELTA_LF_PROBS + 1);
+ for (int i = 0; i < FRAME_LF_COUNT; i++) {
+ AVERAGE_CDF(ctx_left->delta_lf_multi_cdf[i], ctx_tr->delta_lf_multi_cdf[i],
+ DELTA_LF_PROBS + 1);
+ }
+ AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[1], ctx_tr->intra_ext_tx_cdf[1], 7,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->intra_ext_tx_cdf[2], ctx_tr->intra_ext_tx_cdf[2], 5,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[1], ctx_tr->inter_ext_tx_cdf[1], 16,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[2], ctx_tr->inter_ext_tx_cdf[2], 12,
+ CDF_SIZE(TX_TYPES));
+ AVG_CDF_STRIDE(ctx_left->inter_ext_tx_cdf[3], ctx_tr->inter_ext_tx_cdf[3], 2,
+ CDF_SIZE(TX_TYPES));
+ AVERAGE_CDF(ctx_left->cfl_sign_cdf, ctx_tr->cfl_sign_cdf, CFL_JOINT_SIGNS);
+ AVERAGE_CDF(ctx_left->cfl_alpha_cdf, ctx_tr->cfl_alpha_cdf,
+ CFL_ALPHABET_SIZE);
+}
+
+static void encode_sb_row(AV1_COMP *cpi, ThreadData *td, TileDataEnc *tile_data,
+ int mi_row, TOKENEXTRA **tp, int use_nonrd_mode) {
AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
const TileInfo *const tile_info = &tile_data->tile_info;
@@ -5032,6 +4662,10 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
const int mib_size_log2 = cm->seq_params.mib_size_log2;
const int sb_row = (mi_row - tile_info->mi_row_start) >> mib_size_log2;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_sb_time);
+#endif
+
// Initialize the left context for the new SB row
av1_zero_left_context(xd);
@@ -5049,13 +4683,48 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
mi_col < tile_info->mi_col_end; mi_col += mib_size, sb_col_in_tile++) {
(*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row,
sb_col_in_tile);
- if ((cpi->row_mt == 1) && (tile_info->mi_col_start == mi_col) &&
+ if (tile_data->allow_update_cdf && (cpi->row_mt == 1) &&
(tile_info->mi_row_start != mi_row)) {
- // restore frame context of 1st column sb
- memcpy(xd->tile_ctx, x->backup_tile_ctx, sizeof(*xd->tile_ctx));
+ if ((tile_info->mi_col_start == mi_col)) {
+ // restore frame context of 1st column sb
+ memcpy(xd->tile_ctx, x->row_ctx, sizeof(*xd->tile_ctx));
+ } else {
+ int wt_left = AVG_CDF_WEIGHT_LEFT;
+ int wt_tr = AVG_CDF_WEIGHT_TOP_RIGHT;
+ if (tile_info->mi_col_end > (mi_col + mib_size))
+ avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile, wt_left,
+ wt_tr);
+ else
+ avg_cdf_symbols(xd->tile_ctx, x->row_ctx + sb_col_in_tile - 1,
+ wt_left, wt_tr);
+ }
+ }
+
+ switch (cpi->oxcf.coeff_cost_upd_freq) {
+ case COST_UPD_TILE: // Tile level
+ if (mi_row != tile_info->mi_row_start) break;
+ AOM_FALLTHROUGH_INTENDED;
+ case COST_UPD_SBROW: // SB row level in tile
+ if (mi_col != tile_info->mi_col_start) break;
+ AOM_FALLTHROUGH_INTENDED;
+ case COST_UPD_SB: // SB level
+ av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
+ break;
+ default: assert(0);
+ }
+
+ switch (cpi->oxcf.mode_cost_upd_freq) {
+ case COST_UPD_TILE: // Tile level
+ if (mi_row != tile_info->mi_row_start) break;
+ AOM_FALLTHROUGH_INTENDED;
+ case COST_UPD_SBROW: // SB row level in tile
+ if (mi_col != tile_info->mi_col_start) break;
+ AOM_FALLTHROUGH_INTENDED;
+ case COST_UPD_SB: // SB level
+ av1_fill_mode_rates(cm, x, xd->tile_ctx);
+ break;
+ default: assert(0);
}
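// A minimal standalone sketch (enum values assumed; the encoder implements
// this with the fallthrough ladder above) of when the coefficient/mode costs
// are refreshed for each configured update frequency:
enum { DEMO_COST_UPD_SB, DEMO_COST_UPD_SBROW, DEMO_COST_UPD_TILE };
static int should_update_costs(int upd_freq, int mi_row, int mi_col,
                               int tile_mi_row_start, int tile_mi_col_start) {
  switch (upd_freq) {
    case DEMO_COST_UPD_TILE:  // once, at the first SB of the tile
      return mi_row == tile_mi_row_start && mi_col == tile_mi_col_start;
    case DEMO_COST_UPD_SBROW:  // at the first SB of every SB row in the tile
      return mi_col == tile_mi_col_start;
    case DEMO_COST_UPD_SB:  // at every superblock
      return 1;
    default: return 0;
  }
}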
- av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
- av1_fill_mode_rates(cm, x, xd->tile_ctx);
if (sf->adaptive_pred_interp_filter) {
for (int i = 0; i < leaf_nodes; ++i) {
@@ -5068,16 +4737,27 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
- av1_zero(x->txb_rd_record_8X8);
- av1_zero(x->txb_rd_record_16X16);
- av1_zero(x->txb_rd_record_32X32);
- av1_zero(x->txb_rd_record_64X64);
- av1_zero(x->txb_rd_record_intra);
+ if (!use_nonrd_mode) {
+ av1_zero(x->txb_rd_record_8X8);
+ av1_zero(x->txb_rd_record_16X16);
+ av1_zero(x->txb_rd_record_32X32);
+ av1_zero(x->txb_rd_record_64X64);
+ av1_zero(x->txb_rd_record_intra);
+ }
+
+ av1_zero(x->picked_ref_frames_mask);
av1_zero(x->pred_mv);
PC_TREE *const pc_root = td->pc_root[mib_size_log2 - MIN_MIB_SIZE_LOG2];
pc_root->index = 0;
+ if ((sf->simple_motion_search_prune_rect ||
+ sf->simple_motion_search_early_term_none ||
+ sf->firstpass_simple_motion_search_early_term) &&
+ !frame_is_intra_only(cm)) {
+ init_simple_motion_search_mvs(pc_root);
+ }
+
const struct segmentation *const seg = &cm->seg;
int seg_skip = 0;
if (seg->enabled) {
@@ -5099,6 +4779,7 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
const int idx_str = cm->mi_stride * mi_row + mi_col;
MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
x->source_variance = UINT_MAX;
+ x->simple_motion_pred_sse = UINT_MAX;
if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
const BLOCK_SIZE bsize = seg_skip ? sb_size : sf->always_this_block_size;
@@ -5112,6 +4793,13 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
&dummy_rate, &dummy_dist, 1, pc_root);
+ } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
+ use_nonrd_mode) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
+ av1_choose_var_based_partitioning(cpi, tile_info, x, mi_row, mi_col);
+ nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, sb_size,
+ &dummy_rate, &dummy_dist, 1, pc_root);
+
} else {
const int orig_rdmult = cpi->rd.RDMULT;
x->cb_rdmult = orig_rdmult;
@@ -5124,58 +4812,87 @@ static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
x->rdmult = x->cb_rdmult;
}
- // If required set upper and lower partition size limits
- if (sf->auto_min_max_partition_size) {
- set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size);
- rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
- &x->min_partition_size, &x->max_partition_size);
- }
-
reset_partition(pc_root, sb_size);
x->use_cb_search_range = 0;
- init_first_partition_pass_stats_tables(x->first_partition_pass_stats);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, first_partition_search_pass_time);
+#endif
+ init_first_partition_pass_stats_tables(cpi,
+ x->first_partition_pass_stats);
// Do the first pass if we need two pass partition search
- if (cpi->sf.two_pass_partition_search &&
+ if (cpi->two_pass_partition_search &&
cpi->sf.use_square_partition_only_threshold > BLOCK_4X4 &&
- mi_row + mi_size_high[sb_size] < cm->mi_rows &&
- mi_col + mi_size_wide[sb_size] < cm->mi_cols &&
+ mi_row + mi_size_high[sb_size] <= cm->mi_rows &&
+ mi_col + mi_size_wide[sb_size] <= cm->mi_cols &&
cm->current_frame.frame_type != KEY_FRAME) {
first_partition_search_pass(cpi, td, tile_data, mi_row, mi_col, tp);
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, first_partition_search_pass_time);
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, rd_pick_partition_time);
+#endif
+ BLOCK_SIZE max_sq_size = BLOCK_128X128;
+ switch (cpi->oxcf.max_partition_size) {
+ case 4: max_sq_size = BLOCK_4X4; break;
+ case 8: max_sq_size = BLOCK_8X8; break;
+ case 16: max_sq_size = BLOCK_16X16; break;
+ case 32: max_sq_size = BLOCK_32X32; break;
+ case 64: max_sq_size = BLOCK_64X64; break;
+ case 128: max_sq_size = BLOCK_128X128; break;
+ default: assert(0); break;
+ }
+ max_sq_size = AOMMIN(max_sq_size, sb_size);
+
+ BLOCK_SIZE min_sq_size = BLOCK_4X4;
+ switch (cpi->oxcf.min_partition_size) {
+ case 4: min_sq_size = BLOCK_4X4; break;
+ case 8: min_sq_size = BLOCK_8X8; break;
+ case 16: min_sq_size = BLOCK_16X16; break;
+ case 32: min_sq_size = BLOCK_32X32; break;
+ case 64: min_sq_size = BLOCK_64X64; break;
+ case 128: min_sq_size = BLOCK_128X128; break;
+ default: assert(0); break;
+ }
+
+ if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
+ float features[FEATURE_SIZE_MAX_MIN_PART_PRED] = { 0.0f };
+
+ av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
+ max_sq_size =
+ AOMMIN(av1_predict_max_partition(cpi, x, features), max_sq_size);
+ }
+
+ min_sq_size = AOMMIN(min_sq_size, max_sq_size);
rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, sb_size,
- &dummy_rdc, INT64_MAX, pc_root, NULL);
+ max_sq_size, min_sq_size, &dummy_rdc, INT64_MAX,
+ pc_root, NULL);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, rd_pick_partition_time);
+#endif
}
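// A compact standalone sketch (assumes the AV1 enum spacing where square
// BLOCK_SIZEs sit three steps apart, BLOCK_4X4 = 0 through
// BLOCK_128X128 = 15) equivalent to the two switch ladders above:
#include <assert.h>
enum { DEMO_BLOCK_4X4 = 0, DEMO_BLOCK_8X8 = 3, DEMO_BLOCK_16X16 = 6,
       DEMO_BLOCK_32X32 = 9, DEMO_BLOCK_64X64 = 12, DEMO_BLOCK_128X128 = 15 };
static int square_block_size(int pixels) {
  // pixels is a power of two in [4, 128]; each doubling past 4 advances the
  // enum by three (one square plus the two rectangular sizes between squares).
  int log2p = 0;
  while ((1 << log2p) < pixels) ++log2p;
  return 3 * (log2p - 2);
}
int main(void) {
  assert(square_block_size(4) == DEMO_BLOCK_4X4);
  assert(square_block_size(64) == DEMO_BLOCK_64X64);
  assert(square_block_size(128) == DEMO_BLOCK_128X128);
  return 0;
}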
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
// TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
- if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
+ if (cpi->sf.inter_mode_rd_model_estimation == 1 && cm->tile_cols == 1 &&
cm->tile_rows == 1) {
av1_inter_mode_data_fit(tile_data, x->rdmult);
}
-#endif
- // Context update for row based multi-threading of encoder is done based on
- // the following conditions:
- // 1. If mib_size_log2==5, context of top-right superblock is used
- // for context modelling. If top-right is not available (in case of tile
- // with width == mib_size_log2==5), top superblock's context is used.
- // 2. If mib_size_log2==4, context of next superblock to top-right
- // superblock is used. Using context of top-right superblock in this case
- // gives high BD Rate drop for smaller resolutions.
- if (cpi->row_mt == 1) {
- int update_context = 0;
- if (mib_size_log2 == 5) {
- update_context = sb_cols_in_tile == 1 || sb_col_in_tile == 1;
- } else if (mib_size_log2 == 4) {
- update_context = sb_cols_in_tile == 1 ||
- (sb_cols_in_tile == 2 && sb_col_in_tile == 1) ||
- sb_col_in_tile == 2;
- }
- if (update_context)
- memcpy(x->backup_tile_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx));
+ if (tile_data->allow_update_cdf && (cpi->row_mt == 1) &&
+ (tile_info->mi_row_end > (mi_row + mib_size))) {
+ if (sb_cols_in_tile == 1)
+ memcpy(x->row_ctx, xd->tile_ctx, sizeof(*xd->tile_ctx));
+ else if (sb_col_in_tile >= 1)
+ memcpy(x->row_ctx + sb_col_in_tile - 1, xd->tile_ctx,
+ sizeof(*xd->tile_ctx));
}
(*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
sb_col_in_tile, sb_cols_in_tile);
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_sb_time);
+#endif
}
static void init_encode_frame_mb_context(AV1_COMP *cpi) {
@@ -5193,18 +4910,18 @@ static void init_encode_frame_mb_context(AV1_COMP *cpi) {
}
static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
- if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME;
- // We will not update the golden frame with an internal overlay frame
- else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) ||
- cpi->rc.is_src_frame_ext_arf)
+ if (frame_is_intra_only(&cpi->common)) {
+ return INTRA_FRAME;
+ } else if ((cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame) ||
+ cpi->rc.is_src_frame_internal_arf) {
+ // We will not update the golden frame with an internal overlay frame
return ALTREF_FRAME;
- else if (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
- cpi->refresh_alt_ref_frame)
+ } else if (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+ cpi->refresh_alt_ref_frame) {
return GOLDEN_FRAME;
- else
- // TODO(zoeliu): To investigate whether a frame_type other than
- // INTRA/ALTREF/GOLDEN/LAST needs to be specified seperately.
+ } else {
return LAST_FRAME;
+ }
}
static TX_MODE select_tx_mode(const AV1_COMP *cpi) {
@@ -5238,7 +4955,6 @@ void av1_alloc_tile_data(AV1_COMP *cpi) {
for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
for (j = 0; j < MAX_MODES; ++j) {
tile_data->thresh_freq_fact[i][j] = 32;
- tile_data->mode_map[i][j] = j;
}
}
}
@@ -5296,7 +5012,7 @@ void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
cpi->tplist[tile_row][tile_col][sb_row_in_tile].start = tok;
- encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+ encode_sb_row(cpi, td, this_tile, mi_row, &tok, cpi->sf.use_nonrd_pick_mode);
cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop = tok;
cpi->tplist[tile_row][tile_col][sb_row_in_tile].count =
@@ -5321,9 +5037,7 @@ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
const TileInfo *const tile_info = &this_tile->tile_info;
int mi_row;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
av1_inter_mode_data_init(this_tile);
-#endif
av1_zero_above_context(cm, &td->mb.e_mbd, tile_info->mi_col_start,
tile_info->mi_col_end, tile_row);
@@ -5363,28 +5077,12 @@ static void encode_tiles(AV1_COMP *cpi) {
cpi->td.intrabc_used = 0;
cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
cpi->td.mb.tile_pb_ctx = &this_tile->tctx;
- cpi->td.mb.backup_tile_ctx = &this_tile->backup_tctx;
av1_encode_tile(cpi, &cpi->td, tile_row, tile_col);
cpi->intrabc_used |= cpi->td.intrabc_used;
}
}
}
-#if CONFIG_FP_MB_STATS
-static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
- AV1_COMMON *cm, uint8_t **this_frame_mb_stats) {
- uint8_t *mb_stats_in =
- firstpass_mb_stats->mb_stats_start +
- cm->current_frame.frame_number * cm->MBs * sizeof(uint8_t);
-
- if (mb_stats_in > firstpass_mb_stats->mb_stats_end) return EOF;
-
- *this_frame_mb_stats = mb_stats_in;
-
- return 1;
-}
-#endif
-
#define GLOBAL_TRANS_TYPES_ENC 3 // highest motion model to search
static int gm_get_params_cost(const WarpedMotionParams *gm,
const WarpedMotionParams *ref_gm, int allow_hp) {
@@ -5441,123 +5139,73 @@ static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm,
(void)frame;
switch (sf->gm_search_type) {
case GM_FULL_SEARCH: return 1;
- case GM_REDUCED_REF_SEARCH:
+ case GM_REDUCED_REF_SEARCH_SKIP_L2_L3:
return !(frame == LAST2_FRAME || frame == LAST3_FRAME);
+ case GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2:
+ return !(frame == LAST2_FRAME || frame == LAST3_FRAME ||
+ (frame == ALTREF2_FRAME));
case GM_DISABLE_SEARCH: return 0;
default: assert(0);
}
return 1;
}
-static const uint8_t ref_frame_flag_list[REF_FRAMES] = { 0,
- AOM_LAST_FLAG,
- AOM_LAST2_FLAG,
- AOM_LAST3_FLAG,
- AOM_GOLD_FLAG,
- AOM_BWD_FLAG,
- AOM_ALT2_FLAG,
- AOM_ALT_FLAG };
-
-// Enforce the number of references for each arbitrary frame limited to
-// (INTER_REFS_PER_FRAME - 1)
+static int get_max_allowed_ref_frames(const AV1_COMP *cpi) {
+ const unsigned int max_allowed_refs_for_given_speed =
+ (cpi->sf.selective_ref_frame >= 3) ? INTER_REFS_PER_FRAME - 1
+ : INTER_REFS_PER_FRAME;
+ return AOMMIN(max_allowed_refs_for_given_speed,
+ cpi->oxcf.max_reference_frames);
+}
+
+// Enforce the number of references for each arbitrary frame based on user
+// options and speed.
static void enforce_max_ref_frames(AV1_COMP *cpi) {
- AV1_COMMON *const cm = &cpi->common;
MV_REFERENCE_FRAME ref_frame;
int total_valid_refs = 0;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
total_valid_refs++;
+ }
}
- // NOTE(zoeliu): When all the possible reference frames are availble, we
- // reduce the number of reference frames by 1, following the rules of:
- // (1) Retain GOLDEN_FARME/ALTEF_FRAME;
- // (2) Check the earliest 2 remaining reference frames, and remove the one
- // with the lower quality factor, otherwise if both have been coded at
- // the same quality level, remove the earliest reference frame.
-
- if (total_valid_refs == INTER_REFS_PER_FRAME) {
- unsigned int min_ref_order_hint = UINT_MAX;
- unsigned int second_min_ref_order_hint = UINT_MAX;
- MV_REFERENCE_FRAME earliest_ref_frames[2] = { LAST3_FRAME, LAST2_FRAME };
- const RefCntBuffer *earliest_bufs[2] = { NULL };
-
- // Locate the earliest two reference frames except GOLDEN/ALTREF.
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- // Retain GOLDEN/ALTERF
- if (ref_frame == GOLDEN_FRAME || ref_frame == ALTREF_FRAME) continue;
-
- const RefCntBuffer *const buf =
- cm->current_frame.frame_refs[ref_frame - LAST_FRAME].buf;
- if (buf != NULL) {
- const unsigned int ref_order_hint = buf->order_hint;
-
- if (min_ref_order_hint == UINT_MAX) {
- min_ref_order_hint = ref_order_hint;
- earliest_ref_frames[0] = ref_frame;
- earliest_bufs[0] = buf;
- } else {
- if (get_relative_dist(&cm->seq_params.order_hint_info, ref_order_hint,
- min_ref_order_hint) < 0) {
- second_min_ref_order_hint = min_ref_order_hint;
- earliest_ref_frames[1] = earliest_ref_frames[0];
- earliest_bufs[1] = earliest_bufs[0];
-
- min_ref_order_hint = ref_order_hint;
- earliest_ref_frames[0] = ref_frame;
- earliest_bufs[0] = buf;
- } else if (second_min_ref_order_hint == UINT_MAX ||
- get_relative_dist(&cm->seq_params.order_hint_info,
- ref_order_hint,
- second_min_ref_order_hint) < 0) {
- second_min_ref_order_hint = ref_order_hint;
- earliest_ref_frames[1] = ref_frame;
- earliest_bufs[1] = buf;
- }
- }
- }
+ const int max_allowed_refs = get_max_allowed_ref_frames(cpi);
+
+ // When more references than 'max_allowed_refs' are available, we reduce
+ // the number of reference frames one at a time, following this order.
+ const MV_REFERENCE_FRAME disable_order[] = {
+ LAST3_FRAME,
+ LAST2_FRAME,
+ ALTREF2_FRAME,
+ GOLDEN_FRAME,
+ };
+
+ for (int i = 0; i < 4 && total_valid_refs > max_allowed_refs; ++i) {
+ const MV_REFERENCE_FRAME ref_frame_to_disable = disable_order[i];
+
+ if (!(cpi->ref_frame_flags &
+ av1_ref_frame_flag_list[ref_frame_to_disable])) {
+ continue;
}
- // Check the coding quality factors of the two earliest reference frames.
- RATE_FACTOR_LEVEL ref_rf_level[2];
- double ref_rf_deltas[2];
- for (int i = 0; i < 2; ++i) {
- ref_rf_level[i] = earliest_bufs[i]->frame_rf_level;
- ref_rf_deltas[i] = rate_factor_deltas[ref_rf_level[i]];
- }
- (void)ref_rf_level;
- (void)ref_rf_deltas;
-
-#define USE_RF_LEVEL_TO_ENFORCE 1
-#if USE_RF_LEVEL_TO_ENFORCE
- // If both earliest two reference frames are coded using the same rate-
- // factor, disable the earliest reference frame; Otherwise disable the
- // reference frame that uses a lower rate-factor delta.
- const MV_REFERENCE_FRAME ref_frame_to_disable =
- (ref_rf_deltas[0] <= ref_rf_deltas[1]) ? earliest_ref_frames[0]
- : earliest_ref_frames[1];
-#else
- // Always disable the earliest reference frame
- const MV_REFERENCE_FRAME ref_frame_to_disable = earliest_ref_frames[0];
-#endif // USE_RF_LEVEL_TO_ENFORCE
-#undef USE_RF_LEVEL_TO_ENFORCE
switch (ref_frame_to_disable) {
- case LAST_FRAME: cpi->ref_frame_flags &= ~AOM_LAST_FLAG; break;
- case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break;
case LAST3_FRAME: cpi->ref_frame_flags &= ~AOM_LAST3_FLAG; break;
- case BWDREF_FRAME: cpi->ref_frame_flags &= ~AOM_BWD_FLAG; break;
+ case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break;
case ALTREF2_FRAME: cpi->ref_frame_flags &= ~AOM_ALT2_FLAG; break;
- default: break;
+ case GOLDEN_FRAME: cpi->ref_frame_flags &= ~AOM_GOLD_FLAG; break;
+ default: assert(0);
}
+ --total_valid_refs;
}
+ assert(total_valid_refs <= max_allowed_refs);
}
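// A standalone worked example (the flag bit values are illustrative stand-ins
// for the AOM_*_FLAG masks) of the pruning loop above: with all seven
// references available and a cap of five, LAST3 and then LAST2 are dropped.
#include <assert.h>
enum { F_LAST = 1, F_LAST2 = 2, F_LAST3 = 4, F_GOLD = 8,
       F_BWD = 16, F_ALT2 = 32, F_ALT = 64 };
int main(void) {
  int flags = F_LAST | F_LAST2 | F_LAST3 | F_GOLD | F_BWD | F_ALT2 | F_ALT;
  const int disable_order[] = { F_LAST3, F_LAST2, F_ALT2, F_GOLD };
  int valid = 7;
  const int max_allowed = 5;
  for (int i = 0; i < 4 && valid > max_allowed; ++i) {
    if (!(flags & disable_order[i])) continue;
    flags &= ~disable_order[i];  // drop LAST3, then LAST2
    --valid;
  }
  assert(valid == 5);
  assert(!(flags & F_LAST3) && !(flags & F_LAST2));
  assert(flags & (F_LAST | F_GOLD | F_BWD | F_ALT2 | F_ALT));
  return 0;
}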
static INLINE int av1_refs_are_one_sided(const AV1_COMMON *cm) {
assert(!frame_is_intra_only(cm));
int one_sided_refs = 1;
- for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
- const RefCntBuffer *const buf = cm->current_frame.frame_refs[ref].buf;
+ for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
if (buf == NULL) continue;
const int ref_order_hint = buf->order_hint;
@@ -5577,9 +5225,9 @@ static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm,
if (!skip_mode_info->skip_mode_allowed) return;
const RefCntBuffer *const buf_0 =
- cm->current_frame.frame_refs[skip_mode_info->ref_frame_idx_0].buf;
+ get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_0);
const RefCntBuffer *const buf_1 =
- cm->current_frame.frame_refs[skip_mode_info->ref_frame_idx_1].buf;
+ get_ref_frame_buf(cm, LAST_FRAME + skip_mode_info->ref_frame_idx_1);
assert(buf_0 != NULL && buf_1 != NULL);
ref_order_hint[0] = buf_0->order_hint;
@@ -5666,9 +5314,10 @@ static void encode_frame_internal(AV1_COMP *cpi) {
av1_zero(*td->counts);
av1_zero(rdc->comp_pred_diff);
+ // Two pass partition search can be enabled/disabled for different frames.
+ // Reset this data at frame level to avoid any incorrect usage.
+ init_first_partition_pass_stats_tables(cpi, x->first_partition_pass_stats);
- // Allow intrabc when screen content tools are enabled.
- cm->allow_intrabc = cm->allow_screen_content_tools;
// Reset the flag.
cpi->intrabc_used = 0;
// Need to disable intrabc when superres is selected
@@ -5676,6 +5325,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {
cm->allow_intrabc = 0;
}
+ cm->allow_intrabc &= (cpi->oxcf.enable_intrabc);
+
if (cpi->oxcf.pass != 1 && av1_use_hash_me(cm)) {
// add to hash table
const int pic_width = cpi->source->y_crop_width;
@@ -5760,7 +5411,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
if (xd->lossless[i]) {
cpi->optimize_seg_arr[i] = 0;
} else {
- cpi->optimize_seg_arr[i] = cpi->optimize_speed_feature;
+ cpi->optimize_seg_arr[i] = cpi->sf.optimize_coefficients;
}
}
cm->coded_lossless = is_coded_lossless(cm, xd);
@@ -5775,7 +5426,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {
cm->delta_q_info.delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
cm->delta_q_info.delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF;
cm->delta_q_info.delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
- // update delta_q_present_flag and delta_lf_present_flag based on base_qindex
+ // update delta_q_present_flag and delta_lf_present_flag based on
+ // base_qindex
cm->delta_q_info.delta_q_present_flag &= cm->base_qindex > 0;
cm->delta_q_info.delta_lf_present_flag &= cm->base_qindex > 0;
@@ -5801,8 +5453,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
aom_clear_system_state();
if (tpl_frame->is_valid)
- cpi->rd.r0 =
- (double)intra_cost_base / (intra_cost_base + mc_dep_cost_base);
+ cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base;
}
av1_frame_init_quantizer(cpi);
@@ -5815,7 +5466,6 @@ static void encode_frame_internal(AV1_COMP *cpi) {
cm->last_frame_seg_map = cm->prev_frame->seg_map;
else
cm->last_frame_seg_map = NULL;
- cm->current_frame_seg_map = cm->cur_frame->seg_map;
if (cm->allow_intrabc || cm->coded_lossless) {
av1_set_default_ref_deltas(cm->lf.ref_deltas);
av1_set_default_mode_deltas(cm->lf.mode_deltas);
@@ -5831,14 +5481,17 @@ static void encode_frame_internal(AV1_COMP *cpi) {
cm->prev_mi = cm->allow_ref_frame_mvs ? cm->prev_mip : NULL;
x->txb_split_count = 0;
+#if CONFIG_SPEED_STATS
+ x->tx_search_count = 0;
+#endif // CONFIG_SPEED_STATS
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_compute_global_motion_time);
+#endif
av1_zero(rdc->global_motion_used);
av1_zero(cpi->gmparams_cost);
-#if !CONFIG_GLOBAL_MOTION_SEARCH
- cpi->global_motion_search_done = 1;
-#endif // !CONFIG_GLOBAL_MOTION_SEARCH
if (cpi->common.current_frame.frame_type == INTER_FRAME && cpi->source &&
- !cpi->global_motion_search_done) {
+ cpi->oxcf.enable_global_motion && !cpi->global_motion_search_done) {
YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
int frame;
double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)];
@@ -5853,7 +5506,9 @@ static void encode_frame_internal(AV1_COMP *cpi) {
int num_refs_using_gm = 0;
for (frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
- ref_buf[frame] = get_ref_frame_buffer(cpi, frame);
+ ref_buf[frame] = NULL;
+ RefCntBuffer *buf = get_ref_frame_buf(cm, frame);
+ if (buf != NULL) ref_buf[frame] = &buf->buf;
int pframe;
cm->global_motion[frame] = default_warp_params;
const WarpedMotionParams *ref_params =
@@ -5872,15 +5527,26 @@ static void encode_frame_internal(AV1_COMP *cpi) {
do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame) &&
!(cpi->sf.selective_ref_gm && skip_gm_frame(cm, frame))) {
TransformationType model;
- const int64_t ref_frame_error =
- av1_frame_error(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
- ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride,
- cpi->source->y_buffer, cpi->source->y_width,
- cpi->source->y_height, cpi->source->y_stride);
+ const int64_t ref_frame_error = av1_frame_error(
+ is_cur_buf_hbd(xd), xd->bd, ref_buf[frame]->y_buffer,
+ ref_buf[frame]->y_stride, cpi->source->y_buffer,
+ cpi->source->y_width, cpi->source->y_height, cpi->source->y_stride);
if (ref_frame_error == 0) continue;
aom_clear_system_state();
+
+ // TODO(sarahparker, debargha): Explore do_adaptive_gm_estimation = 1
+ const int do_adaptive_gm_estimation = 0;
+
+ const int ref_frame_dist = get_relative_dist(
+ &cm->seq_params.order_hint_info, cm->current_frame.order_hint,
+ cm->cur_frame->ref_order_hints[frame - LAST_FRAME]);
+ const GlobalMotionEstimationType gm_estimation_type =
+ cm->seq_params.order_hint_info.enable_order_hint &&
+ abs(ref_frame_dist) <= 2 && do_adaptive_gm_estimation
+ ? GLOBAL_MOTION_DISFLOW_BASED
+ : GLOBAL_MOTION_FEATURE_BASED;
for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
int64_t best_warp_error = INT64_MAX;
// Initially set all params to identity.
@@ -5891,8 +5557,8 @@ static void encode_frame_internal(AV1_COMP *cpi) {
av1_compute_global_motion(model, cpi->source, ref_buf[frame],
cpi->common.seq_params.bit_depth,
- inliers_by_motion, params_by_motion,
- RANSAC_NUM_MOTIONS);
+ gm_estimation_type, inliers_by_motion,
+ params_by_motion, RANSAC_NUM_MOTIONS);
for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
if (inliers_by_motion[i] == 0) continue;
@@ -5902,17 +5568,17 @@ static void encode_frame_internal(AV1_COMP *cpi) {
if (tmp_wm_params.wmtype != IDENTITY) {
const int64_t warp_error = av1_refine_integerized_param(
- &tmp_wm_params, tmp_wm_params.wmtype,
- xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
- ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
+ &tmp_wm_params, tmp_wm_params.wmtype, is_cur_buf_hbd(xd),
+ xd->bd, ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
cpi->source->y_buffer, cpi->source->y_width,
cpi->source->y_height, cpi->source->y_stride, 5,
best_warp_error);
if (warp_error < best_warp_error) {
best_warp_error = warp_error;
- // Save the wm_params modified by av1_refine_integerized_param()
- // rather than motion index to avoid rerunning refine() below.
+ // Save the wm_params modified by
+ // av1_refine_integerized_param() rather than motion index to
+ // avoid rerunning refine() below.
memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
sizeof(WarpedMotionParams));
}
@@ -5956,7 +5622,7 @@ static void encode_frame_internal(AV1_COMP *cpi) {
// clear disabled ref_frames
for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
const int ref_disabled =
- !(cpi->ref_frame_flags & ref_frame_flag_list[frame]);
+ !(cpi->ref_frame_flags & av1_ref_frame_flag_list[frame]);
if (ref_disabled && cpi->sf.recode_loop != DISALLOW_RECODE) {
cpi->gmparams_cost[frame] = 0;
cm->global_motion[frame] = default_warp_params;
@@ -5966,8 +5632,17 @@ static void encode_frame_internal(AV1_COMP *cpi) {
}
memcpy(cm->cur_frame->global_motion, cm->global_motion,
REF_FRAMES * sizeof(WarpedMotionParams));
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_compute_global_motion_time);
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_setup_motion_field_time);
+#endif
av1_setup_motion_field(cm);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_setup_motion_field_time);
+#endif
cpi->all_one_sided_refs =
frame_is_intra_only(cm) ? 0 : av1_refs_are_one_sided(cm);
@@ -5976,16 +5651,6 @@ static void encode_frame_internal(AV1_COMP *cpi) {
check_skip_mode_enabled(cpi);
{
- struct aom_usec_timer emr_timer;
- aom_usec_timer_start(&emr_timer);
-
-#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
- &cpi->twopass.this_frame_mb_stats);
- }
-#endif
-
cpi->row_mt_sync_read_ptr = av1_row_mt_sync_read_dummy;
cpi->row_mt_sync_write_ptr = av1_row_mt_sync_write_dummy;
cpi->row_mt = 0;
@@ -6000,9 +5665,6 @@ static void encode_frame_internal(AV1_COMP *cpi) {
else
encode_tiles(cpi);
}
-
- aom_usec_timer_mark(&emr_timer);
- cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer);
}
// If intrabc is allowed but never selected, reset the allow_intrabc flag.
@@ -6016,21 +5678,7 @@ void av1_encode_frame(AV1_COMP *cpi) {
const int num_planes = av1_num_planes(cm);
// Indicates whether or not to use a default reduced set for ext-tx
// rather than the potential full set of 16 transforms
- cm->reduced_tx_set_used = 0;
-
- if (cm->show_frame == 0) {
- int arf_offset = AOMMIN(
- (MAX_GF_INTERVAL - 1),
- cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]);
- int brf_offset =
- cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index];
- arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
- current_frame->order_hint = current_frame->frame_number + arf_offset;
- } else {
- current_frame->order_hint = current_frame->frame_number;
- }
- current_frame->order_hint %=
- (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
+ cm->reduced_tx_set_used = cpi->oxcf.reduced_tx_type_set;
// Make sure segment_id is no larger than last_active_segid.
if (cm->seg.enabled && cm->seg.update_map) {
@@ -6047,7 +5695,7 @@ void av1_encode_frame(AV1_COMP *cpi) {
}
av1_setup_frame_buf_refs(cm);
- if (cpi->sf.selective_ref_frame >= 3) enforce_max_ref_frames(cpi);
+ enforce_max_ref_frames(cpi);
av1_setup_frame_sign_bias(cm);
#if CONFIG_MISMATCH_DEBUG
@@ -6056,8 +5704,6 @@ void av1_encode_frame(AV1_COMP *cpi) {
(void)num_planes;
#endif
- cpi->allow_comp_inter_inter = !frame_is_intra_only(cm);
-
if (cpi->sf.frame_parameter_update) {
int i;
RD_OPT *const rd_opt = &cpi->rd;
@@ -6079,7 +5725,7 @@ void av1_encode_frame(AV1_COMP *cpi) {
/* prediction (compound, single or hybrid) mode selection */
// NOTE: "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames
- if (is_alt_ref || !cpi->allow_comp_inter_inter)
+ if (is_alt_ref || frame_is_intra_only(cm))
current_frame->reference_mode = SINGLE_REFERENCE;
else
current_frame->reference_mode = REFERENCE_MODE_SELECT;
@@ -6106,7 +5752,8 @@ void av1_encode_frame(AV1_COMP *cpi) {
#endif // CONFIG_ENTROPY_STATS
}
}
- // Re-check on the skip mode status as reference mode may have been changed.
+ // Re-check the skip mode status, as the reference mode may have been
+ // changed.
SkipModeInfo *const skip_mode_info = &current_frame->skip_mode_info;
if (frame_is_intra_only(cm) ||
current_frame->reference_mode == SINGLE_REFERENCE) {
@@ -6287,8 +5934,7 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
const int mi_height = mi_size_high[bsize];
const int is_inter = is_inter_block(mbmi);
- if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
- x->cb_partition_scan) {
+ if (cpi->two_pass_partition_search && x->cb_partition_scan) {
for (int row = mi_row; row < mi_row + mi_width;
row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
for (int col = mi_col; col < mi_col + mi_height;
@@ -6302,8 +5948,15 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
if (stats->ref0_counts[mbmi->ref_frame[0]] < 255)
++stats->ref0_counts[mbmi->ref_frame[0]];
if (mbmi->ref_frame[1] >= 0 &&
- stats->ref1_counts[mbmi->ref_frame[0]] < 255)
+ stats->ref1_counts[mbmi->ref_frame[1]] < 255)
++stats->ref1_counts[mbmi->ref_frame[1]];
+ if (cpi->sf.use_first_partition_pass_interintra_stats) {
+ // Increase the counter for interintra_motion_mode_count
+ if (mbmi->motion_mode == 0 && mbmi->ref_frame[1] == INTRA_FRAME &&
+ stats->interintra_motion_mode_count[mbmi->ref_frame[0]] < 255) {
+ ++stats->interintra_motion_mode_count[mbmi->ref_frame[0]];
+ }
+ }
}
}
}
@@ -6351,15 +6004,19 @@ static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
for (ref = 0; ref < 1 + is_compound; ++ref) {
- YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+ const YV12_BUFFER_CONFIG *cfg =
+ get_ref_frame_yv12_buf(cm, mbmi->ref_frame[ref]);
assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
- &xd->block_refs[ref]->sf, num_planes);
+ xd->block_ref_scale_factors[ref], num_planes);
}
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
- if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ if (mbmi->motion_mode == OBMC_CAUSAL) {
+ assert(cpi->oxcf.enable_obmc == 1);
av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+ }
#if CONFIG_MISMATCH_DEBUG
if (dry_run == OUTPUT_ENABLED) {
diff --git a/libaom/av1/encoder/encodemb.c b/libaom/av1/encoder/encodemb.c
index e0c0370..8e9da61 100644
--- a/libaom/av1/encoder/encodemb.c
+++ b/libaom/av1/encoder/encodemb.c
@@ -43,7 +43,7 @@ static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
const uint8_t *src8, ptrdiff_t src_stride,
const uint8_t *pred8, ptrdiff_t pred_stride) {
if (check_subtract_block_size(rows, cols)) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8,
src_stride, pred8, pred_stride, xd->bd);
return;
@@ -54,7 +54,7 @@ static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
return;
}
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
pred8, pred_stride, xd->bd);
return;
@@ -111,16 +111,15 @@ int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
return eob;
}
- (void)fast_mode;
return av1_optimize_txb_new(cpi, mb, plane, block, tx_size, tx_type, txb_ctx,
- rate_cost, cpi->oxcf.sharpness);
+ rate_cost, cpi->oxcf.sharpness, fast_mode);
}
-typedef enum QUANT_FUNC {
+enum {
QUANT_FUNC_LOWBD = 0,
QUANT_FUNC_HIGHBD = 1,
QUANT_FUNC_TYPES = 2
-} QUANT_FUNC;
+} UENUM1BYTE(QUANT_FUNC);
static AV1_QUANT_FACADE
quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
@@ -163,6 +162,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
qparam.tx_size = tx_size;
qparam.qmatrix = qmatrix;
qparam.iqmatrix = iqmatrix;
+ qparam.use_quant_b_adapt = cm->use_quant_b_adapt;
TxfmParam txfm_param;
txfm_param.tx_type = tx_type;
txfm_param.tx_size = tx_size;
@@ -171,7 +171,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
txfm_param.tx_size, is_inter_block(mbmi), cm->reduced_tx_set_used);
txfm_param.bd = xd->bd;
- txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
+ txfm_param.is_hbd = is_cur_buf_hbd(xd);
av1_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);
@@ -184,7 +184,7 @@ void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
}
}
- // NOTE: optimize_b_following is ture means av1_optimze_b will be called
+ // NOTE: optimize_b_following being true means av1_optimize_b will be called.
// When the condition for doing optimize_b changes,
// this flag needs to be updated simultaneously.
const int optimize_b_following =
@@ -226,13 +226,17 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) {
TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
tx_size, cm->reduced_tx_set_used);
- if (args->enable_optimize_b) {
- av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
- tx_size, tx_type, AV1_XFORM_QUANT_FP);
+ if (args->enable_optimize_b != NO_TRELLIS_OPT) {
+ av1_xform_quant(
+ cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+ USE_B_QUANT_NO_TRELLIS &&
+ (args->enable_optimize_b == FINAL_PASS_TRELLIS_OPT)
+ ? AV1_XFORM_QUANT_B
+ : AV1_XFORM_QUANT_FP);
TXB_CTX txb_ctx;
get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
- av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
- &dummy_rate_cost);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+ args->cpi->sf.trellis_eob_fast, &dummy_rate_cost);
} else {
av1_xform_quant(
cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
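// A minimal standalone sketch (enum names mirror the patch; the exact values
// and the USE_B_QUANT_NO_TRELLIS compile-time toggle are assumptions) of the
// quantizer selection made at the two call sites above:
enum { NO_TRELLIS_OPT, FULL_TRELLIS_OPT, FINAL_PASS_TRELLIS_OPT };
enum { DEMO_XFORM_QUANT_FP, DEMO_XFORM_QUANT_B };
static int choose_xform_quant(int enable_optimize_b, int b_quant_no_trellis) {
  // The B quantizer is chosen only when trellis optimization is limited to
  // the final encoding pass; otherwise the FP quantizer feeds the trellis
  // directly.
  if (b_quant_no_trellis && enable_optimize_b == FINAL_PASS_TRELLIS_OPT)
    return DEMO_XFORM_QUANT_B;
  return DEMO_XFORM_QUANT_FP;
}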
@@ -255,12 +259,12 @@ static void encode_block(int plane, int block, int blk_row, int blk_col,
cm->reduced_tx_set_used);
}
+ // TODO(debargha, jingning): Temporarily disable the txk_type check for the
+ // eob=0 case. It is possible that a collision in the hash index would cause
+ // the assertion failure. To further optimize the rate-distortion
+ // performance, we need to revisit this part and enable this assert again.
if (p->eobs[block] == 0 && plane == 0) {
- // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0
- // case. It is possible that certain collision in hash index would cause
- // the assertion failure. To further optimize the rate-distortion
- // performance, we need to re-visit this part and enable this assert
- // again.
#if 0
if (args->cpi->oxcf.aq_mode == NO_AQ &&
args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
@@ -431,7 +435,7 @@ static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
if (p->eobs[block] > 0) {
txfm_param.bd = xd->bd;
- txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
+ txfm_param.is_hbd = is_cur_buf_hbd(xd);
txfm_param.tx_type = DCT_DCT;
txfm_param.tx_size = tx_size;
txfm_param.eob = p->eobs[block];
@@ -578,13 +582,17 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
const ENTROPY_CONTEXT *a = &args->ta[blk_col];
const ENTROPY_CONTEXT *l = &args->tl[blk_row];
- if (args->enable_optimize_b) {
- av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
- tx_size, tx_type, AV1_XFORM_QUANT_FP);
+ if (args->enable_optimize_b != NO_TRELLIS_OPT) {
+ av1_xform_quant(
+ cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+ USE_B_QUANT_NO_TRELLIS &&
+ (args->enable_optimize_b == FINAL_PASS_TRELLIS_OPT)
+ ? AV1_XFORM_QUANT_B
+ : AV1_XFORM_QUANT_FP);
TXB_CTX txb_ctx;
get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
- av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
- &dummy_rate_cost);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx,
+ args->cpi->sf.trellis_eob_fast, &dummy_rate_cost);
} else {
av1_xform_quant(
cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
@@ -597,12 +605,12 @@ void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
dst_stride, *eob, cm->reduced_tx_set_used);
}
+ // TODO(jingning): Temporarily disable the txk_type check for the eob=0
+ // case. It is possible that a collision in the hash index would cause
+ // the assertion failure. To further optimize the rate-distortion
+ // performance, we need to revisit this part and enable this assert again.
if (*eob == 0 && plane == 0) {
- // TODO(jingning): Temporarily disable txk_type check for eob=0 case.
- // It is possible that certain collision in hash index would cause
- // the assertion failure. To further optimize the rate-distortion
- // performance, we need to re-visit this part and enable this assert
- // again.
#if 0
if (args->cpi->oxcf.aq_mode == NO_AQ
&& args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
diff --git a/libaom/av1/encoder/encodemb.h b/libaom/av1/encoder/encodemb.h
index 39080de..d4394cf 100644
--- a/libaom/av1/encoder/encodemb.h
+++ b/libaom/av1/encoder/encodemb.h
@@ -37,13 +37,13 @@ struct encode_b_args {
int8_t enable_optimize_b;
};
-typedef enum AV1_XFORM_QUANT {
+enum {
AV1_XFORM_QUANT_FP = 0,
AV1_XFORM_QUANT_B = 1,
AV1_XFORM_QUANT_DC = 2,
AV1_XFORM_QUANT_SKIP_QUANT,
AV1_XFORM_QUANT_TYPES,
-} AV1_XFORM_QUANT;
+} UENUM1BYTE(AV1_XFORM_QUANT);
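// A sketch (the macro body below is an assumption mirroring the usual AV1
// enums-header pattern, not quoted from this patch) of what UENUM1BYTE
// accomplishes: it pins the enum's storage to a single byte.
#include <assert.h>
#include <stdint.h>
#define UENUM1BYTE(enumvar) ; typedef uint8_t enumvar
enum { DEMO_A = 0, DEMO_B = 1 } UENUM1BYTE(DEMO_ENUM);
int main(void) {
  DEMO_ENUM v = DEMO_B;
  assert(sizeof(v) == 1);  // one byte instead of a full int
  return 0;
}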
void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
int mi_row, int mi_col, RUN_TYPE dry_run);
diff --git a/libaom/av1/encoder/encoder.c b/libaom/av1/encoder/encoder.c
index 7652029..818e43c 100644
--- a/libaom/av1/encoder/encoder.c
+++ b/libaom/av1/encoder/encoder.c
@@ -33,9 +33,9 @@
#include "aom_ports/mem.h"
#include "aom_ports/system_state.h"
#include "aom_scale/aom_scale.h"
-#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#if CONFIG_BITSTREAM_DEBUG
#include "aom_util/debug_util.h"
-#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#endif // CONFIG_BITSTREAM_DEBUG
#include "av1/common/alloccommon.h"
#include "av1/common/cdef.h"
@@ -54,6 +54,7 @@
#include "av1/encoder/context_tree.h"
#include "av1/encoder/encodeframe.h"
#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/encodetxb.h"
#include "av1/encoder/ethread.h"
@@ -61,6 +62,7 @@
#include "av1/encoder/grain_test_vectors.h"
#include "av1/encoder/hash_motion.h"
#include "av1/encoder/mbgraph.h"
+#include "av1/encoder/pass2_strategy.h"
#include "av1/encoder/picklpf.h"
#include "av1/encoder/pickrst.h"
#include "av1/encoder/random.h"
@@ -69,14 +71,11 @@
#include "av1/encoder/rdopt.h"
#include "av1/encoder/segmentation.h"
#include "av1/encoder/speed_features.h"
-#include "av1/encoder/temporal_filter.h"
#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/var_based_part.h"
#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
-// av1 uses 10,000,000 ticks/second as time stamp
-#define TICKS_PER_SEC 10000000LL
-
#if CONFIG_ENTROPY_STATS
FRAME_COUNTS aggregate_fc;
#endif // CONFIG_ENTROPY_STATS
@@ -100,30 +99,6 @@ FILE *yuv_rec_file;
#define FILE_NAME_LEN 100
#endif
-// Estimate if the source frame is screen content, based on the portion of
-// blocks that have no more than 4 (experimentally selected) luma colors.
-static int is_screen_content(const uint8_t *src, int use_hbd, int bd,
- int stride, int width, int height) {
- assert(src != NULL);
- int counts = 0;
- const int blk_w = 16;
- const int blk_h = 16;
- const int limit = 4;
- for (int r = 0; r + blk_h <= height; r += blk_h) {
- for (int c = 0; c + blk_w <= width; c += blk_w) {
- int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
- const int n_colors =
- use_hbd ? av1_count_colors_highbd(src + r * stride + c, stride, blk_w,
- blk_h, bd, count_buf)
- : av1_count_colors(src + r * stride + c, stride, blk_w, blk_h,
- count_buf);
- if (n_colors > 1 && n_colors <= limit) counts++;
- }
- }
- // The threshold is 10%.
- return counts * blk_h * blk_w * 10 > width * height;
-}
-
static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
switch (mode) {
case NORMAL:
@@ -269,7 +244,7 @@ int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
// by calculating the 16x4 Horizontal DCT. This is to be used to
// decide the superresolution parameters.
void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
- uint64_t freq_energy[8] = { 0 };
+ uint64_t freq_energy[16] = { 0 };
const YV12_BUFFER_CONFIG *buf = cpi->source;
const int bd = cpi->td.mb.e_mbd.bd;
const int width = buf->y_crop_width;
@@ -283,14 +258,13 @@ void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
for (int j = 0; j < width - 16; j += 16) {
av1_fwd_txfm2d_16x4(src16 + i * buf->y_stride + j, coeff, buf->y_stride,
H_DCT, bd);
- for (int k = 8; k < 16; ++k) {
+ for (int k = 1; k < 16; ++k) {
const uint64_t this_energy =
((int64_t)coeff[k] * coeff[k]) +
((int64_t)coeff[k + 16] * coeff[k + 16]) +
((int64_t)coeff[k + 32] * coeff[k + 32]) +
((int64_t)coeff[k + 48] * coeff[k + 48]);
- freq_energy[k - 8] +=
- ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
+ freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2 + 2 * (bd - 8));
}
n++;
}
@@ -305,24 +279,24 @@ void analyze_hor_freq(const AV1_COMP *cpi, double *energy) {
src16[ii * 16 + jj] =
buf->y_buffer[(i + ii) * buf->y_stride + (j + jj)];
av1_fwd_txfm2d_16x4(src16, coeff, 16, H_DCT, bd);
- for (int k = 8; k < 16; ++k) {
+ for (int k = 1; k < 16; ++k) {
const uint64_t this_energy =
((int64_t)coeff[k] * coeff[k]) +
((int64_t)coeff[k + 16] * coeff[k + 16]) +
((int64_t)coeff[k + 32] * coeff[k + 32]) +
((int64_t)coeff[k + 48] * coeff[k + 48]);
- freq_energy[k - 8] += ROUND_POWER_OF_TWO(this_energy, 2);
+ freq_energy[k] += ROUND_POWER_OF_TWO(this_energy, 2);
}
n++;
}
}
}
if (n) {
- for (int k = 0; k < 8; ++k) energy[k] = (double)freq_energy[k] / n;
+ for (int k = 1; k < 16; ++k) energy[k] = (double)freq_energy[k] / n;
// Convert to cumulative energy
- for (int k = 6; k >= 0; --k) energy[k] += energy[k + 1];
+ for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
} else {
- for (int k = 0; k < 8; ++k) energy[k] = 1e+20;
+ for (int k = 1; k < 16; ++k) energy[k] = 1e+20;
}
}
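
After this change the function fills per-frequency averages for indices 1..15 instead of only the top eight, then folds them into tail sums. A minimal standalone sketch of the cumulative step (the toy values and the final ratio are illustrative assumptions, not from the source):

    #include <stdio.h>

    int main(void) {
      /* Toy per-frequency averages for k = 1..15; index 0 is unused, as in
       * the patched analyze_hor_freq(). */
      double energy[16] = { 0, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1 };
      /* Same loop as above: afterwards energy[k] holds the total energy at
       * horizontal frequencies >= k. */
      for (int k = 14; k > 0; --k) energy[k] += energy[k + 1];
      /* A superres decision can then weigh the high-frequency tail against
       * the overall energy. */
      printf("tail/total = %.3f\n", energy[8] / energy[1]);
      return 0;
    }
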
@@ -358,6 +332,9 @@ static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
// When superres / resize is on, 'cm->width / height' can change between
// calls, so we don't apply this heuristic there. Also, this heuristic gives
// compression gain for speed >= 2 only.
+  // Things break if the superblock size changes per-frame, which is why
+  // this heuristic is set based on the configured speed rather than the
+  // actual speed features (which may change per-frame in the future).
if (cpi->oxcf.superres_mode == SUPERRES_NONE &&
cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 2) {
return (cm->width >= 480 && cm->height >= 360) ? BLOCK_128X128
@@ -375,64 +352,28 @@ static void setup_frame(AV1_COMP *cpi) {
// other inter-frames the encoder currently uses only two contexts;
// context 1 for ALTREF frames and context 0 for the others.
- cm->primary_ref_frame = PRIMARY_REF_NONE;
if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
- cm->force_primary_ref_none) {
+ cpi->ext_use_primary_ref_none) {
av1_setup_past_independence(cm);
- for (int i = 0; i < REF_FRAMES; i++) {
- cm->fb_of_context_type[i] = -1;
- }
- cm->fb_of_context_type[REGULAR_FRAME] =
- cm->show_frame ? get_ref_frame_map_idx(cpi, GOLDEN_FRAME)
- : get_ref_frame_map_idx(cpi, ALTREF_FRAME);
- cm->frame_context_idx = REGULAR_FRAME;
- } else {
- const GF_GROUP *gf_group = &cpi->twopass.gf_group;
- if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE)
- cm->frame_context_idx = EXT_ARF_FRAME;
- else if (cpi->refresh_alt_ref_frame)
- cm->frame_context_idx = ARF_FRAME;
- else if (cpi->rc.is_src_frame_alt_ref)
- cm->frame_context_idx = OVERLAY_FRAME;
- else if (cpi->refresh_golden_frame)
- cm->frame_context_idx = GLD_FRAME;
- else if (cpi->refresh_bwd_ref_frame)
- cm->frame_context_idx = BRF_FRAME;
- else
- cm->frame_context_idx = REGULAR_FRAME;
- int wanted_fb = cm->fb_of_context_type[cm->frame_context_idx];
- for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
- int fb = get_ref_frame_map_idx(cpi, ref_frame);
- if (fb == wanted_fb) {
- cm->primary_ref_frame = ref_frame - LAST_FRAME;
- }
- }
}
if (cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) {
- cpi->refresh_golden_frame = 1;
- cpi->refresh_alt_ref_frame = 1;
- av1_zero(cpi->interp_filter_selected);
set_sb_size(&cm->seq_params, select_sb_size(cpi));
} else if (frame_is_sframe(cm)) {
- cpi->refresh_golden_frame = 1;
- cpi->refresh_alt_ref_frame = 1;
- av1_zero(cpi->interp_filter_selected);
set_sb_size(&cm->seq_params, select_sb_size(cpi));
} else {
- if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
- cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) {
+ const RefCntBuffer *const primary_ref_buf = get_primary_ref_frame_buf(cm);
+ if (primary_ref_buf == NULL) {
av1_setup_past_independence(cm);
cm->seg.update_map = 1;
cm->seg.update_data = 1;
} else {
- *cm->fc = cm->current_frame.frame_refs[cm->primary_ref_frame]
- .buf->frame_context;
+ *cm->fc = primary_ref_buf->frame_context;
}
- av1_zero(cpi->interp_filter_selected[0]);
}
- cm->prev_frame = get_prev_frame(cm);
+ av1_zero(cm->cur_frame->interp_filter_selected);
+ cm->prev_frame = get_primary_ref_frame_buf(cm);
cpi->vaq_refresh = 0;
}
@@ -526,6 +467,20 @@ static void alloc_context_buffers_ext(AV1_COMP *cpi) {
aom_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
}
+static void reset_film_grain_chroma_params(aom_film_grain_t *pars) {
+ pars->num_cr_points = 0;
+ pars->cr_mult = 0;
+ pars->cr_luma_mult = 0;
+ memset(pars->scaling_points_cr, 0, sizeof(pars->scaling_points_cr));
+ memset(pars->ar_coeffs_cr, 0, sizeof(pars->ar_coeffs_cr));
+ pars->num_cb_points = 0;
+ pars->cb_mult = 0;
+ pars->cb_luma_mult = 0;
+ pars->chroma_scaling_from_luma = 0;
+ memset(pars->scaling_points_cb, 0, sizeof(pars->scaling_points_cb));
+ memset(pars->ar_coeffs_cb, 0, sizeof(pars->ar_coeffs_cb));
+}
+
static void update_film_grain_parameters(struct AV1_COMP *cpi,
const AV1EncoderConfig *oxcf) {
AV1_COMMON *const cm = &cpi->common;
@@ -543,20 +498,27 @@ static void update_film_grain_parameters(struct AV1_COMP *cpi,
memcpy(&cm->film_grain_params,
film_grain_test_vectors + oxcf->film_grain_test_vector - 1,
sizeof(cm->film_grain_params));
-
+ if (oxcf->monochrome)
+ reset_film_grain_chroma_params(&cm->film_grain_params);
cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
cm->film_grain_params.clip_to_restricted_range = 0;
}
}
} else if (oxcf->film_grain_table_filename) {
+ cm->seq_params.film_grain_params_present = 1;
+
cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
memset(cpi->film_grain_table, 0, sizeof(aom_film_grain_table_t));
aom_film_grain_table_read(cpi->film_grain_table,
oxcf->film_grain_table_filename, &cm->error);
} else {
+#if CONFIG_DENOISE
+ cm->seq_params.film_grain_params_present = (cpi->oxcf.noise_level > 0);
+#else
cm->seq_params.film_grain_params_present = 0;
+#endif
memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
}
}
@@ -589,10 +551,8 @@ static void dealloc_compressor_data(AV1_COMP *cpi) {
aom_free(cpi->td.mb.wsrc_buf);
cpi->td.mb.wsrc_buf = NULL;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
aom_free(cpi->td.mb.inter_modes_info);
cpi->td.mb.inter_modes_info = NULL;
-#endif
for (int i = 0; i < 2; i++)
for (int j = 0; j < 2; j++) {
@@ -809,7 +769,7 @@ static void configure_static_seg_features(AV1_COMP *cpi) {
static void update_reference_segmentation_map(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
MB_MODE_INFO **mi_4x4_ptr = cm->mi_grid_visible;
- uint8_t *cache_ptr = cm->current_frame_seg_map;
+ uint8_t *cache_ptr = cm->cur_frame->seg_map;
int row, col;
for (row = 0; row < cm->mi_rows; row++) {
@@ -827,11 +787,13 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
const SequenceHeader *const seq_params = &cm->seq_params;
const AV1EncoderConfig *oxcf = &cpi->oxcf;
- if (!cpi->lookahead)
- cpi->lookahead =
- av1_lookahead_init(oxcf->width, oxcf->height, seq_params->subsampling_x,
- seq_params->subsampling_y,
- seq_params->use_highbitdepth, oxcf->lag_in_frames);
+ if (!cpi->lookahead) {
+ int is_scale = (oxcf->resize_mode || oxcf->superres_mode);
+ cpi->lookahead = av1_lookahead_init(
+ oxcf->width, oxcf->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ oxcf->lag_in_frames, oxcf->border_in_pixels, is_scale);
+ }
if (!cpi->lookahead)
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate lag buffers");
@@ -840,7 +802,7 @@ static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
if (aom_realloc_frame_buffer(
&cpi->alt_ref_buffer, oxcf->width, oxcf->height,
seq_params->subsampling_x, seq_params->subsampling_y,
- seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ seq_params->use_highbitdepth, oxcf->border_in_pixels,
cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate altref buffer");
@@ -852,7 +814,7 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) {
if (aom_realloc_frame_buffer(
&cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x,
seq_params->subsampling_y, seq_params->use_highbitdepth,
- AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+ cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate last frame buffer");
@@ -860,21 +822,21 @@ static void alloc_util_frame_buffers(AV1_COMP *cpi) {
&cpi->trial_frame_rst, cm->superres_upscaled_width,
cm->superres_upscaled_height, seq_params->subsampling_x,
seq_params->subsampling_y, seq_params->use_highbitdepth,
- AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+ AOM_RESTORATION_FRAME_BORDER, cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate trial restored frame buffer");
if (aom_realloc_frame_buffer(
&cpi->scaled_source, cm->width, cm->height, seq_params->subsampling_x,
seq_params->subsampling_y, seq_params->use_highbitdepth,
- AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+ cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate scaled source buffer");
if (aom_realloc_frame_buffer(
&cpi->scaled_last_source, cm->width, cm->height,
seq_params->subsampling_x, seq_params->subsampling_y,
- seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate scaled last source buffer");
@@ -978,10 +940,9 @@ static void update_frame_size(AV1_COMP *cpi) {
static void init_buffer_indices(AV1_COMP *cpi) {
int fb_idx;
for (fb_idx = 0; fb_idx < REF_FRAMES; ++fb_idx)
- cpi->remapped_ref_idx[fb_idx] = fb_idx;
+ cpi->common.remapped_ref_idx[fb_idx] = fb_idx;
cpi->rate_index = 0;
cpi->rate_size = 0;
- cpi->cur_poc = -1;
}
static INLINE int does_level_match(int width, int height, double fps,
@@ -1003,77 +964,58 @@ static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm,
// and max display sample rates.
// Need to add checks for max bit rate, max decoded luma sample rate, header
// rate, etc. that are not covered by this function.
- (void)oxcf;
- BitstreamLevel bl = { 9, 3 };
+ AV1_LEVEL level = SEQ_LEVEL_MAX;
if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate, 512,
288, 30.0, 4)) {
- bl.major = 2;
- bl.minor = 0;
+ level = SEQ_LEVEL_2_0;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
704, 396, 30.0, 4)) {
- bl.major = 2;
- bl.minor = 1;
+ level = SEQ_LEVEL_2_1;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
1088, 612, 30.0, 4)) {
- bl.major = 3;
- bl.minor = 0;
+ level = SEQ_LEVEL_3_0;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
1376, 774, 30.0, 4)) {
- bl.major = 3;
- bl.minor = 1;
+ level = SEQ_LEVEL_3_1;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
2048, 1152, 30.0, 3)) {
- bl.major = 4;
- bl.minor = 0;
+ level = SEQ_LEVEL_4_0;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
2048, 1152, 60.0, 3)) {
- bl.major = 4;
- bl.minor = 1;
+ level = SEQ_LEVEL_4_1;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
4096, 2176, 30.0, 2)) {
- bl.major = 5;
- bl.minor = 0;
+ level = SEQ_LEVEL_5_0;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
4096, 2176, 60.0, 2)) {
- bl.major = 5;
- bl.minor = 1;
+ level = SEQ_LEVEL_5_1;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
4096, 2176, 120.0, 2)) {
- bl.major = 5;
- bl.minor = 2;
+ level = SEQ_LEVEL_5_2;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
8192, 4352, 30.0, 2)) {
- bl.major = 6;
- bl.minor = 0;
+ level = SEQ_LEVEL_6_0;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
8192, 4352, 60.0, 2)) {
-    bl.major = 6;
-    bl.minor = 1;
+    level = SEQ_LEVEL_6_1;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
8192, 4352, 120.0, 2)) {
- bl.major = 6;
- bl.minor = 2;
+ level = SEQ_LEVEL_6_2;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
16384, 8704, 30.0, 2)) {
- bl.major = 7;
- bl.minor = 0;
+ level = SEQ_LEVEL_7_0;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
16384, 8704, 60.0, 2)) {
- bl.major = 7;
- bl.minor = 1;
+ level = SEQ_LEVEL_7_1;
} else if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
16384, 8704, 120.0, 2)) {
- bl.major = 7;
- bl.minor = 2;
+ level = SEQ_LEVEL_7_2;
}
for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
- seq->level[i] = bl;
- seq->tier[i] = 0; // setting main tier by default
+ seq->seq_level_idx[i] = level;
// Set the maximum parameters for bitrate and buffer size for this profile,
// level, and tier
cm->op_params[i].bitrate = max_level_bitrate(
- cm->seq_params.profile, major_minor_to_seq_level_idx(seq->level[i]),
- seq->tier[i]);
+ cm->seq_params.profile, seq->seq_level_idx[i], seq->tier[i]);
// Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass the
// check
if (cm->op_params[i].bitrate == 0)
@@ -1106,9 +1048,24 @@ static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
? DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1
: -1;
+ seq->max_frame_width =
+ oxcf->forced_max_frame_width ? oxcf->forced_max_frame_width : oxcf->width;
+ seq->max_frame_height = oxcf->forced_max_frame_height
+ ? oxcf->forced_max_frame_height
+ : oxcf->height;
+ seq->num_bits_width =
+ (seq->max_frame_width > 1) ? get_msb(seq->max_frame_width - 1) + 1 : 1;
+ seq->num_bits_height =
+ (seq->max_frame_height > 1) ? get_msb(seq->max_frame_height - 1) + 1 : 1;
+ assert(seq->num_bits_width <= 16);
+ assert(seq->num_bits_height <= 16);
+
+ seq->frame_id_length = FRAME_ID_LENGTH;
+ seq->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
+
seq->enable_dual_filter = oxcf->enable_dual_filter;
- seq->order_hint_info.enable_jnt_comp = oxcf->enable_jnt_comp;
- seq->order_hint_info.enable_jnt_comp &=
+ seq->order_hint_info.enable_dist_wtd_comp = oxcf->enable_dist_wtd_comp;
+ seq->order_hint_info.enable_dist_wtd_comp &=
seq->order_hint_info.enable_order_hint;
seq->order_hint_info.enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs;
seq->order_hint_info.enable_ref_frame_mvs &=
@@ -1117,10 +1074,10 @@ static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
seq->enable_cdef = oxcf->enable_cdef;
seq->enable_restoration = oxcf->enable_restoration;
seq->enable_warped_motion = oxcf->enable_warped_motion;
- seq->enable_interintra_compound = 1;
- seq->enable_masked_compound = 1;
- seq->enable_intra_edge_filter = 1;
- seq->enable_filter_intra = 1;
+ seq->enable_interintra_compound = oxcf->enable_interintra_comp;
+ seq->enable_masked_compound = oxcf->enable_masked_comp;
+ seq->enable_intra_edge_filter = oxcf->enable_intra_edge_filter;
+ seq->enable_filter_intra = oxcf->enable_filter_intra;
set_bitstream_level_tier(seq, cm, oxcf);
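
The new num_bits_width / num_bits_height values size the frame-dimension syntax elements, which code width - 1 and height - 1. A quick worked example (the 1920-wide maximum is illustrative):

    /* max_frame_width = 1920: get_msb(1920 - 1) = get_msb(1919) = 10, so
     * num_bits_width = 11 (1024 <= 1919 < 2048). A 2048-wide maximum still
     * fits in 11 bits because the coded value is 2047; 2049 would need 12. */
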
@@ -1317,14 +1274,14 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
static unsigned int fnname##_bits8( \
const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred, \
- const JNT_COMP_PARAMS *jcp_param) { \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
jcp_param); \
} \
static unsigned int fnname##_bits10( \
const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred, \
- const JNT_COMP_PARAMS *jcp_param) { \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
jcp_param) >> \
2; \
@@ -1332,7 +1289,7 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
static unsigned int fnname##_bits12( \
const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred, \
- const JNT_COMP_PARAMS *jcp_param) { \
+ const DIST_WTD_COMP_PARAMS *jcp_param) { \
return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred, \
jcp_param) >> \
4; \
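
These wrappers rescale high-bit-depth SADs to an 8-bit working range before the motion search compares them. The shifts follow from the sample range (a sanity check, not source text):

    /* A per-pixel |src - ref| is at most 255 at 8 bits, 1023 at 10 bits and
     * 4095 at 12 bits, i.e. roughly 4x and 16x the 8-bit range. Dividing the
     * 10- and 12-bit SADs by 4 (>> 2) and 16 (>> 4) keeps all bit depths on
     * a comparable scale. */
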
@@ -1406,28 +1363,28 @@ MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x128_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x128_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x16_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x4_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x32_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x8_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x64_avg)
-MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad128x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad4x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad8x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad32x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad16x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_dist_wtd_sad64x16_avg)
#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
cpi->fn_ptr[BT].msdf = MCSDF; \
@@ -1536,166 +1493,167 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
aom_highbd_8_sub_pixel_variance64x16,
aom_highbd_8_sub_pixel_avg_variance64x16,
aom_highbd_sad64x16x4d_bits8,
- aom_highbd_jnt_sad64x16_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance64x16)
+ aom_highbd_dist_wtd_sad64x16_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x16)
HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8,
aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64,
aom_highbd_8_sub_pixel_variance16x64,
aom_highbd_8_sub_pixel_avg_variance16x64,
aom_highbd_sad16x64x4d_bits8,
- aom_highbd_jnt_sad16x64_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance16x64)
+ aom_highbd_dist_wtd_sad16x64_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x64)
HIGHBD_BFP(
BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8,
aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8,
aom_highbd_8_sub_pixel_avg_variance32x8,
- aom_highbd_sad32x8x4d_bits8, aom_highbd_jnt_sad32x8_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance32x8)
+ aom_highbd_sad32x8x4d_bits8, aom_highbd_dist_wtd_sad32x8_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x8)
HIGHBD_BFP(
BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8,
aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32,
aom_highbd_8_sub_pixel_avg_variance8x32,
- aom_highbd_sad8x32x4d_bits8, aom_highbd_jnt_sad8x32_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance8x32)
+ aom_highbd_sad8x32x4d_bits8, aom_highbd_dist_wtd_sad8x32_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x32)
HIGHBD_BFP(
BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8,
aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4,
aom_highbd_8_sub_pixel_avg_variance16x4,
- aom_highbd_sad16x4x4d_bits8, aom_highbd_jnt_sad16x4_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance16x4)
+ aom_highbd_sad16x4x4d_bits8, aom_highbd_dist_wtd_sad16x4_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x4)
HIGHBD_BFP(
BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8,
aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16,
aom_highbd_8_sub_pixel_avg_variance4x16,
- aom_highbd_sad4x16x4d_bits8, aom_highbd_jnt_sad4x16_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance4x16)
+ aom_highbd_sad4x16x4d_bits8, aom_highbd_dist_wtd_sad4x16_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x16)
HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8,
aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16,
aom_highbd_8_sub_pixel_variance32x16,
aom_highbd_8_sub_pixel_avg_variance32x16,
aom_highbd_sad32x16x4d_bits8,
- aom_highbd_jnt_sad32x16_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance32x16)
+ aom_highbd_dist_wtd_sad32x16_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x16)
HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8,
aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32,
aom_highbd_8_sub_pixel_variance16x32,
aom_highbd_8_sub_pixel_avg_variance16x32,
aom_highbd_sad16x32x4d_bits8,
- aom_highbd_jnt_sad16x32_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance16x32)
+ aom_highbd_dist_wtd_sad16x32_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x32)
HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8,
aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32,
aom_highbd_8_sub_pixel_variance64x32,
aom_highbd_8_sub_pixel_avg_variance64x32,
aom_highbd_sad64x32x4d_bits8,
- aom_highbd_jnt_sad64x32_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance64x32)
+ aom_highbd_dist_wtd_sad64x32_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x32)
HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8,
aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64,
aom_highbd_8_sub_pixel_variance32x64,
aom_highbd_8_sub_pixel_avg_variance32x64,
aom_highbd_sad32x64x4d_bits8,
- aom_highbd_jnt_sad32x64_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance32x64)
+ aom_highbd_dist_wtd_sad32x64_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x64)
HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8,
aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32,
aom_highbd_8_sub_pixel_variance32x32,
aom_highbd_8_sub_pixel_avg_variance32x32,
aom_highbd_sad32x32x4d_bits8,
- aom_highbd_jnt_sad32x32_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance32x32)
+ aom_highbd_dist_wtd_sad32x32_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance32x32)
HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8,
aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64,
aom_highbd_8_sub_pixel_variance64x64,
aom_highbd_8_sub_pixel_avg_variance64x64,
aom_highbd_sad64x64x4d_bits8,
- aom_highbd_jnt_sad64x64_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance64x64)
+ aom_highbd_dist_wtd_sad64x64_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x64)
HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8,
aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16,
aom_highbd_8_sub_pixel_variance16x16,
aom_highbd_8_sub_pixel_avg_variance16x16,
aom_highbd_sad16x16x4d_bits8,
- aom_highbd_jnt_sad16x16_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance16x16)
+ aom_highbd_dist_wtd_sad16x16_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x16)
HIGHBD_BFP(
BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8,
aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8,
aom_highbd_8_sub_pixel_avg_variance16x8,
- aom_highbd_sad16x8x4d_bits8, aom_highbd_jnt_sad16x8_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance16x8)
+ aom_highbd_sad16x8x4d_bits8, aom_highbd_dist_wtd_sad16x8_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance16x8)
HIGHBD_BFP(
BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8,
aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16,
aom_highbd_8_sub_pixel_avg_variance8x16,
- aom_highbd_sad8x16x4d_bits8, aom_highbd_jnt_sad8x16_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance8x16)
-
- HIGHBD_BFP(BLOCK_8X8, aom_highbd_sad8x8_bits8,
- aom_highbd_sad8x8_avg_bits8, aom_highbd_8_variance8x8,
- aom_highbd_8_sub_pixel_variance8x8,
- aom_highbd_8_sub_pixel_avg_variance8x8,
- aom_highbd_sad8x8x4d_bits8, aom_highbd_jnt_sad8x8_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance8x8)
-
- HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits8,
- aom_highbd_sad8x4_avg_bits8, aom_highbd_8_variance8x4,
- aom_highbd_8_sub_pixel_variance8x4,
- aom_highbd_8_sub_pixel_avg_variance8x4,
- aom_highbd_sad8x4x4d_bits8, aom_highbd_jnt_sad8x4_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance8x4)
-
- HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits8,
- aom_highbd_sad4x8_avg_bits8, aom_highbd_8_variance4x8,
- aom_highbd_8_sub_pixel_variance4x8,
- aom_highbd_8_sub_pixel_avg_variance4x8,
- aom_highbd_sad4x8x4d_bits8, aom_highbd_jnt_sad4x8_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance4x8)
-
- HIGHBD_BFP(BLOCK_4X4, aom_highbd_sad4x4_bits8,
- aom_highbd_sad4x4_avg_bits8, aom_highbd_8_variance4x4,
- aom_highbd_8_sub_pixel_variance4x4,
- aom_highbd_8_sub_pixel_avg_variance4x4,
- aom_highbd_sad4x4x4d_bits8, aom_highbd_jnt_sad4x4_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance4x4)
+ aom_highbd_sad8x16x4d_bits8, aom_highbd_dist_wtd_sad8x16_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x16)
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits8, aom_highbd_sad8x8_avg_bits8,
+ aom_highbd_8_variance8x8, aom_highbd_8_sub_pixel_variance8x8,
+ aom_highbd_8_sub_pixel_avg_variance8x8, aom_highbd_sad8x8x4d_bits8,
+ aom_highbd_dist_wtd_sad8x8_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x8)
+
+ HIGHBD_BFP(
+ BLOCK_8X4, aom_highbd_sad8x4_bits8, aom_highbd_sad8x4_avg_bits8,
+ aom_highbd_8_variance8x4, aom_highbd_8_sub_pixel_variance8x4,
+ aom_highbd_8_sub_pixel_avg_variance8x4, aom_highbd_sad8x4x4d_bits8,
+ aom_highbd_dist_wtd_sad8x4_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance8x4)
+
+ HIGHBD_BFP(
+ BLOCK_4X8, aom_highbd_sad4x8_bits8, aom_highbd_sad4x8_avg_bits8,
+ aom_highbd_8_variance4x8, aom_highbd_8_sub_pixel_variance4x8,
+ aom_highbd_8_sub_pixel_avg_variance4x8, aom_highbd_sad4x8x4d_bits8,
+ aom_highbd_dist_wtd_sad4x8_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x8)
HIGHBD_BFP(
- BLOCK_128X128, aom_highbd_sad128x128_bits8,
- aom_highbd_sad128x128_avg_bits8, aom_highbd_8_variance128x128,
- aom_highbd_8_sub_pixel_variance128x128,
- aom_highbd_8_sub_pixel_avg_variance128x128,
- aom_highbd_sad128x128x4d_bits8, aom_highbd_jnt_sad128x128_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance128x128)
+ BLOCK_4X4, aom_highbd_sad4x4_bits8, aom_highbd_sad4x4_avg_bits8,
+ aom_highbd_8_variance4x4, aom_highbd_8_sub_pixel_variance4x4,
+ aom_highbd_8_sub_pixel_avg_variance4x4, aom_highbd_sad4x4x4d_bits8,
+ aom_highbd_dist_wtd_sad4x4_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance4x4)
+
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits8,
+ aom_highbd_sad128x128_avg_bits8,
+ aom_highbd_8_variance128x128,
+ aom_highbd_8_sub_pixel_variance128x128,
+ aom_highbd_8_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x4d_bits8,
+ aom_highbd_dist_wtd_sad128x128_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x128)
HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8,
aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64,
aom_highbd_8_sub_pixel_variance128x64,
aom_highbd_8_sub_pixel_avg_variance128x64,
aom_highbd_sad128x64x4d_bits8,
- aom_highbd_jnt_sad128x64_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance128x64)
+ aom_highbd_dist_wtd_sad128x64_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance128x64)
HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8,
aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128,
aom_highbd_8_sub_pixel_variance64x128,
aom_highbd_8_sub_pixel_avg_variance64x128,
aom_highbd_sad64x128x4d_bits8,
- aom_highbd_jnt_sad64x128_avg_bits8,
- aom_highbd_8_jnt_sub_pixel_avg_variance64x128)
+ aom_highbd_dist_wtd_sad64x128_avg_bits8,
+ aom_highbd_8_dist_wtd_sub_pixel_avg_variance64x128)
HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
aom_highbd_8_masked_sub_pixel_variance128x128)
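
The bulk of these hunks is a mechanical rename: the "jnt" (joint compound) identifiers become "dist_wtd", matching the distance-weighted compound prediction naming in the AV1 specification. The pattern is uniform across types, wrappers and function tables, e.g.:

    /* before                              after */
    JNT_COMP_PARAMS                    ->  DIST_WTD_COMP_PARAMS
    aom_highbd_jnt_sad16x16_avg_bits10 ->  aom_highbd_dist_wtd_sad16x16_avg_bits10
    aom_jnt_sub_pixel_avg_variance8x8  ->  aom_dist_wtd_sub_pixel_avg_variance8x8
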
@@ -1815,148 +1773,148 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
aom_highbd_10_sub_pixel_variance64x16,
aom_highbd_10_sub_pixel_avg_variance64x16,
aom_highbd_sad64x16x4d_bits10,
- aom_highbd_jnt_sad64x16_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance64x16);
+ aom_highbd_dist_wtd_sad64x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x16);
HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10,
aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64,
aom_highbd_10_sub_pixel_variance16x64,
aom_highbd_10_sub_pixel_avg_variance16x64,
aom_highbd_sad16x64x4d_bits10,
- aom_highbd_jnt_sad16x64_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance16x64);
+ aom_highbd_dist_wtd_sad16x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x64);
HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10,
aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8,
aom_highbd_10_sub_pixel_variance32x8,
aom_highbd_10_sub_pixel_avg_variance32x8,
aom_highbd_sad32x8x4d_bits10,
- aom_highbd_jnt_sad32x8_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance32x8);
+ aom_highbd_dist_wtd_sad32x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x8);
HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10,
aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32,
aom_highbd_10_sub_pixel_variance8x32,
aom_highbd_10_sub_pixel_avg_variance8x32,
aom_highbd_sad8x32x4d_bits10,
- aom_highbd_jnt_sad8x32_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance8x32);
+ aom_highbd_dist_wtd_sad8x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x32);
HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10,
aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4,
aom_highbd_10_sub_pixel_variance16x4,
aom_highbd_10_sub_pixel_avg_variance16x4,
aom_highbd_sad16x4x4d_bits10,
- aom_highbd_jnt_sad16x4_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance16x4);
+ aom_highbd_dist_wtd_sad16x4_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x4);
HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10,
aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16,
aom_highbd_10_sub_pixel_variance4x16,
aom_highbd_10_sub_pixel_avg_variance4x16,
aom_highbd_sad4x16x4d_bits10,
- aom_highbd_jnt_sad4x16_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance4x16);
+ aom_highbd_dist_wtd_sad4x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x16);
HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10,
aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16,
aom_highbd_10_sub_pixel_variance32x16,
aom_highbd_10_sub_pixel_avg_variance32x16,
aom_highbd_sad32x16x4d_bits10,
- aom_highbd_jnt_sad32x16_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance32x16);
+ aom_highbd_dist_wtd_sad32x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x16);
HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10,
aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32,
aom_highbd_10_sub_pixel_variance16x32,
aom_highbd_10_sub_pixel_avg_variance16x32,
aom_highbd_sad16x32x4d_bits10,
- aom_highbd_jnt_sad16x32_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance16x32);
+ aom_highbd_dist_wtd_sad16x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x32);
HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10,
aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32,
aom_highbd_10_sub_pixel_variance64x32,
aom_highbd_10_sub_pixel_avg_variance64x32,
aom_highbd_sad64x32x4d_bits10,
- aom_highbd_jnt_sad64x32_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance64x32);
+ aom_highbd_dist_wtd_sad64x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x32);
HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10,
aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64,
aom_highbd_10_sub_pixel_variance32x64,
aom_highbd_10_sub_pixel_avg_variance32x64,
aom_highbd_sad32x64x4d_bits10,
- aom_highbd_jnt_sad32x64_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance32x64);
+ aom_highbd_dist_wtd_sad32x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x64);
HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10,
aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32,
aom_highbd_10_sub_pixel_variance32x32,
aom_highbd_10_sub_pixel_avg_variance32x32,
aom_highbd_sad32x32x4d_bits10,
- aom_highbd_jnt_sad32x32_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance32x32);
+ aom_highbd_dist_wtd_sad32x32_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance32x32);
HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10,
aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64,
aom_highbd_10_sub_pixel_variance64x64,
aom_highbd_10_sub_pixel_avg_variance64x64,
aom_highbd_sad64x64x4d_bits10,
- aom_highbd_jnt_sad64x64_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance64x64);
+ aom_highbd_dist_wtd_sad64x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x64);
HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10,
aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16,
aom_highbd_10_sub_pixel_variance16x16,
aom_highbd_10_sub_pixel_avg_variance16x16,
aom_highbd_sad16x16x4d_bits10,
- aom_highbd_jnt_sad16x16_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance16x16);
+ aom_highbd_dist_wtd_sad16x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x16);
HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10,
aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8,
aom_highbd_10_sub_pixel_variance16x8,
aom_highbd_10_sub_pixel_avg_variance16x8,
aom_highbd_sad16x8x4d_bits10,
- aom_highbd_jnt_sad16x8_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance16x8);
+ aom_highbd_dist_wtd_sad16x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance16x8);
HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10,
aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16,
aom_highbd_10_sub_pixel_variance8x16,
aom_highbd_10_sub_pixel_avg_variance8x16,
aom_highbd_sad8x16x4d_bits10,
- aom_highbd_jnt_sad8x16_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance8x16);
+ aom_highbd_dist_wtd_sad8x16_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x16);
HIGHBD_BFP(
BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10,
aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8,
aom_highbd_10_sub_pixel_avg_variance8x8,
- aom_highbd_sad8x8x4d_bits10, aom_highbd_jnt_sad8x8_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance8x8);
+ aom_highbd_sad8x8x4d_bits10, aom_highbd_dist_wtd_sad8x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x8);
HIGHBD_BFP(
BLOCK_8X4, aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10,
aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4,
aom_highbd_10_sub_pixel_avg_variance8x4,
- aom_highbd_sad8x4x4d_bits10, aom_highbd_jnt_sad8x4_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance8x4);
+ aom_highbd_sad8x4x4d_bits10, aom_highbd_dist_wtd_sad8x4_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance8x4);
HIGHBD_BFP(
BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10,
aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8,
aom_highbd_10_sub_pixel_avg_variance4x8,
- aom_highbd_sad4x8x4d_bits10, aom_highbd_jnt_sad4x8_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance4x8);
+ aom_highbd_sad4x8x4d_bits10, aom_highbd_dist_wtd_sad4x8_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x8);
HIGHBD_BFP(
BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10,
aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4,
aom_highbd_10_sub_pixel_avg_variance4x4,
- aom_highbd_sad4x4x4d_bits10, aom_highbd_jnt_sad4x4_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance4x4);
+ aom_highbd_sad4x4x4d_bits10, aom_highbd_dist_wtd_sad4x4_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance4x4);
HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10,
aom_highbd_sad128x128_avg_bits10,
@@ -1964,24 +1922,26 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
aom_highbd_10_sub_pixel_variance128x128,
aom_highbd_10_sub_pixel_avg_variance128x128,
aom_highbd_sad128x128x4d_bits10,
- aom_highbd_jnt_sad128x128_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance128x128);
-
- HIGHBD_BFP(
- BLOCK_128X64, aom_highbd_sad128x64_bits10,
- aom_highbd_sad128x64_avg_bits10, aom_highbd_10_variance128x64,
- aom_highbd_10_sub_pixel_variance128x64,
- aom_highbd_10_sub_pixel_avg_variance128x64,
- aom_highbd_sad128x64x4d_bits10, aom_highbd_jnt_sad128x64_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance128x64);
-
- HIGHBD_BFP(
- BLOCK_64X128, aom_highbd_sad64x128_bits10,
- aom_highbd_sad64x128_avg_bits10, aom_highbd_10_variance64x128,
- aom_highbd_10_sub_pixel_variance64x128,
- aom_highbd_10_sub_pixel_avg_variance64x128,
- aom_highbd_sad64x128x4d_bits10, aom_highbd_jnt_sad64x128_avg_bits10,
- aom_highbd_10_jnt_sub_pixel_avg_variance64x128);
+ aom_highbd_dist_wtd_sad128x128_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x128);
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits10,
+ aom_highbd_sad128x64_avg_bits10,
+ aom_highbd_10_variance128x64,
+ aom_highbd_10_sub_pixel_variance128x64,
+ aom_highbd_10_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits10,
+ aom_highbd_dist_wtd_sad128x64_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance128x64);
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits10,
+ aom_highbd_sad64x128_avg_bits10,
+ aom_highbd_10_variance64x128,
+ aom_highbd_10_sub_pixel_variance64x128,
+ aom_highbd_10_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits10,
+ aom_highbd_dist_wtd_sad64x128_avg_bits10,
+ aom_highbd_10_dist_wtd_sub_pixel_avg_variance64x128);
HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
aom_highbd_10_masked_sub_pixel_variance128x128)
@@ -2107,148 +2067,148 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
aom_highbd_12_sub_pixel_variance64x16,
aom_highbd_12_sub_pixel_avg_variance64x16,
aom_highbd_sad64x16x4d_bits12,
- aom_highbd_jnt_sad64x16_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance64x16);
+ aom_highbd_dist_wtd_sad64x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x16);
HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12,
aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64,
aom_highbd_12_sub_pixel_variance16x64,
aom_highbd_12_sub_pixel_avg_variance16x64,
aom_highbd_sad16x64x4d_bits12,
- aom_highbd_jnt_sad16x64_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance16x64);
+ aom_highbd_dist_wtd_sad16x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x64);
HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12,
aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8,
aom_highbd_12_sub_pixel_variance32x8,
aom_highbd_12_sub_pixel_avg_variance32x8,
aom_highbd_sad32x8x4d_bits12,
- aom_highbd_jnt_sad32x8_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance32x8);
+ aom_highbd_dist_wtd_sad32x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x8);
HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12,
aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32,
aom_highbd_12_sub_pixel_variance8x32,
aom_highbd_12_sub_pixel_avg_variance8x32,
aom_highbd_sad8x32x4d_bits12,
- aom_highbd_jnt_sad8x32_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance8x32);
+ aom_highbd_dist_wtd_sad8x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x32);
HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12,
aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4,
aom_highbd_12_sub_pixel_variance16x4,
aom_highbd_12_sub_pixel_avg_variance16x4,
aom_highbd_sad16x4x4d_bits12,
- aom_highbd_jnt_sad16x4_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance16x4);
+ aom_highbd_dist_wtd_sad16x4_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x4);
HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12,
aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16,
aom_highbd_12_sub_pixel_variance4x16,
aom_highbd_12_sub_pixel_avg_variance4x16,
aom_highbd_sad4x16x4d_bits12,
- aom_highbd_jnt_sad4x16_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance4x16);
+ aom_highbd_dist_wtd_sad4x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x16);
HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12,
aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16,
aom_highbd_12_sub_pixel_variance32x16,
aom_highbd_12_sub_pixel_avg_variance32x16,
aom_highbd_sad32x16x4d_bits12,
- aom_highbd_jnt_sad32x16_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance32x16);
+ aom_highbd_dist_wtd_sad32x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x16);
HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12,
aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32,
aom_highbd_12_sub_pixel_variance16x32,
aom_highbd_12_sub_pixel_avg_variance16x32,
aom_highbd_sad16x32x4d_bits12,
- aom_highbd_jnt_sad16x32_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance16x32);
+ aom_highbd_dist_wtd_sad16x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x32);
HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12,
aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32,
aom_highbd_12_sub_pixel_variance64x32,
aom_highbd_12_sub_pixel_avg_variance64x32,
aom_highbd_sad64x32x4d_bits12,
- aom_highbd_jnt_sad64x32_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance64x32);
+ aom_highbd_dist_wtd_sad64x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x32);
HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12,
aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64,
aom_highbd_12_sub_pixel_variance32x64,
aom_highbd_12_sub_pixel_avg_variance32x64,
aom_highbd_sad32x64x4d_bits12,
- aom_highbd_jnt_sad32x64_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance32x64);
+ aom_highbd_dist_wtd_sad32x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x64);
HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12,
aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32,
aom_highbd_12_sub_pixel_variance32x32,
aom_highbd_12_sub_pixel_avg_variance32x32,
aom_highbd_sad32x32x4d_bits12,
- aom_highbd_jnt_sad32x32_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance32x32);
+ aom_highbd_dist_wtd_sad32x32_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance32x32);
HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12,
aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64,
aom_highbd_12_sub_pixel_variance64x64,
aom_highbd_12_sub_pixel_avg_variance64x64,
aom_highbd_sad64x64x4d_bits12,
- aom_highbd_jnt_sad64x64_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance64x64);
+ aom_highbd_dist_wtd_sad64x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x64);
HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12,
aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16,
aom_highbd_12_sub_pixel_variance16x16,
aom_highbd_12_sub_pixel_avg_variance16x16,
aom_highbd_sad16x16x4d_bits12,
- aom_highbd_jnt_sad16x16_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance16x16);
+ aom_highbd_dist_wtd_sad16x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x16);
HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12,
aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8,
aom_highbd_12_sub_pixel_variance16x8,
aom_highbd_12_sub_pixel_avg_variance16x8,
aom_highbd_sad16x8x4d_bits12,
- aom_highbd_jnt_sad16x8_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance16x8);
+ aom_highbd_dist_wtd_sad16x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance16x8);
HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12,
aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16,
aom_highbd_12_sub_pixel_variance8x16,
aom_highbd_12_sub_pixel_avg_variance8x16,
aom_highbd_sad8x16x4d_bits12,
- aom_highbd_jnt_sad8x16_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance8x16);
+ aom_highbd_dist_wtd_sad8x16_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x16);
HIGHBD_BFP(
BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12,
aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8,
aom_highbd_12_sub_pixel_avg_variance8x8,
- aom_highbd_sad8x8x4d_bits12, aom_highbd_jnt_sad8x8_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance8x8);
+ aom_highbd_sad8x8x4d_bits12, aom_highbd_dist_wtd_sad8x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x8);
HIGHBD_BFP(
BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12,
aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4,
aom_highbd_12_sub_pixel_avg_variance8x4,
- aom_highbd_sad8x4x4d_bits12, aom_highbd_jnt_sad8x4_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance8x4);
+ aom_highbd_sad8x4x4d_bits12, aom_highbd_dist_wtd_sad8x4_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance8x4);
HIGHBD_BFP(
BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12,
aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8,
aom_highbd_12_sub_pixel_avg_variance4x8,
- aom_highbd_sad4x8x4d_bits12, aom_highbd_jnt_sad4x8_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance4x8);
+ aom_highbd_sad4x8x4d_bits12, aom_highbd_dist_wtd_sad4x8_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x8);
HIGHBD_BFP(
BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12,
aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4,
aom_highbd_12_sub_pixel_avg_variance4x4,
- aom_highbd_sad4x4x4d_bits12, aom_highbd_jnt_sad4x4_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance4x4);
+ aom_highbd_sad4x4x4d_bits12, aom_highbd_dist_wtd_sad4x4_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance4x4);
HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12,
aom_highbd_sad128x128_avg_bits12,
@@ -2256,24 +2216,26 @@ static void highbd_set_var_fns(AV1_COMP *const cpi) {
aom_highbd_12_sub_pixel_variance128x128,
aom_highbd_12_sub_pixel_avg_variance128x128,
aom_highbd_sad128x128x4d_bits12,
- aom_highbd_jnt_sad128x128_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance128x128);
-
- HIGHBD_BFP(
- BLOCK_128X64, aom_highbd_sad128x64_bits12,
- aom_highbd_sad128x64_avg_bits12, aom_highbd_12_variance128x64,
- aom_highbd_12_sub_pixel_variance128x64,
- aom_highbd_12_sub_pixel_avg_variance128x64,
- aom_highbd_sad128x64x4d_bits12, aom_highbd_jnt_sad128x64_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance128x64);
-
- HIGHBD_BFP(
- BLOCK_64X128, aom_highbd_sad64x128_bits12,
- aom_highbd_sad64x128_avg_bits12, aom_highbd_12_variance64x128,
- aom_highbd_12_sub_pixel_variance64x128,
- aom_highbd_12_sub_pixel_avg_variance64x128,
- aom_highbd_sad64x128x4d_bits12, aom_highbd_jnt_sad64x128_avg_bits12,
- aom_highbd_12_jnt_sub_pixel_avg_variance64x128);
+ aom_highbd_dist_wtd_sad128x128_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x128);
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits12,
+ aom_highbd_sad128x64_avg_bits12,
+ aom_highbd_12_variance128x64,
+ aom_highbd_12_sub_pixel_variance128x64,
+ aom_highbd_12_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits12,
+ aom_highbd_dist_wtd_sad128x64_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance128x64);
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits12,
+ aom_highbd_sad64x128_avg_bits12,
+ aom_highbd_12_variance64x128,
+ aom_highbd_12_sub_pixel_variance64x128,
+ aom_highbd_12_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits12,
+ aom_highbd_dist_wtd_sad64x128_avg_bits12,
+ aom_highbd_12_dist_wtd_sub_pixel_avg_variance64x128);
HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
aom_highbd_12_masked_sub_pixel_variance128x128)
@@ -2433,6 +2395,16 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
assert(IMPLIES(seq_params->profile <= PROFILE_1,
seq_params->bit_depth <= AOM_BITS_10));
+ memcpy(cpi->target_seq_level_idx, oxcf->target_seq_level_idx,
+ sizeof(cpi->target_seq_level_idx));
+ cpi->keep_level_stats = 0;
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ if (cpi->target_seq_level_idx[i] < SEQ_LEVELS) {
+ cpi->keep_level_stats = 1;
+ break;
+ }
+ }
+
cm->timing_info_present = oxcf->timing_info_present;
cm->timing_info.num_units_in_display_tick =
oxcf->timing_info.num_units_in_display_tick;
@@ -2541,6 +2513,8 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
// Superblock size should not be updated after the first key frame.
if (!cpi->seq_params_locked) {
set_sb_size(&cm->seq_params, select_sb_size(cpi));
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i)
+ seq_params->tier[i] = (oxcf->tier_mask >> i) & 1;
}
if (cpi->initial_width || sb_size != seq_params->sb_size) {
@@ -2558,10 +2532,6 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
cpi->alt_ref_source = NULL;
rc->is_src_frame_alt_ref = 0;
- rc->is_bwd_ref_frame = 0;
- rc->is_last_bipred_frame = 0;
- rc->is_bipred_frame = 0;
-
set_tile_info(cpi);
cpi->ext_refresh_frame_flags_pending = 0;
@@ -2578,6 +2548,21 @@ void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
}
}
+static void init_level_info(AV1LevelInfo *level_info) {
+ memset(level_info, 0, MAX_NUM_OPERATING_POINTS * sizeof(*level_info));
+ for (int i = 0; i < MAX_NUM_OPERATING_POINTS; ++i) {
+ AV1LevelSpec *const level_spec = &level_info[i].level_spec;
+ level_spec->level = SEQ_LEVEL_MAX;
+ AV1LevelStats *const level_stats = &level_info[i].level_stats;
+ level_stats->min_cropped_tile_width = INT_MAX;
+ level_stats->min_cropped_tile_height = INT_MAX;
+ level_stats->min_frame_width = INT_MAX;
+ level_stats->min_frame_height = INT_MAX;
+ level_stats->tile_width_is_valid = 1;
+ level_stats->min_cr = 1e8;
+ }
+}
+
AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
BufferPool *const pool) {
unsigned int i;
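
init_level_info() seeds each operating point's stats with sentinels (INT_MAX minima, min_cr = 1e8) so the first coded frame always tightens them. A minimal sketch of the intended update pattern (the helper name and its call site are assumptions for illustration):

    /* Hypothetical per-frame update against the sentinels set above. */
    static void update_min_dims(AV1LevelStats *stats, int w, int h) {
      if (w < stats->min_frame_width) stats->min_frame_width = w;
      if (h < stats->min_frame_height) stats->min_frame_height = h;
    }
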
@@ -2620,10 +2605,11 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
cm->current_frame.frame_number = 0;
+ cm->current_frame_id = -1;
cpi->seq_params_locked = 0;
cpi->partition_search_skippable_frame = 0;
cpi->tile_data = NULL;
- cpi->last_show_frame_buf_idx = INVALID_IDX;
+ cpi->last_show_frame_buf = NULL;
realloc_segmentation_maps(cpi);
memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
@@ -2636,19 +2622,10 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
aom_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
}
-#if CONFIG_FP_MB_STATS
- cpi->use_fp_mb_stats = 0;
- if (cpi->use_fp_mb_stats) {
- // a place holder used to store the first pass mb stats in the first pass
- CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
- aom_calloc(cm->MBs * sizeof(uint8_t), 1));
- } else {
- cpi->twopass.frame_mb_stats_buf = NULL;
- }
-#endif
-
cpi->refresh_alt_ref_frame = 0;
+ init_level_info(cpi->level_info);
+
cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
#if CONFIG_INTERNAL_STATS
cpi->b_calculate_blockiness = 1;
@@ -2659,6 +2636,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
cpi->count = 0;
cpi->bytes = 0;
+#if CONFIG_SPEED_STATS
+ cpi->tx_search_count = 0;
+#endif // CONFIG_SPEED_STATS
if (cpi->b_calculate_psnr) {
cpi->total_sq_error = 0;
@@ -2707,19 +2687,6 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
const size_t packet_sz = sizeof(FIRSTPASS_STATS);
const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
-#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- const size_t psz = cpi->common.MBs * sizeof(uint8_t);
- const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
-
- cpi->twopass.firstpass_mb_stats.mb_stats_start =
- oxcf->firstpass_mb_stats_in.buf;
- cpi->twopass.firstpass_mb_stats.mb_stats_end =
- cpi->twopass.firstpass_mb_stats.mb_stats_start +
- (ps - 1) * cpi->common.MBs * sizeof(uint8_t);
- }
-#endif
-
cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
cpi->twopass.stats_in = cpi->twopass.stats_in_start;
cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
@@ -2740,11 +2707,9 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
(int32_t *)aom_memalign(
16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
CHECK_MEM_ERROR(
cm, cpi->td.mb.inter_modes_info,
(InterModesInfo *)aom_malloc(sizeof(*cpi->td.mb.inter_modes_info)));
-#endif
for (int x = 0; x < 2; x++)
for (int y = 0; y < 2; y++)
@@ -2759,8 +2724,8 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
(int32_t *)aom_memalign(
16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
- av1_set_speed_features_framesize_independent(cpi);
- av1_set_speed_features_framesize_dependent(cpi);
+ av1_set_speed_features_framesize_independent(cpi, oxcf->speed);
+ av1_set_speed_features_framesize_dependent(cpi, oxcf->speed);
for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
@@ -2777,6 +2742,10 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
}
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+ av1_zero(cpi->partition_stats);
+#endif
+
#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
cpi->fn_ptr[BT].sdf = SDF; \
cpi->fn_ptr[BT].sdaf = SDAF; \
@@ -2789,103 +2758,109 @@ AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
- aom_sad4x16x4d, aom_jnt_sad4x16_avg, aom_jnt_sub_pixel_avg_variance4x16)
+ aom_sad4x16x4d, aom_dist_wtd_sad4x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance4x16)
BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
- aom_sad16x4x4d, aom_jnt_sad16x4_avg, aom_jnt_sub_pixel_avg_variance16x4)
+ aom_sad16x4x4d, aom_dist_wtd_sad16x4_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x4)
BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
- aom_sad8x32x4d, aom_jnt_sad8x32_avg, aom_jnt_sub_pixel_avg_variance8x32)
+ aom_sad8x32x4d, aom_dist_wtd_sad8x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x32)
BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
- aom_sad32x8x4d, aom_jnt_sad32x8_avg, aom_jnt_sub_pixel_avg_variance32x8)
+ aom_sad32x8x4d, aom_dist_wtd_sad32x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x8)
BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
- aom_sad16x64x4d, aom_jnt_sad16x64_avg,
- aom_jnt_sub_pixel_avg_variance16x64)
+ aom_sad16x64x4d, aom_dist_wtd_sad16x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x64)
BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
- aom_sad64x16x4d, aom_jnt_sad64x16_avg,
- aom_jnt_sub_pixel_avg_variance64x16)
+ aom_sad64x16x4d, aom_dist_wtd_sad64x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x16)
BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
- aom_sad128x128x4d, aom_jnt_sad128x128_avg,
- aom_jnt_sub_pixel_avg_variance128x128)
+ aom_sad128x128x4d, aom_dist_wtd_sad128x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x128)
BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
- aom_sad128x64x4d, aom_jnt_sad128x64_avg,
- aom_jnt_sub_pixel_avg_variance128x64)
+ aom_sad128x64x4d, aom_dist_wtd_sad128x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance128x64)
BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
- aom_sad64x128x4d, aom_jnt_sad64x128_avg,
- aom_jnt_sub_pixel_avg_variance64x128)
+ aom_sad64x128x4d, aom_dist_wtd_sad64x128_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x128)
BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
- aom_sad32x16x4d, aom_jnt_sad32x16_avg,
- aom_jnt_sub_pixel_avg_variance32x16)
+ aom_sad32x16x4d, aom_dist_wtd_sad32x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x16)
BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
- aom_sad16x32x4d, aom_jnt_sad16x32_avg,
- aom_jnt_sub_pixel_avg_variance16x32)
+ aom_sad16x32x4d, aom_dist_wtd_sad16x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x32)
BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
- aom_sad64x32x4d, aom_jnt_sad64x32_avg,
- aom_jnt_sub_pixel_avg_variance64x32)
+ aom_sad64x32x4d, aom_dist_wtd_sad64x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x32)
BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
- aom_sad32x64x4d, aom_jnt_sad32x64_avg,
- aom_jnt_sub_pixel_avg_variance32x64)
+ aom_sad32x64x4d, aom_dist_wtd_sad32x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x64)
BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
- aom_sad32x32x4d, aom_jnt_sad32x32_avg,
- aom_jnt_sub_pixel_avg_variance32x32)
+ aom_sad32x32x4d, aom_dist_wtd_sad32x32_avg,
+ aom_dist_wtd_sub_pixel_avg_variance32x32)
BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
- aom_sad64x64x4d, aom_jnt_sad64x64_avg,
- aom_jnt_sub_pixel_avg_variance64x64)
+ aom_sad64x64x4d, aom_dist_wtd_sad64x64_avg,
+ aom_dist_wtd_sub_pixel_avg_variance64x64)
BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
- aom_sad16x16x4d, aom_jnt_sad16x16_avg,
- aom_jnt_sub_pixel_avg_variance16x16)
+ aom_sad16x16x4d, aom_dist_wtd_sad16x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x16)
BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
- aom_sad16x8x4d, aom_jnt_sad16x8_avg, aom_jnt_sub_pixel_avg_variance16x8)
+ aom_sad16x8x4d, aom_dist_wtd_sad16x8_avg,
+ aom_dist_wtd_sub_pixel_avg_variance16x8)
BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
- aom_sad8x16x4d, aom_jnt_sad8x16_avg, aom_jnt_sub_pixel_avg_variance8x16)
+ aom_sad8x16x4d, aom_dist_wtd_sad8x16_avg,
+ aom_dist_wtd_sub_pixel_avg_variance8x16)
BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
- aom_jnt_sad8x8_avg, aom_jnt_sub_pixel_avg_variance8x8)
+ aom_dist_wtd_sad8x8_avg, aom_dist_wtd_sub_pixel_avg_variance8x8)
BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
- aom_jnt_sad8x4_avg, aom_jnt_sub_pixel_avg_variance8x4)
+ aom_dist_wtd_sad8x4_avg, aom_dist_wtd_sub_pixel_avg_variance8x4)
BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
- aom_jnt_sad4x8_avg, aom_jnt_sub_pixel_avg_variance4x8)
+ aom_dist_wtd_sad4x8_avg, aom_dist_wtd_sub_pixel_avg_variance4x8)
BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
- aom_jnt_sad4x4_avg, aom_jnt_sub_pixel_avg_variance4x4)
+ aom_dist_wtd_sad4x4_avg, aom_dist_wtd_sub_pixel_avg_variance4x4)
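// For illustration, a minimal sketch of what one BFP(...) invocation above
// expands to, assuming the macro assigns its arguments in order to the
// matching fn_ptr fields (only the .sdaf assignment is visible in this
// hunk; the other field names are assumptions):
//   cpi->fn_ptr[BLOCK_4X4].sdf    = aom_sad4x4;
//   cpi->fn_ptr[BLOCK_4X4].sdaf   = aom_sad4x4_avg;
//   cpi->fn_ptr[BLOCK_4X4].vf     = aom_variance4x4;
//   cpi->fn_ptr[BLOCK_4X4].svf    = aom_sub_pixel_variance4x4;
//   cpi->fn_ptr[BLOCK_4X4].svaf   = aom_sub_pixel_avg_variance4x4;
//   cpi->fn_ptr[BLOCK_4X4].sdx4df = aom_sad4x4x4d;
//   cpi->fn_ptr[BLOCK_4X4].jsdaf  = aom_dist_wtd_sad4x4_avg;
//   cpi->fn_ptr[BLOCK_4X4].jsvaf  = aom_dist_wtd_sub_pixel_avg_variance4x4;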
#define OBFP(BT, OSDF, OVF, OSVF) \
cpi->fn_ptr[BT].osdf = OSDF; \
@@ -3083,6 +3058,17 @@ void av1_remove_compressor(AV1_COMP *cpi) {
fclose(f);
}
#endif // CONFIG_INTERNAL_STATS
+#if CONFIG_SPEED_STATS
+ if (cpi->oxcf.pass != 1) {
+ fprintf(stdout, "tx_search_count = %d\n", cpi->tx_search_count);
+ }
+#endif // CONFIG_SPEED_STATS
+
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+ if (cpi->oxcf.pass != 1) {
+ av1_print_partition_stats(&cpi->partition_stats);
+ }
+#endif
}
for (int frame = 0; frame < MAX_LAG_BUFFERS; ++frame) {
@@ -3090,7 +3076,7 @@ void av1_remove_compressor(AV1_COMP *cpi) {
cpi->tpl_stats[frame].is_valid = 0;
}
- for (t = 0; t < cpi->num_workers; ++t) {
+ for (t = cpi->num_workers - 1; t >= 0; --t) {
AVxWorker *const worker = &cpi->workers[t];
EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
@@ -3099,7 +3085,7 @@ void av1_remove_compressor(AV1_COMP *cpi) {
// Deallocate allocated thread data.
if (cpi->row_mt == 1) aom_free(thread_data->td->tctx);
- if (t < cpi->num_workers - 1) {
+ if (t > 0) {
aom_free(thread_data->td->palette_buffer);
aom_free(thread_data->td->tmp_conv_dst);
for (int j = 0; j < 2; ++j) {
@@ -3109,9 +3095,7 @@ void av1_remove_compressor(AV1_COMP *cpi) {
aom_free(thread_data->td->left_pred_buf);
aom_free(thread_data->td->wsrc_buf);
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
aom_free(thread_data->td->inter_modes_info);
-#endif
for (int x = 0; x < 2; x++) {
for (int y = 0; y < 2; y++) {
aom_free(thread_data->td->hash_value_buffer[x][y]);
@@ -3148,12 +3132,6 @@ void av1_remove_compressor(AV1_COMP *cpi) {
aom_free(cpi->mbgraph_stats[i].mb_stats);
}
-#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- aom_free(cpi->twopass.frame_mb_stats_buf);
- cpi->twopass.frame_mb_stats_buf = NULL;
- }
-#endif
#if CONFIG_INTERNAL_STATS
aom_free(cpi->ssim_vars);
cpi->ssim_vars = NULL;
@@ -3179,7 +3157,7 @@ static void generate_psnr_packet(AV1_COMP *cpi) {
struct aom_codec_cx_pkt pkt;
int i;
PSNR_STATS psnr;
- aom_calc_highbd_psnr(cpi->source, cpi->common.frame_to_show, &psnr,
+ aom_calc_highbd_psnr(cpi->source, &cpi->common.cur_frame->buf, &psnr,
cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
for (i = 0; i < 4; ++i) {
@@ -3198,15 +3176,6 @@ int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) {
return 0;
}
-void av1_update_reference(AV1_COMP *cpi, int ref_frame_upd_flags) {
- cpi->ext_refresh_last_frame = (ref_frame_upd_flags & AOM_LAST_FLAG) != 0;
- cpi->ext_refresh_golden_frame = (ref_frame_upd_flags & AOM_GOLD_FLAG) != 0;
- cpi->ext_refresh_alt_ref_frame = (ref_frame_upd_flags & AOM_ALT_FLAG) != 0;
- cpi->ext_refresh_bwd_ref_frame = (ref_frame_upd_flags & AOM_BWD_FLAG) != 0;
- cpi->ext_refresh_alt2_ref_frame = (ref_frame_upd_flags & AOM_ALT2_FLAG) != 0;
- cpi->ext_refresh_frame_flags_pending = 1;
-}
-
int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
@@ -3269,62 +3238,6 @@ void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
}
#endif
-static void check_show_existing_frame(AV1_COMP *cpi) {
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- AV1_COMMON *const cm = &cpi->common;
- const FRAME_UPDATE_TYPE next_frame_update_type =
- gf_group->update_type[gf_group->index];
-#if USE_SYMM_MULTI_LAYER
- const int which_arf = (cpi->new_bwdref_update_rule == 1)
- ? gf_group->arf_update_idx[gf_group->index] > 0
- : gf_group->arf_update_idx[gf_group->index];
-#else
- const int which_arf = gf_group->arf_update_idx[gf_group->index];
-#endif
-
- if (cm->show_existing_frame == 1) {
- cm->show_existing_frame = 0;
- } else if (cpi->rc.is_last_bipred_frame) {
-#if USE_SYMM_MULTI_LAYER
-      // NOTE: When the new structure is used, every bwdref will have one
-      //       overlay frame. Therefore, there is no need to find out which
-      //       frame to show in advance.
- if (cpi->new_bwdref_update_rule == 0) {
-#endif
-      // NOTE: If the current frame is a last bi-predictive frame, the
-      //       BWDREF_FRAME needs to be shown next, which is pointed to by
-      //       last_fb_idxes[0] after the reference frame buffer update
- cpi->rc.is_last_bipred_frame = 0;
- cm->show_existing_frame = 1;
- cpi->existing_fb_idx_to_show = cpi->remapped_ref_idx[0];
-#if USE_SYMM_MULTI_LAYER
- }
-#endif
- } else if (cpi->is_arf_filter_off[which_arf] &&
- (next_frame_update_type == OVERLAY_UPDATE ||
- next_frame_update_type == INTNL_OVERLAY_UPDATE)) {
-#if USE_SYMM_MULTI_LAYER
- const int bwdref_to_show =
- (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
-#else
- const int bwdref_to_show = ALTREF2_FRAME;
-#endif
- // Other parameters related to OVERLAY_UPDATE will be taken care of
- // in av1_rc_get_second_pass_params(cpi)
- cm->show_existing_frame = 1;
- cpi->rc.is_src_frame_alt_ref = 1;
- cpi->existing_fb_idx_to_show =
- (next_frame_update_type == OVERLAY_UPDATE)
- ? get_ref_frame_map_idx(cpi, ALTREF_FRAME)
- : get_ref_frame_map_idx(cpi, bwdref_to_show);
-#if USE_SYMM_MULTI_LAYER
- if (cpi->new_bwdref_update_rule == 0)
-#endif
- cpi->is_arf_filter_off[which_arf] = 0;
- }
- cpi->rc.is_src_frame_ext_arf = 0;
-}
-
#ifdef OUTPUT_YUV_REC
void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
uint8_t *src = s->y_buffer;
@@ -3433,379 +3346,6 @@ static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q,
return force_recode;
}
-#define DUMP_REF_FRAME_IMAGES 0
-
-#if DUMP_REF_FRAME_IMAGES == 1
-static int dump_one_image(AV1_COMMON *cm,
- const YV12_BUFFER_CONFIG *const ref_buf,
- char *file_name) {
- int h;
- FILE *f_ref = NULL;
-
- if (ref_buf == NULL) {
- printf("Frame data buffer is NULL.\n");
- return AOM_CODEC_MEM_ERROR;
- }
-
- if ((f_ref = fopen(file_name, "wb")) == NULL) {
- printf("Unable to open file %s to write.\n", file_name);
- return AOM_CODEC_MEM_ERROR;
- }
-
- // --- Y ---
- for (h = 0; h < cm->height; ++h) {
- fwrite(&ref_buf->y_buffer[h * ref_buf->y_stride], 1, cm->width, f_ref);
- }
- // --- U ---
- for (h = 0; h < (cm->height >> 1); ++h) {
- fwrite(&ref_buf->u_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
- f_ref);
- }
- // --- V ---
- for (h = 0; h < (cm->height >> 1); ++h) {
- fwrite(&ref_buf->v_buffer[h * ref_buf->uv_stride], 1, (cm->width >> 1),
- f_ref);
- }
-
- fclose(f_ref);
-
- return AOM_CODEC_OK;
-}
-
-static void dump_ref_frame_images(AV1_COMP *cpi) {
- AV1_COMMON *const cm = &cpi->common;
- MV_REFERENCE_FRAME ref_frame;
-
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- char file_name[256] = "";
- snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv",
- cm->current_frame.frame_number, ref_frame);
- dump_one_image(cm, get_ref_frame_buffer(cpi, ref_frame), file_name);
- }
-}
-#endif // DUMP_REF_FRAME_IMAGES == 1
-
-// This function is used to shift the virtual indices of last reference frames
-// as follows:
-// LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
-// when the LAST_FRAME is updated.
-static INLINE void shift_last_ref_frames(AV1_COMP *cpi) {
- // TODO(isbs): shift the scaled indices as well
- for (int ref_frame = LAST3_FRAME; ref_frame > LAST_FRAME; --ref_frame) {
- const int ref_idx = ref_frame - LAST_FRAME;
- cpi->remapped_ref_idx[ref_idx] = cpi->remapped_ref_idx[ref_idx - 1];
-
- if (!cpi->rc.is_src_frame_alt_ref) {
- memcpy(cpi->interp_filter_selected[ref_frame],
- cpi->interp_filter_selected[ref_frame - 1],
- sizeof(cpi->interp_filter_selected[ref_frame - 1]));
- }
- }
-}
-
-#if USE_SYMM_MULTI_LAYER
-// This function is used to shift the virtual indices of bwd reference
-// frames as follows:
-// BWD_REF -> ALT2_REF -> EXT_REF
-// to clear a space to store the closest bwdref
-static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) {
- // TODO(isbs): shift the scaled indices as well
- static const int ordered_bwd[3] = { BWDREF_FRAME, ALTREF2_FRAME,
- EXTREF_FRAME };
-
- for (int i = 2; i > 0; --i) {
- // [0] is allocated to the current coded frame, i.e. bwdref
- memcpy(cpi->interp_filter_selected[ordered_bwd[i]],
- cpi->interp_filter_selected[ordered_bwd[i - 1]],
- sizeof(cpi->interp_filter_selected[ordered_bwd[i - 1]]));
-
- cpi->remapped_ref_idx[ordered_bwd[i] - LAST_FRAME] =
- cpi->remapped_ref_idx[ordered_bwd[i - 1] - LAST_FRAME];
- }
-}
-
-// This function is used to shift the virtual indices of bwd reference
-// frames as follows:
-// BWD_REF <- ALT2_REF <- EXT_REF
-// to update the bwd reference frame for coding the next frame.
-static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) {
- // TODO(isbs): shift the scaled indices as well
- static const int ordered_bwd[3] = { BWDREF_FRAME, ALTREF2_FRAME,
- EXTREF_FRAME };
-
- for (int i = 0; i < 2; ++i) {
- // [0] is allocated to the current coded frame, i.e. bwdref
- memcpy(cpi->interp_filter_selected[ordered_bwd[i]],
- cpi->interp_filter_selected[ordered_bwd[i + 1]],
- sizeof(cpi->interp_filter_selected[ordered_bwd[i + 1]]));
-
- cpi->remapped_ref_idx[ordered_bwd[i] - LAST_FRAME] =
- cpi->remapped_ref_idx[ordered_bwd[i + 1] - LAST_FRAME];
- }
-}
-#endif // USE_SYMM_MULTI_LAYER
-
-static void update_reference_frames(AV1_COMP *cpi) {
- AV1_COMMON *const cm = &cpi->common;
-
-  // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
-  //       to verify that there is no mismatch between encoder and decoder.
- if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
-
-  // In the case of a show_existing frame, we will not send a refresh flag
-  // to the decoder. Any change in the reference frame buffer can be done by
- // switching the virtual indices.
- if (cm->show_existing_frame) {
- // If we are not indicating to the decoder that this frame is
- // a show_existing_frame, which occurs in error_resilient mode,
- // we still want to refresh the LAST_FRAME when the current frame
- // was the source of an ext_arf.
- cpi->refresh_last_frame =
- !encode_show_existing_frame(cm) && cpi->rc.is_src_frame_ext_arf;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
-
- cpi->rc.is_bwd_ref_frame = 0;
- cpi->rc.is_last_bipred_frame = 0;
- cpi->rc.is_bipred_frame = 0;
- }
-
- BufferPool *const pool = cm->buffer_pool;
-
- // At this point the new frame has been encoded.
- // If any buffer copy / swapping is signaled it should be done here.
-
- // Only update all of the reference buffers if a KEY_FRAME is also a
- // show_frame. This ensures a fwd keyframe does not update all of the buffers
- if ((cm->current_frame.frame_type == KEY_FRAME && cm->show_frame) ||
- frame_is_sframe(cm)) {
- for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
- assign_frame_buffer(pool->frame_bufs,
- &cm->ref_frame_map[cpi->remapped_ref_idx[ref_frame]],
- cm->new_fb_idx);
- }
- return;
- }
-
- if (av1_preserve_existing_gf(cpi)) {
- // We have decided to preserve the previously existing golden frame as our
- // new ARF frame. However, in the short term in function
- // av1_bitstream.c::get_refresh_mask() we left it in the GF slot and, if
- // we're updating the GF with the current decoded frame, we save it to the
- // ARF slot instead.
- // We now have to update the ARF with the current frame and swap gld_fb_idx
- // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF
- // slot and, if we're updating the GF, the current frame becomes the new GF.
- int tmp;
-
-    // ARF in general is a better reference than overlay. We should keep ARF
-    // as a reference instead of replacing it with overlay.
-
- if (!cpi->preserve_arf_as_gld) {
- assign_frame_buffer(
- pool->frame_bufs,
- &cm->ref_frame_map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)],
- cm->new_fb_idx);
- }
-
- tmp = get_ref_frame_map_idx(cpi, ALTREF_FRAME);
- cpi->remapped_ref_idx[ALTREF_FRAME - 1] =
- get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
- cpi->remapped_ref_idx[GOLDEN_FRAME - 1] = tmp;
-
- // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
- // cpi->interp_filter_selected[GOLDEN_FRAME]?
- } else if (cpi->rc.is_src_frame_ext_arf && encode_show_existing_frame(cm)) {
-#if CONFIG_DEBUG
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE);
-#endif
-#if USE_SYMM_MULTI_LAYER
- const int bwdref_to_show =
- (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
-#else
- const int bwdref_to_show = ALTREF2_FRAME;
-#endif
- // Deal with the special case for showing existing internal ALTREF_FRAME
- // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
- // by updating the virtual indices.
- const int last3_remapped_idx = get_ref_frame_map_idx(cpi, LAST3_FRAME);
- shift_last_ref_frames(cpi);
-
- cpi->remapped_ref_idx[LAST_FRAME - 1] =
- get_ref_frame_map_idx(cpi, bwdref_to_show);
-
- memcpy(cpi->interp_filter_selected[LAST_FRAME],
- cpi->interp_filter_selected[bwdref_to_show],
- sizeof(cpi->interp_filter_selected[bwdref_to_show]));
-#if USE_SYMM_MULTI_LAYER
- if (cpi->new_bwdref_update_rule == 1) {
- lshift_bwd_ref_frames(cpi);
- // pass outdated forward reference frame (previous LAST3) to the
- // spared space
- cpi->remapped_ref_idx[EXTREF_FRAME - 1] = last3_remapped_idx;
- } else {
-#endif
- cpi->remapped_ref_idx[bwdref_to_show - 1] = last3_remapped_idx;
-#if USE_SYMM_MULTI_LAYER
- }
-#endif
- } else { /* For non key/golden frames */
- // === ALTREF_FRAME ===
- if (cpi->refresh_alt_ref_frame) {
- int arf_idx = get_ref_frame_map_idx(cpi, ALTREF_FRAME);
- assign_frame_buffer(pool->frame_bufs, &cm->ref_frame_map[arf_idx],
- cm->new_fb_idx);
-
- memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
- }
-
- // === GOLDEN_FRAME ===
- if (cpi->refresh_golden_frame) {
- assign_frame_buffer(
- pool->frame_bufs,
- &cm->ref_frame_map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)],
- cm->new_fb_idx);
-
- memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
- }
-
- // === BWDREF_FRAME ===
- if (cpi->refresh_bwd_ref_frame) {
-#if USE_SYMM_MULTI_LAYER
- if (cpi->new_bwdref_update_rule) {
- // We shift the backward reference frame as follows:
- // BWDREF -> ALTREF2 -> EXTREF
- // and assign the newly coded frame to BWDREF so that it always
- // keeps the nearest future frame
- int tmp = get_ref_frame_map_idx(cpi, EXTREF_FRAME);
- assign_frame_buffer(pool->frame_bufs, &cm->ref_frame_map[tmp],
- cm->new_fb_idx);
-
- rshift_bwd_ref_frames(cpi);
- cpi->remapped_ref_idx[BWDREF_FRAME - 1] = tmp;
- } else {
-#endif // USE_SYMM_MULTI_LAYER
- assign_frame_buffer(
- pool->frame_bufs,
- &cm->ref_frame_map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)],
- cm->new_fb_idx);
-#if USE_SYMM_MULTI_LAYER
- }
-#endif
- memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
- }
-
- // === ALTREF2_FRAME ===
- if (cpi->refresh_alt2_ref_frame) {
- assign_frame_buffer(
- pool->frame_bufs,
- &cm->ref_frame_map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)],
- cm->new_fb_idx);
-
- memcpy(cpi->interp_filter_selected[ALTREF2_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
- }
- }
-
- if (cpi->refresh_last_frame) {
- // NOTE(zoeliu): We have two layers of mapping (1) from the per-frame
- // reference to the reference frame buffer virtual index; and then (2) from
- // the virtual index to the reference frame buffer physical index:
- //
- // LAST_FRAME, ..., EXTREF_FRAME
- // | |
- // v v
- // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1]
- // | |
- // v v
- // ref_frame_map[], ..., ref_frame_map[]
- //
- // When refresh_last_frame is set, it is intended to retire LAST3_FRAME,
- // have the other 2 LAST reference frames shifted as follows:
- // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
- // , and then have LAST_FRAME refreshed by the newly coded frame.
- //
-    // To fulfill it, the decoder will be notified to execute the following 2 steps:
- //
- // (a) To change ref_frame_map[] and have the virtual index of LAST3_FRAME
- // to point to the newly coded frame, i.e.
- // ref_frame_map[lst_fb_idexes[2]] => new_fb_idx;
- //
- // (b) To change the 1st layer mapping to have LAST_FRAME mapped to the
- // original virtual index of LAST3_FRAME and have the other mappings
- // shifted as follows:
- // LAST_FRAME, LAST2_FRAME, LAST3_FRAME
- // | | |
- // v v v
- // remapped_ref_idx[2], remapped_ref_idx[0], remapped_ref_idx[1]
- assign_frame_buffer(
- pool->frame_bufs,
- &cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST3_FRAME)],
- cm->new_fb_idx);
-
- int last3_remapped_idx = get_ref_frame_map_idx(cpi, LAST3_FRAME);
-
- shift_last_ref_frames(cpi);
- cpi->remapped_ref_idx[LAST_FRAME - 1] = last3_remapped_idx;
-
- assert(!encode_show_existing_frame(cm));
- memcpy(cpi->interp_filter_selected[LAST_FRAME],
- cpi->interp_filter_selected[0],
- sizeof(cpi->interp_filter_selected[0]));
-
- // If the new structure is used, we will always have overlay frames coupled
- // with bwdref frames. Therefore, we won't have to perform this update
- // in advance (we do this update when the overlay frame shows up).
-#if USE_SYMM_MULTI_LAYER
- if (cpi->new_bwdref_update_rule == 0 && cpi->rc.is_last_bipred_frame) {
-#else
- if (cpi->rc.is_last_bipred_frame) {
-#endif
- // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the
- // LAST3_FRAME by updating the virtual indices.
- //
- // NOTE: The source frame for BWDREF does not have a holding position as
- // the OVERLAY frame for ALTREF's. Hence, to resolve the reference
- // virtual index reshuffling for BWDREF, the encoder always
- // specifies a LAST_BIPRED right before BWDREF and completes the
- // reshuffling job accordingly.
- last3_remapped_idx = get_ref_frame_map_idx(cpi, LAST3_FRAME);
-
- shift_last_ref_frames(cpi);
- cpi->remapped_ref_idx[LAST_FRAME - 1] =
- get_ref_frame_map_idx(cpi, BWDREF_FRAME);
- cpi->remapped_ref_idx[BWDREF_FRAME - 1] = last3_remapped_idx;
-
- memcpy(cpi->interp_filter_selected[LAST_FRAME],
- cpi->interp_filter_selected[BWDREF_FRAME],
- sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
- }
- }
-
-#if DUMP_REF_FRAME_IMAGES == 1
- // Dump out all reference frame images.
- dump_ref_frame_images(cpi);
-#endif // DUMP_REF_FRAME_IMAGES
-}
-
-static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) {
- assert(buffer_idx != INVALID_IDX);
- RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
- ensure_mv_buffer(new_fb_ptr, cm);
- new_fb_ptr->width = cm->width;
- new_fb_ptr->height = cm->height;
-}
-
static void scale_references(AV1_COMP *cpi) {
AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
@@ -3820,68 +3360,79 @@ static void scale_references(AV1_COMP *cpi) {
if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
BufferPool *const pool = cm->buffer_pool;
const YV12_BUFFER_CONFIG *const ref =
- get_ref_frame_buffer(cpi, ref_frame);
+ get_ref_frame_yv12_buf(cm, ref_frame);
if (ref == NULL) {
- cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ cpi->scaled_ref_buf[ref_frame - 1] = NULL;
continue;
}
if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
- RefCntBuffer *new_fb_ptr = NULL;
+ // Replace the reference buffer with a copy having a thicker border,
+ // if the reference buffer is higher resolution than the current
+ // frame, and the border is thin.
+ if ((ref->y_crop_width > cm->width ||
+ ref->y_crop_height > cm->height) &&
+ ref->border < AOM_BORDER_IN_PIXELS) {
+ RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame);
+ if (aom_yv12_realloc_with_new_border(
+ &ref_fb->buf, AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+ num_planes) != 0) {
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ }
int force_scaling = 0;
- int new_fb = cpi->scaled_ref_idx[ref_frame - 1];
- if (new_fb == INVALID_IDX) {
- new_fb = get_free_fb(cm);
- if (new_fb == INVALID_IDX)
+ RefCntBuffer *new_fb = cpi->scaled_ref_buf[ref_frame - 1];
+ if (new_fb == NULL) {
+ const int new_fb_idx = get_free_fb(cm);
+ if (new_fb_idx == INVALID_IDX) {
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Unable to find free frame buffer");
+ }
force_scaling = 1;
+ new_fb = &pool->frame_bufs[new_fb_idx];
}
- new_fb_ptr = &pool->frame_bufs[new_fb];
- if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
- new_fb_ptr->buf.y_crop_height != cm->height) {
+
+ if (force_scaling || new_fb->buf.y_crop_width != cm->width ||
+ new_fb->buf.y_crop_height != cm->height) {
if (aom_realloc_frame_buffer(
- &new_fb_ptr->buf, cm->width, cm->height,
+ &new_fb->buf, cm->width, cm->height,
cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
cm->byte_alignment, NULL, NULL, NULL)) {
if (force_scaling) {
// Release the reference acquired in the get_free_fb() call above.
- --new_fb_ptr->ref_count;
+ --new_fb->ref_count;
}
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
}
av1_resize_and_extend_frame(
- ref, &new_fb_ptr->buf, (int)cm->seq_params.bit_depth, num_planes);
- cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+ ref, &new_fb->buf, (int)cm->seq_params.bit_depth, num_planes);
+ cpi->scaled_ref_buf[ref_frame - 1] = new_fb;
alloc_frame_mvs(cm, new_fb);
}
} else {
- const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
- RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
+ RefCntBuffer *buf = get_ref_frame_buf(cm, ref_frame);
buf->buf.y_crop_width = ref->y_crop_width;
buf->buf.y_crop_height = ref->y_crop_height;
- cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+ cpi->scaled_ref_buf[ref_frame - 1] = buf;
++buf->ref_count;
}
} else {
- if (cpi->oxcf.pass != 0) cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ if (cpi->oxcf.pass != 0) cpi->scaled_ref_buf[ref_frame - 1] = NULL;
}
}
}
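// Summary of the per-reference cases handled above (descriptive only):
//   ref missing            -> scaled_ref_buf[i] = NULL;
//   ref size != frame size -> scale into a pooled buffer (reallocating a
//                             thin border first when the reference is larger
//                             than the frame, e.g. under superres);
//   ref size == frame size -> reuse the reference directly and take an extra
//                             ref_count so release_scaled_references() can
//                             drop it symmetrically.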
static void release_scaled_references(AV1_COMP *cpi) {
- AV1_COMMON *cm = &cpi->common;
- int i;
// TODO(isbs): only refresh the necessary frames, rather than all of them
- for (i = 0; i < INTER_REFS_PER_FRAME; ++i) {
- const int idx = cpi->scaled_ref_idx[i];
- if (idx != INVALID_IDX) {
- RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx];
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ RefCntBuffer *const buf = cpi->scaled_ref_buf[i];
+ if (buf != NULL) {
--buf->ref_count;
- cpi->scaled_ref_idx[i] = INVALID_IDX;
+ cpi->scaled_ref_buf[i] = NULL;
}
}
}
@@ -3911,6 +3462,71 @@ static void set_mv_search_params(AV1_COMP *cpi) {
}
}
+static void set_screen_content_options(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+
+ if (cm->seq_params.force_screen_content_tools != 2) {
+ cm->allow_screen_content_tools = cm->allow_intrabc =
+ cm->seq_params.force_screen_content_tools;
+ return;
+ }
+
+ if (cpi->oxcf.content == AOM_CONTENT_SCREEN) {
+ cm->allow_screen_content_tools = cm->allow_intrabc = 1;
+ return;
+ }
+
+ // Estimate if the source frame is screen content, based on the portion of
+ // blocks that have few luma colors.
+ const uint8_t *src = cpi->source->y_buffer;
+ assert(src != NULL);
+ const int use_hbd = cpi->source->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int stride = cpi->source->y_stride;
+ const int width = cpi->source->y_width;
+ const int height = cpi->source->y_height;
+ const int bd = cm->seq_params.bit_depth;
+ const int blk_w = 16;
+ const int blk_h = 16;
+ // These threshold values are selected experimentally.
+ const int color_thresh = 4;
+ const unsigned int var_thresh = 0;
+ // Counts of blocks with no more than color_thresh colors.
+ int counts_1 = 0;
+ // Counts of blocks with no more than color_thresh colors and variance larger
+ // than var_thresh.
+ int counts_2 = 0;
+
+ for (int r = 0; r + blk_h <= height; r += blk_h) {
+ for (int c = 0; c + blk_w <= width; c += blk_w) {
+ int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
+ const uint8_t *const this_src = src + r * stride + c;
+ const int n_colors =
+ use_hbd ? av1_count_colors_highbd(this_src, stride, blk_w, blk_h, bd,
+ count_buf)
+ : av1_count_colors(this_src, stride, blk_w, blk_h, count_buf);
+ if (n_colors > 1 && n_colors <= color_thresh) {
+ ++counts_1;
+ struct buf_2d buf;
+ buf.stride = stride;
+ buf.buf = (uint8_t *)this_src;
+ const unsigned int var =
+ use_hbd
+ ? av1_high_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16, bd)
+ : av1_get_sby_perpixel_variance(cpi, &buf, BLOCK_16X16);
+ if (var > var_thresh) ++counts_2;
+ }
+ }
+ }
+
+ // The threshold values are selected experimentally.
+ cm->allow_screen_content_tools =
+ counts_1 * blk_h * blk_w * 10 > width * height;
+  // IntraBC would force loop filters off, so we use stricter rules that also
+  // require that the block has high variance.
+ cm->allow_intrabc = cm->allow_screen_content_tools &&
+ counts_2 * blk_h * blk_w * 15 > width * height;
+}
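// A worked example of the two decisions above, using assumed 1080p figures:
// a 1920x1080 luma plane has width * height = 2,073,600 pixels and
// 120 x 67 = 8,040 complete 16x16 blocks. The first test,
// counts_1 * 16 * 16 * 10 > 2,073,600, needs counts_1 > 810, i.e. a bit over
// 10% of the blocks must have 2..4 luma colors to enable screen content
// tools. The intrabc test, counts_2 * 16 * 16 * 15 > 2,073,600, needs
// counts_2 > 540, i.e. roughly 6.7% of the blocks must be both low-color
// and of non-zero variance.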
+
static void set_size_independent_vars(AV1_COMP *cpi) {
int i;
AV1_COMMON *cm = &cpi->common;
@@ -3918,25 +3534,14 @@ static void set_size_independent_vars(AV1_COMP *cpi) {
cm->global_motion[i] = default_warp_params;
}
cpi->global_motion_search_done = 0;
- av1_set_speed_features_framesize_independent(cpi);
+
+ if (frame_is_intra_only(cm)) set_screen_content_options(cpi);
+ cpi->is_screen_content_type = (cm->allow_screen_content_tools != 0);
+
+ av1_set_speed_features_framesize_independent(cpi, cpi->speed);
av1_set_rd_speed_thresholds(cpi);
- av1_set_rd_speed_thresholds_sub8x8(cpi);
cm->interp_filter = SWITCHABLE;
cm->switchable_motion_mode = 1;
-
- if (frame_is_intra_only(cm)) {
- if (cm->seq_params.force_screen_content_tools == 2) {
- cm->allow_screen_content_tools =
- cpi->oxcf.content == AOM_CONTENT_SCREEN ||
- is_screen_content(cpi->source->y_buffer,
- cpi->source->flags & YV12_FLAG_HIGHBITDEPTH,
- cm->seq_params.bit_depth, cpi->source->y_stride,
- cpi->source->y_width, cpi->source->y_height);
- } else {
- cm->allow_screen_content_tools =
- cm->seq_params.force_screen_content_tools;
- }
- }
}
static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
@@ -3945,7 +3550,7 @@ static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
// Setup variables that depend on the dimensions of the frame.
- av1_set_speed_features_framesize_dependent(cpi);
+ av1_set_speed_features_framesize_dependent(cpi, cpi->speed);
// Decide q and q bounds.
*q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height, bottom_index,
@@ -3966,11 +3571,17 @@ static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
static void init_motion_estimation(AV1_COMP *cpi) {
int y_stride = cpi->scaled_source.y_stride;
+ int y_stride_src = (cpi->oxcf.resize_mode || cpi->oxcf.superres_mode)
+ ? y_stride
+ : cpi->lookahead->buf->img.y_stride;
if (cpi->sf.mv.search_method == NSTEP) {
- av1_init3smotion_compensation(&cpi->ss_cfg, y_stride);
+ av1_init3smotion_compensation(&cpi->ss_cfg[SS_CFG_SRC], y_stride);
+ av1_init3smotion_compensation(&cpi->ss_cfg[SS_CFG_LOOKAHEAD], y_stride_src);
} else if (cpi->sf.mv.search_method == DIAMOND) {
- av1_init_dsmotion_compensation(&cpi->ss_cfg, y_stride);
+ av1_init_dsmotion_compensation(&cpi->ss_cfg[SS_CFG_SRC], y_stride);
+ av1_init_dsmotion_compensation(&cpi->ss_cfg[SS_CFG_LOOKAHEAD],
+ y_stride_src);
}
}
@@ -3999,10 +3610,9 @@ static void init_ref_frame_bufs(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
int i;
BufferPool *const pool = cm->buffer_pool;
- cm->new_fb_idx = INVALID_IDX;
cm->cur_frame = NULL;
for (i = 0; i < REF_FRAMES; ++i) {
- cm->ref_frame_map[i] = INVALID_IDX;
+ cm->ref_frame_map[i] = NULL;
}
for (i = 0; i < FRAME_BUFFERS; ++i) {
pool->frame_bufs[i].ref_count = 0;
@@ -4064,7 +3674,7 @@ static int set_size_literal(AV1_COMP *cpi, int width, int height) {
return 0;
}
-static void set_frame_size(AV1_COMP *cpi, int width, int height) {
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height) {
AV1_COMMON *const cm = &cpi->common;
const SequenceHeader *const seq_params = &cm->seq_params;
const int num_planes = av1_num_planes(cm);
@@ -4083,7 +3693,7 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
av1_set_target_rate(cpi, cm->width, cm->height);
}
- alloc_frame_mvs(cm, cm->new_fb_idx);
+ alloc_frame_mvs(cm, cm->cur_frame);
// Allocate above context buffers
if (cm->num_allocated_above_context_planes < av1_num_planes(cm) ||
@@ -4099,7 +3709,7 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
if (aom_realloc_frame_buffer(
&cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x,
seq_params->subsampling_y, seq_params->use_highbitdepth,
- AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+ cpi->oxcf.border_in_pixels, cm->byte_alignment, NULL, NULL, NULL))
aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
"Failed to allocate frame buffer");
@@ -4116,20 +3726,13 @@ static void set_frame_size(AV1_COMP *cpi, int width, int height) {
init_motion_estimation(cpi);
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- RefBuffer *const ref_buf =
- &cm->current_frame.frame_refs[ref_frame - LAST_FRAME];
- const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
-
- if (buf_idx != INVALID_IDX) {
- RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[buf_idx];
- ref_buf->buf = buf;
- av1_setup_scale_factors_for_frame(&ref_buf->sf, buf->buf.y_crop_width,
+ RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (buf != NULL) {
+ struct scale_factors *sf = get_ref_scale_factors(cm, ref_frame);
+ av1_setup_scale_factors_for_frame(sf, buf->buf.y_crop_width,
buf->buf.y_crop_height, cm->width,
cm->height);
- if (av1_is_scaled(&ref_buf->sf))
- aom_extend_frame_borders(&buf->buf, num_planes);
- } else {
- ref_buf->buf = NULL;
+ if (av1_is_scaled(sf)) aom_extend_frame_borders(&buf->buf, num_planes);
}
}
@@ -4161,24 +3764,33 @@ static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
return new_denom;
}
-#define ENERGY_BY_Q2_THRESH 0.015
+#define ENERGY_BY_Q2_THRESH 0.01
+#define ENERGY_BY_AC_THRESH 0.2
static uint8_t get_superres_denom_from_qindex_energy(int qindex, double *energy,
- double thresh) {
+ double threshq,
+ double threshp) {
const double q = av1_convert_qindex_to_q(qindex, AOM_BITS_8);
- const double threshq2 = thresh * q * q;
+ const double tq = threshq * q * q;
+ const double tp = threshp * energy[1];
+ const double thresh = AOMMIN(tq, tp);
int k;
- for (k = 8; k > 0; --k) {
- if (energy[k - 1] > threshq2) break;
+ for (k = 16; k > 8; --k) {
+ if (energy[k - 1] > thresh) break;
}
- return 2 * SCALE_NUMERATOR - k;
+ return 3 * SCALE_NUMERATOR - k;
}
static uint8_t get_superres_denom_for_qindex(const AV1_COMP *cpi, int qindex) {
- double energy[8];
+ double energy[16];
analyze_hor_freq(cpi, energy);
- return get_superres_denom_from_qindex_energy(qindex, energy,
- ENERGY_BY_Q2_THRESH);
+ /*
+ printf("\nenergy = [");
+ for (int k = 1; k < 16; ++k) printf("%f, ", energy[k]);
+ printf("]\n");
+ */
+ return get_superres_denom_from_qindex_energy(
+ qindex, energy, ENERGY_BY_Q2_THRESH, ENERGY_BY_AC_THRESH);
}
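// Reading the search above (a sketch, assuming SCALE_NUMERATOR == 8 as used
// elsewhere in libaom): k scans the upper half of the 16-bin horizontal
// spectrum from the top down, stopping at the first bin whose energy exceeds
// min(threshq * q^2, threshp * energy[1]). The returned denominator
// 3 * SCALE_NUMERATOR - k = 24 - k therefore lies in [8, 16]:
//   k == 16 (energy present in the top bin) -> denom 8, no downscaling;
//   k ==  8 (no bin above threshold)        -> denom 16, half-width coding.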
static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
@@ -4216,25 +3828,31 @@ static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
const int qthresh = (frame_is_intra_only(&cpi->common))
? oxcf->superres_kf_qthresh
: oxcf->superres_qthresh;
- if (q < qthresh) {
+ if (q <= qthresh) {
new_denom = SCALE_NUMERATOR;
} else {
- // TODO(debargha): Experiment with the variant below.
- // new_denom = get_superres_denom_for_qindex(cpi, q);
- uint8_t max_denom = get_superres_denom_for_qindex(cpi, MAXQ);
- if (max_denom == SCALE_NUMERATOR) {
- new_denom = max_denom;
- break;
- } else {
- const uint8_t q_denom_step =
- max_denom - SCALE_NUMERATOR == 0
- ? 255
- : (MAXQ - qthresh + 1 + max_denom - SCALE_NUMERATOR - 1) /
- (max_denom - SCALE_NUMERATOR);
- const uint8_t additional_denom =
- (q - qthresh + 1 + q_denom_step - 1) / q_denom_step;
- new_denom = AOMMIN(SCALE_NUMERATOR + additional_denom, max_denom);
- }
+ new_denom = get_superres_denom_for_qindex(cpi, q);
+ }
+ break;
+ }
+ case SUPERRES_AUTO: {
+ // Don't use when screen content tools are used.
+ if (cpi->common.allow_screen_content_tools) break;
+ // Don't use for inter frames.
+ if (!frame_is_intra_only(&cpi->common)) break;
+ // Don't use for keyframes that can be used as references.
+ if (cpi->rc.frames_to_key != 1) break;
+
+ // Now decide the use of superres based on 'q'.
+ int bottom_index, top_index;
+ const int q = av1_rc_pick_q_and_bounds(
+ cpi, cpi->oxcf.width, cpi->oxcf.height, &bottom_index, &top_index);
+
+ const int qthresh = 128;
+ if (q <= qthresh) {
+ new_denom = SCALE_NUMERATOR;
+ } else {
+ new_denom = get_superres_denom_for_qindex(cpi, q);
}
break;
}
@@ -4311,7 +3929,7 @@ static int validate_size_scales(RESIZE_MODE resize_mode,
}
// Calculates resize and superres params for next frame
-size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) {
+static size_params_type calculate_next_size_params(AV1_COMP *cpi) {
const AV1EncoderConfig *oxcf = &cpi->oxcf;
size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR };
int resize_denom;
@@ -4334,7 +3952,8 @@ size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) {
return rsz;
}
-static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) {
+static void setup_frame_size_from_params(AV1_COMP *cpi,
+ const size_params_type *rsz) {
int encode_width = rsz->resize_width;
int encode_height = rsz->resize_height;
@@ -4344,12 +3963,17 @@ static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) {
cm->superres_scale_denominator = rsz->superres_denom;
av1_calculate_scaled_superres_size(&encode_width, &encode_height,
rsz->superres_denom);
- set_frame_size(cpi, encode_width, encode_height);
+ av1_set_frame_size(cpi, encode_width, encode_height);
}
-static void setup_frame_size(AV1_COMP *cpi) {
- size_params_type rsz = av1_calculate_next_size_params(cpi);
+void av1_setup_frame_size(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ // Reset superres params from previous frame.
+ cm->superres_scale_denominator = SCALE_NUMERATOR;
+ const size_params_type rsz = calculate_next_size_params(cpi);
setup_frame_size_from_params(cpi, &rsz);
+
+ assert(is_min_tile_width_satisfied(cm));
}
static void superres_post_encode(AV1_COMP *cpi) {
@@ -4398,237 +4022,431 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
assert(IMPLIES(is_lossless_requested(&cpi->oxcf),
cm->coded_lossless && cm->all_lossless));
- const int no_loopfilter = cm->coded_lossless || cm->large_scale_tile;
- const int no_cdef =
- !cm->seq_params.enable_cdef || cm->coded_lossless || cm->large_scale_tile;
- const int no_restoration = !cm->seq_params.enable_restoration ||
- cm->all_lossless || cm->large_scale_tile;
+ const int use_loopfilter = !cm->coded_lossless && !cm->large_scale_tile;
+ const int use_cdef = cm->seq_params.enable_cdef && !cm->coded_lossless &&
+ !cm->large_scale_tile;
+ const int use_restoration = cm->seq_params.enable_restoration &&
+ !cm->all_lossless && !cm->large_scale_tile;
struct loopfilter *lf = &cm->lf;
- if (no_loopfilter) {
- lf->filter_level[0] = 0;
- lf->filter_level[1] = 0;
- } else {
- struct aom_usec_timer timer;
-
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, loop_filter_time);
+#endif
+ if (use_loopfilter) {
aom_clear_system_state();
-
- aom_usec_timer_start(&timer);
-
av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_pick);
-
- aom_usec_timer_mark(&timer);
- cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer);
+ } else {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
}
if (lf->filter_level[0] || lf->filter_level[1]) {
if (cpi->num_workers > 1)
- av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0,
+ av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, xd, 0, num_planes, 0,
#if LOOP_FILTER_BITMASK
0,
#endif
cpi->workers, cpi->num_workers,
&cpi->lf_row_sync);
else
- av1_loop_filter_frame(cm->frame_to_show, cm, xd,
+ av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd,
#if LOOP_FILTER_BITMASK
0,
#endif
0, num_planes, 0);
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, loop_filter_time);
+#endif
- if (!no_restoration)
- av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 0);
+ if (use_restoration)
+ av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 0);
- if (no_cdef) {
- cm->cdef_info.cdef_bits = 0;
- cm->cdef_info.cdef_strengths[0] = 0;
- cm->cdef_info.nb_cdef_strengths = 1;
- cm->cdef_info.cdef_uv_strengths[0] = 0;
- } else {
+ if (use_cdef) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, cdef_time);
+#endif
// Find CDEF parameters
- av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd,
+ av1_cdef_search(&cm->cur_frame->buf, cpi->source, cm, xd,
cpi->sf.fast_cdef_search);
// Apply the filter
- av1_cdef_frame(cm->frame_to_show, cm, xd);
+ av1_cdef_frame(&cm->cur_frame->buf, cm, xd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, cdef_time);
+#endif
+ } else {
+ cm->cdef_info.cdef_bits = 0;
+ cm->cdef_info.cdef_strengths[0] = 0;
+ cm->cdef_info.nb_cdef_strengths = 1;
+ cm->cdef_info.cdef_uv_strengths[0] = 0;
}
superres_post_encode(cpi);
- if (no_restoration) {
- cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
- cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
- cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
- } else {
- av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 1);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, loop_restoration_time);
+#endif
+ if (use_restoration) {
+ av1_loop_restoration_save_boundary_lines(&cm->cur_frame->buf, cm, 1);
av1_pick_filter_restoration(cpi->source, cpi);
if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
if (cpi->num_workers > 1)
- av1_loop_restoration_filter_frame_mt(cm->frame_to_show, cm, 0,
+ av1_loop_restoration_filter_frame_mt(&cm->cur_frame->buf, cm, 0,
cpi->workers, cpi->num_workers,
&cpi->lr_row_sync, &cpi->lr_ctxt);
else
- av1_loop_restoration_filter_frame(cm->frame_to_show, cm, 0,
+ av1_loop_restoration_filter_frame(&cm->cur_frame->buf, cm, 0,
&cpi->lr_ctxt);
}
+ } else {
+ cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, loop_restoration_time);
+#endif
}
-static int encode_without_recode_loop(AV1_COMP *cpi) {
+static void fix_interp_filter(InterpFilter *const interp_filter,
+ const FRAME_COUNTS *const counts) {
+ if (*interp_filter == SWITCHABLE) {
+ // Check to see if only one of the filters is actually used
+ int count[SWITCHABLE_FILTERS] = { 0 };
+ int num_filters_used = 0;
+ for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ for (int j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ count[i] += counts->switchable_interp[j][i];
+ num_filters_used += (count[i] > 0);
+ }
+ if (num_filters_used == 1) {
+ // Only one filter is used. So set the filter at frame level
+ for (int i = 0; i < SWITCHABLE_FILTERS; ++i) {
+ if (count[i]) {
+ if (i == EIGHTTAP_REGULAR) *interp_filter = i;
+ break;
+ }
+ }
+ }
+ }
+}
+
+static void finalize_encoded_frame(AV1_COMP *const cpi) {
AV1_COMMON *const cm = &cpi->common;
- int q = 0, bottom_index = 0, top_index = 0; // Dummy variables.
+ CurrentFrame *const current_frame = &cm->current_frame;
- aom_clear_system_state();
+ if (!cm->seq_params.reduced_still_picture_hdr &&
+ encode_show_existing_frame(cm)) {
+ RefCntBuffer *const frame_to_show =
+ cm->ref_frame_map[cpi->existing_fb_idx_to_show];
- set_size_independent_vars(cpi);
+ if (frame_to_show == NULL) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer does not contain a reconstructed frame");
+ }
+ assert(frame_to_show->ref_count > 0);
+ assign_frame_buffer_p(&cm->cur_frame, frame_to_show);
+ }
- setup_frame_size(cpi);
+ if (!encode_show_existing_frame(cm) &&
+ cm->seq_params.film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame)) {
+    // Copy the current frame's film grain params to its corresponding
+ // RefCntBuffer slot.
+ cm->cur_frame->film_grain_params = cm->film_grain_params;
- assert(cm->width == cpi->scaled_source.y_crop_width);
- assert(cm->height == cpi->scaled_source.y_crop_height);
+ // We must update the parameters if this is not an INTER_FRAME
+ if (current_frame->frame_type != INTER_FRAME)
+ cm->cur_frame->film_grain_params.update_parameters = 1;
- set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+ // Iterate the random seed for the next frame.
+ cm->film_grain_params.random_seed += 3381;
+ if (cm->film_grain_params.random_seed == 0)
+ cm->film_grain_params.random_seed = 7391;
+ }
- cpi->source =
- av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
- if (cpi->unscaled_last_source != NULL)
- cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
- &cpi->scaled_last_source);
- cpi->source->buf_8bit_valid = 0;
- if (frame_is_intra_only(cm) == 0) {
- scale_references(cpi);
+ // Initialise all tiles' contexts from the global frame context
+ for (int tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+ for (int tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+ const int tile_idx = tile_row * cm->tile_cols + tile_col;
+ cpi->tile_data[tile_idx].tctx = *cm->fc;
+ }
}
- av1_set_quantizer(cm, q);
- setup_frame(cpi);
- suppress_active_map(cpi);
+ fix_interp_filter(&cm->interp_filter, cpi->td.counts);
+}
+
+static int get_regulated_q_overshoot(AV1_COMP *const cpi, int q_low, int q_high,
+ int top_index, int bottom_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+ int q_regulated =
+ av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index), cm->width, cm->height);
- // Variance adaptive and in frame q adjustment experiments are mutually
- // exclusive.
- if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
- av1_vaq_frame_setup(cpi);
- } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
- av1_setup_in_frame_q_adj(cpi);
- } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
- av1_cyclic_refresh_setup(cpi);
+ int retries = 0;
+ while (q_regulated < q_low && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ q_regulated =
+ av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index), cm->width, cm->height);
+ retries++;
}
- apply_active_map(cpi);
- if (cm->seg.enabled) {
- if (!cm->seg.update_data && cm->prev_frame) {
- segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ return q_regulated;
+}
+
+static int get_regulated_q_undershoot(AV1_COMP *const cpi, int q_high,
+ int top_index, int bottom_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ int q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index, cm->width, cm->height);
+
+ int retries = 0;
+ while (q_regulated > q_high && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ q_regulated = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index, cm->width, cm->height);
+ retries++;
+ }
+ return q_regulated;
+}
+
+// Called after encode_with_recode_loop() has just encoded a frame and packed
+// its bitstream. This function works out whether we under- or over-shot
+// our bitrate target and adjusts q as appropriate. Also decides whether
+// or not we should do another recode loop, indicated by *loop
+static void recode_loop_update_q(AV1_COMP *const cpi, int *const loop,
+ int *const q, int *const q_low,
+ int *const q_high, const int top_index,
+ const int bottom_index,
+ int *const undershoot_seen,
+ int *const overshoot_seen,
+ const int loop_at_this_size) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ int frame_over_shoot_limit = 0, frame_under_shoot_limit = 0;
+ av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
+ if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+
+ if ((cm->current_frame.frame_type == KEY_FRAME) &&
+ rc->this_key_frame_forced &&
+ (rc->projected_frame_size < rc->max_frame_bandwidth)) {
+ int last_q = *q;
+ int64_t kf_err;
+
+ int64_t high_err_target = cpi->ambient_err;
+ int64_t low_err_target = cpi->ambient_err >> 1;
+
+ if (cm->seq_params.use_highbitdepth) {
+ kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
} else {
- calculate_segdata(&cm->seg);
+ kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
+ }
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += !kf_err;
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if ((kf_err > high_err_target &&
+ rc->projected_frame_size <= frame_over_shoot_limit) ||
+ (kf_err > low_err_target &&
+ rc->projected_frame_size <= frame_under_shoot_limit)) {
+ // Lower q_high
+ *q_high = *q > *q_low ? *q - 1 : *q_low;
+
+ // Adjust Q
+ *q = (int)((*q * high_err_target) / kf_err);
+ *q = AOMMIN(*q, (*q_high + *q_low) >> 1);
+ } else if (kf_err < low_err_target &&
+ rc->projected_frame_size >= frame_under_shoot_limit) {
+ // The key frame is much better than the previous frame
+ // Raise q_low
+ *q_low = *q < *q_high ? *q + 1 : *q_high;
+
+ // Adjust Q
+ *q = (int)((*q * low_err_target) / kf_err);
+ *q = AOMMIN(*q, (*q_high + *q_low + 1) >> 1);
}
- } else {
- memset(&cm->seg, 0, sizeof(cm->seg));
- }
- segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
- // transform / motion compensation build reconstruction frame
- av1_encode_frame(cpi);
+ // Clamp Q to upper and lower limits:
+ *q = clamp(*q, *q_low, *q_high);
+
+ *loop = *q != last_q;
+ } else if (recode_loop_test(cpi, frame_over_shoot_limit,
+ frame_under_shoot_limit, *q,
+ AOMMAX(*q_high, top_index), bottom_index)) {
+ // Is the projected frame size out of range and are we allowed
+ // to attempt to recode.
+ int last_q = *q;
+
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+ // Frame is too large
+ if (rc->projected_frame_size > rc->this_frame_target) {
+ // Special case if the projected size is > the max allowed.
+ if (rc->projected_frame_size >= rc->max_frame_bandwidth)
+ *q_high = rc->worst_quality;
+
+      // Raise Qlow to at least the current value
+ *q_low = *q < *q_high ? *q + 1 : *q_high;
+
+ if (*undershoot_seen || loop_at_this_size > 2 ||
+ (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+ *q = (*q_high + *q_low + 1) / 2;
+ } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
+ const int q_mid = (*q_high + *q_low + 1) / 2;
+ const int q_regulated = get_regulated_q_overshoot(
+ cpi, *q_low, *q_high, top_index, bottom_index);
+ // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+ // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
+ *q = (q_mid + q_regulated + 1) / 2;
+ } else {
+ *q = get_regulated_q_overshoot(cpi, *q_low, *q_high, top_index,
+ bottom_index);
+ }
- // Update some stats from cyclic refresh, and check if we should not update
- // golden reference, for 1 pass CBR.
- if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
- cm->current_frame.frame_type != KEY_FRAME &&
- (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR))
- av1_cyclic_refresh_check_golden_update(cpi);
+ *overshoot_seen = 1;
+ } else {
+ // Frame is too small
+ *q_high = *q > *q_low ? *q - 1 : *q_low;
+
+ if (*overshoot_seen || loop_at_this_size > 2 ||
+ (loop_at_this_size == 2 && !frame_is_intra_only(cm))) {
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ *q = (*q_high + *q_low) / 2;
+ } else if (loop_at_this_size == 2 && frame_is_intra_only(cm)) {
+ const int q_mid = (*q_high + *q_low) / 2;
+ const int q_regulated =
+ get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+ // Get 'q' in-between 'q_mid' and 'q_regulated' for a smooth
+ // transition between loop_at_this_size < 2 and loop_at_this_size > 2.
+ *q = (q_mid + q_regulated) / 2;
+
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+        // the user passed-in value.
+ if (cpi->oxcf.rc_mode == AOM_CQ && q_regulated < *q_low) {
+ *q_low = *q;
+ }
+ } else {
+ *q = get_regulated_q_undershoot(cpi, *q_high, top_index, bottom_index);
+
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+        // the user passed-in value.
+ if (cpi->oxcf.rc_mode == AOM_CQ && *q < *q_low) {
+ *q_low = *q;
+ }
+ }
- // Update the skip mb flag probabilities based on the distribution
- // seen in the last encoder iteration.
- // update_base_skip_probs(cpi);
- aom_clear_system_state();
- return AOM_CODEC_OK;
+ *undershoot_seen = 1;
+ }
+
+ // Clamp Q to upper and lower limits:
+ *q = clamp(*q, *q_low, *q_high);
+
+ *loop = (*q != last_q);
+ } else {
+ *loop = 0;
+ }
}
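// A small numeric illustration of the forced-key-frame branch above, with
// assumed values: let ambient_err = 40000, so high_err_target = 40000 and
// low_err_target = 20000. If the reconstruction error of the key frame is
// kf_err = 80000 and the frame fits the overshoot limit, q_high drops to
// q - 1 and q is rescaled as q = (q * 40000) / 80000, halving it before the
// clamp to [q_low, (q_high + q_low) >> 1]; i.e. more bits are spent until
// the key frame's error approaches the ambient level.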
static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
AV1_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
- int bottom_index, top_index;
- int loop_count = 0;
- int loop_at_this_size = 0;
- int loop = 0;
- int overshoot_seen = 0;
- int undershoot_seen = 0;
- int frame_over_shoot_limit;
- int frame_under_shoot_limit;
- int q = 0, q_low = 0, q_high = 0;
+ const int allow_recode = cpi->sf.recode_loop != DISALLOW_RECODE;
set_size_independent_vars(cpi);
cpi->source->buf_8bit_valid = 0;
- aom_clear_system_state();
+ av1_setup_frame_size(cpi);
- setup_frame_size(cpi);
+ int top_index = 0, bottom_index = 0;
+ int q = 0, q_low = 0, q_high = 0;
set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+ q_low = bottom_index;
+ q_high = top_index;
+ // Loop variables
+ int loop_count = 0;
+ int loop_at_this_size = 0;
+ int loop = 0;
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ printf("\n Encoding a frame:");
+#endif
do {
aom_clear_system_state();
- if (loop_count == 0) {
- // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
- set_mv_search_params(cpi);
-
- // Reset the loop state for new frame size.
- overshoot_seen = 0;
- undershoot_seen = 0;
-
- q_low = bottom_index;
- q_high = top_index;
-
- loop_at_this_size = 0;
-
- // Decide frame size bounds first time through.
- av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
- &frame_under_shoot_limit,
- &frame_over_shoot_limit);
- }
-
    // If the frame was scaled, redo the global motion search if it was
    // already done.
- if (loop_count > 0 && cpi->source && cpi->global_motion_search_done)
+ if (loop_count > 0 && cpi->source && cpi->global_motion_search_done) {
if (cpi->source->y_crop_width != cm->width ||
- cpi->source->y_crop_height != cm->height)
+ cpi->source->y_crop_height != cm->height) {
cpi->global_motion_search_done = 0;
+ }
+ }
cpi->source =
av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
- if (cpi->unscaled_last_source != NULL)
+ if (cpi->unscaled_last_source != NULL) {
cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
&cpi->scaled_last_source);
+ }
- if (frame_is_intra_only(cm) == 0) {
+ if (!frame_is_intra_only(cm)) {
if (loop_count > 0) {
release_scaled_references(cpi);
}
scale_references(cpi);
}
av1_set_quantizer(cm, q);
+ av1_init_quantizer(cpi);
+
+ av1_set_variance_partition_thresholds(cpi, q, 0);
+
// printf("Frame %d/%d: q = %d, frame_type = %d superres_denom = %d\n",
// cm->current_frame.frame_number, cm->show_frame, q,
// cm->current_frame.frame_type, cm->superres_scale_denominator);
- if (loop_count == 0) setup_frame(cpi);
-
- // Base q-index may have changed, so we need to assign proper default coef
- // probs before every iteration.
- if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
- cm->current_frame.frame_refs[cm->primary_ref_frame].buf == NULL) {
+ if (loop_count == 0) {
+ setup_frame(cpi);
+ } else if (get_primary_ref_frame_buf(cm) == NULL) {
+ // Base q-index may have changed, so we need to assign proper default coef
+ // probs before every iteration.
av1_default_coef_probs(cm);
av1_setup_frame_contexts(cm);
}
- // Variance adaptive and in frame q adjustment experiments are mutually
- // exclusive.
if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
av1_vaq_frame_setup(cpi);
} else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
av1_setup_in_frame_q_adj(cpi);
+ } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && !allow_recode) {
+ suppress_active_map(cpi);
+ av1_cyclic_refresh_setup(cpi);
+ apply_active_map(cpi);
}
+
if (cm->seg.enabled) {
if (!cm->seg.update_data && cm->prev_frame) {
segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
@@ -4640,13 +4458,15 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
}
segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+ if (allow_recode) save_coding_context(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_encode_frame_time);
+#endif
// transform / motion compensation build reconstruction frame
- save_coding_context(cpi);
av1_encode_frame(cpi);
-
- // Update the skip mb flag probabilities based on the distribution
- // seen in the last encoder iteration.
- // update_base_skip_probs(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_encode_frame_time);
+#endif
aom_clear_system_state();
@@ -4656,141 +4476,20 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
restore_coding_context(cpi);
- if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+ finalize_encoded_frame(cpi);
+ int largest_tile_id = 0; // Output from bitstream: unused here
+ if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
return AOM_CODEC_ERROR;
rc->projected_frame_size = (int)(*size) << 3;
restore_coding_context(cpi);
-
- if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
}
- if (cpi->oxcf.rc_mode == AOM_Q) {
- loop = 0;
- } else {
- if ((cm->current_frame.frame_type == KEY_FRAME) &&
- rc->this_key_frame_forced &&
- (rc->projected_frame_size < rc->max_frame_bandwidth)) {
- int last_q = q;
- int64_t kf_err;
-
- int64_t high_err_target = cpi->ambient_err;
- int64_t low_err_target = cpi->ambient_err >> 1;
-
- if (cm->seq_params.use_highbitdepth) {
- kf_err = aom_highbd_get_y_sse(cpi->source, &cm->cur_frame->buf);
- } else {
- kf_err = aom_get_y_sse(cpi->source, &cm->cur_frame->buf);
- }
- // Prevent possible divide by zero error below for perfect KF
- kf_err += !kf_err;
-
- // The key frame is not good enough or we can afford
- // to make it better without undue risk of popping.
- if ((kf_err > high_err_target &&
- rc->projected_frame_size <= frame_over_shoot_limit) ||
- (kf_err > low_err_target &&
- rc->projected_frame_size <= frame_under_shoot_limit)) {
- // Lower q_high
- q_high = q > q_low ? q - 1 : q_low;
-
- // Adjust Q
- q = (int)((q * high_err_target) / kf_err);
- q = AOMMIN(q, (q_high + q_low) >> 1);
- } else if (kf_err < low_err_target &&
- rc->projected_frame_size >= frame_under_shoot_limit) {
- // The key frame is much better than the previous frame
- // Raise q_low
- q_low = q < q_high ? q + 1 : q_high;
-
- // Adjust Q
- q = (int)((q * low_err_target) / kf_err);
- q = AOMMIN(q, (q_high + q_low + 1) >> 1);
- }
-
- // Clamp Q to upper and lower limits:
- q = clamp(q, q_low, q_high);
-
- loop = q != last_q;
- } else if (recode_loop_test(cpi, frame_over_shoot_limit,
- frame_under_shoot_limit, q,
- AOMMAX(q_high, top_index), bottom_index)) {
- // Is the projected frame size out of range and are we allowed
- // to attempt to recode.
- int last_q = q;
- int retries = 0;
-
- // Frame size out of permitted range:
- // Update correction factor & compute new Q to try...
- // Frame is too large
- if (rc->projected_frame_size > rc->this_frame_target) {
- // Special case if the projected size is > the max allowed.
- if (rc->projected_frame_size >= rc->max_frame_bandwidth)
- q_high = rc->worst_quality;
-
- // Raise q_low to at least the current value
- q_low = q < q_high ? q + 1 : q_high;
-
- if (undershoot_seen || loop_at_this_size > 1) {
- // Update rate_correction_factor.
- av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-
- q = (q_high + q_low + 1) / 2;
- } else {
- // Update rate_correction_factor.
- av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
-
- q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
- AOMMAX(q_high, top_index), cm->width,
- cm->height);
-
- while (q < q_low && retries < 10) {
- av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
- q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
- AOMMAX(q_high, top_index), cm->width,
- cm->height);
- retries++;
- }
- }
-
- overshoot_seen = 1;
- } else {
- // Frame is too small
- q_high = q > q_low ? q - 1 : q_low;
-
- if (overshoot_seen || loop_at_this_size > 1) {
- av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
- q = (q_high + q_low) / 2;
- } else {
- av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
- q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
- top_index, cm->width, cm->height);
- // Special case reset for qlow for constrained quality.
- // This should only trigger where there is very substantial
- // undershoot on a frame and the auto cq level is above
- // the user passed in value.
- if (cpi->oxcf.rc_mode == AOM_CQ && q < q_low) {
- q_low = q;
- }
-
- while (q > q_high && retries < 10) {
- av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
- q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
- top_index, cm->width, cm->height);
- retries++;
- }
- }
-
- undershoot_seen = 1;
- }
-
- // Clamp Q to upper and lower limits:
- q = clamp(q, q_low, q_high);
-
- loop = (q != last_q);
- } else {
- loop = 0;
- }
+ if (allow_recode && cpi->oxcf.rc_mode != AOM_Q) {
+ // Update q and decide whether to do a recode loop
+ recode_loop_update_q(cpi, &loop, &q, &q_low, &q_high, top_index,
+ bottom_index, &undershoot_seen, &overshoot_seen,
+ loop_at_this_size);
}
// Special case for overlay frame.
@@ -4798,8 +4497,9 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
rc->projected_frame_size < rc->max_frame_bandwidth)
loop = 0;
- if (!cpi->sf.gm_disable_recode) {
- if (recode_loop_test_global_motion(cpi)) loop = 1;
+ if (allow_recode && !cpi->sf.gm_disable_recode &&
+ recode_loop_test_global_motion(cpi)) {
+ loop = 1;
}
if (loop) {
@@ -4810,127 +4510,14 @@ static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
++cpi->tot_recode_hits;
#endif
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ if (loop) printf("\n Recoding:");
+#endif
} while (loop);
return AOM_CODEC_OK;
}
-static int get_ref_frame_flags(const AV1_COMP *cpi) {
- const int *const map = cpi->common.ref_frame_map;
-
- // No.1 Priority: LAST_FRAME
- const int last2_is_last =
- map[cpi->remapped_ref_idx[1]] == map[cpi->remapped_ref_idx[0]];
- const int last3_is_last =
- map[cpi->remapped_ref_idx[2]] == map[cpi->remapped_ref_idx[0]];
- const int gld_is_last = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] ==
- map[cpi->remapped_ref_idx[0]];
- const int bwd_is_last = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] ==
- map[cpi->remapped_ref_idx[0]];
- const int alt2_is_last = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
- map[cpi->remapped_ref_idx[0]];
- const int alt_is_last = map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)] ==
- map[cpi->remapped_ref_idx[0]];
-
- // No.2 Priority: ALTREF_FRAME
- const int last2_is_alt = map[cpi->remapped_ref_idx[1]] ==
- map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)];
- const int last3_is_alt = map[cpi->remapped_ref_idx[2]] ==
- map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)];
- const int gld_is_alt = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] ==
- map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)];
- const int bwd_is_alt = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] ==
- map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)];
- const int alt2_is_alt = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
- map[get_ref_frame_map_idx(cpi, ALTREF_FRAME)];
-
- // No.3 Priority: LAST2_FRAME
- const int last3_is_last2 =
- map[cpi->remapped_ref_idx[2]] == map[cpi->remapped_ref_idx[1]];
- const int gld_is_last2 = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] ==
- map[cpi->remapped_ref_idx[1]];
- const int bwd_is_last2 = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] ==
- map[cpi->remapped_ref_idx[1]];
- const int alt2_is_last2 = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
- map[cpi->remapped_ref_idx[1]];
-
- // No.4 Priority: LAST3_FRAME
- const int gld_is_last3 = map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)] ==
- map[cpi->remapped_ref_idx[2]];
- const int bwd_is_last3 = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] ==
- map[cpi->remapped_ref_idx[2]];
- const int alt2_is_last3 = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
- map[cpi->remapped_ref_idx[2]];
-
- // No.5 Priority: GOLDEN_FRAME
- const int bwd_is_gld = map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)] ==
- map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)];
- const int alt2_is_gld = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
- map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)];
-
- // No.6 Priority: BWDREF_FRAME
- const int alt2_is_bwd = map[get_ref_frame_map_idx(cpi, ALTREF2_FRAME)] ==
- map[get_ref_frame_map_idx(cpi, BWDREF_FRAME)];
-
- // No.7 Priority: ALTREF2_FRAME
-
- // After av1_apply_encoding_flags() is called, cpi->ref_frame_flags might be
- // adjusted according to external encoder flags.
- int flags = cpi->ext_ref_frame_flags;
-
- if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG;
-
- if (alt_is_last) flags &= ~AOM_ALT_FLAG;
-
- if (last2_is_last || last2_is_alt) flags &= ~AOM_LAST2_FLAG;
-
- if (last3_is_last || last3_is_alt || last3_is_last2) flags &= ~AOM_LAST3_FLAG;
-
- if (gld_is_last || gld_is_alt || gld_is_last2 || gld_is_last3)
- flags &= ~AOM_GOLD_FLAG;
-
- if ((bwd_is_last || bwd_is_alt || bwd_is_last2 || bwd_is_last3 ||
- bwd_is_gld) &&
- (flags & AOM_BWD_FLAG))
- flags &= ~AOM_BWD_FLAG;
-
- if ((alt2_is_last || alt2_is_alt || alt2_is_last2 || alt2_is_last3 ||
- alt2_is_gld || alt2_is_bwd) &&
- (flags & AOM_ALT2_FLAG))
- flags &= ~AOM_ALT2_FLAG;
-
- return flags;
-}
-
-static void set_ext_overrides(AV1_COMP *cpi) {
- // Overrides the defaults with the externally supplied values with
- // av1_update_reference() and av1_update_entropy() calls
- // Note: The overrides are valid only for the next frame passed
- // to encode_frame_to_data_rate() function
- if (cpi->ext_use_s_frame) cpi->common.current_frame.frame_type = S_FRAME;
- cpi->common.force_primary_ref_none = cpi->ext_use_primary_ref_none;
-
- if (cpi->ext_refresh_frame_context_pending) {
- cpi->common.refresh_frame_context = cpi->ext_refresh_frame_context;
- cpi->ext_refresh_frame_context_pending = 0;
- }
- if (cpi->ext_refresh_frame_flags_pending) {
- cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
- cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
- cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
- cpi->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame;
- cpi->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame;
- cpi->ext_refresh_frame_flags_pending = 0;
- }
- cpi->common.allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs;
- // A keyframe is already error resilient and keyframes with
- // error_resilient_mode interfere with the use of show_existing_frame
- // when forward reference keyframes are enabled.
- cpi->common.error_resilient_mode =
- cpi->ext_use_error_resilient &&
- cpi->common.current_frame.frame_type != KEY_FRAME;
-}
-
#define DUMP_RECON_FRAMES 0
#if DUMP_RECON_FRAMES == 1
@@ -4938,7 +4525,7 @@ static void set_ext_overrides(AV1_COMP *cpi) {
static void dump_filtered_recon_frames(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
const CurrentFrame *const current_frame = &cm->current_frame;
- const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show;
+ const YV12_BUFFER_CONFIG *recon_buf = &cm->cur_frame->buf;
if (recon_buf == NULL) {
printf("Frame %d is not ready.\n", current_frame->frame_number);
@@ -4960,12 +4547,10 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
current_frame->frame_number, current_frame->order_hint, cm->show_frame,
cm->show_existing_frame);
for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- RefBuffer *buf = &cm->current_frame.frame_refs[ref_frame - LAST_FRAME];
- const int ref_offset = (buf->buf) ? (int)buf->buf->order_hint : -1;
- printf(" %d(%c-%d-%4.2f)", ref_offset,
- (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N',
- (buf->buf) ? (int)buf->buf->frame_rf_level : -1,
- (buf->buf) ? rate_factor_deltas[buf->buf->frame_rf_level] : -1);
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ const int ref_offset = buf != NULL ? (int)buf->order_hint : -1;
+ printf(" %d(%c)", ref_offset,
+ (cpi->ref_frame_flags & flag_list[ref_frame]) ? 'Y' : 'N');
}
printf(" ]\n");
@@ -4993,25 +4578,18 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
printf(
"\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
"show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
- "refresh_alt_ref_frame=%d, rf_level=%d, "
+ "refresh_alt_ref_frame=%d, "
"y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
current_frame->frame_number, cpi->twopass.gf_group.index,
cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
current_frame->order_hint, cm->show_frame, cm->show_existing_frame,
cpi->rc.source_alt_ref_active, cpi->refresh_alt_ref_frame,
- cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index],
recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
#if 0
int ref_frame;
printf("get_ref_frame_map_idx: [");
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
- printf(" %d", get_ref_frame_map_idx(cpi, ref_frame));
- printf(" ]\n");
- printf("cm->new_fb_idx = %d\n", cm->new_fb_idx);
- printf("cm->ref_frame_map = [");
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- printf(" %d", cm->ref_frame_map[ref_frame - LAST_FRAME]);
- }
+ printf(" %d", get_ref_frame_map_idx(cm, ref_frame));
printf(" ]\n");
#endif // 0
@@ -5035,31 +4613,209 @@ static void dump_filtered_recon_frames(AV1_COMP *cpi) {
}
#endif // DUMP_RECON_FRAMES
-static INLINE int is_frame_droppable(AV1_COMP *cpi) {
- return !(cpi->refresh_alt_ref_frame || cpi->refresh_alt2_ref_frame ||
- cpi->refresh_bwd_ref_frame || cpi->refresh_golden_frame ||
- cpi->refresh_last_frame);
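+// Returns the selection count recorded for `ifilter` in the buffer currently
+// serving as reference `ref`, or 0 if that reference slot is empty.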
+static int get_interp_filter_selected(const AV1_COMMON *const cm,
+ MV_REFERENCE_FRAME ref,
+ InterpFilters ifilter) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref);
+ if (buf == NULL) return 0;
+ return buf->interp_filter_selected[ifilter];
+}
+
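+// Builds a bitmask of interpolation filters that were picked so rarely on the
+// previous frame's references that the search can likely skip them.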
+static int setup_interp_filter_search_mask(AV1_COMP *cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+ int ref_total[REF_FRAMES] = { 0 };
+
+ if (cpi->common.last_frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame)
+ return 0;
+
+ for (MV_REFERENCE_FRAME ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+ for (InterpFilters ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+ ++ifilter) {
+ ref_total[ref] += get_interp_filter_selected(cm, ref, ifilter);
+ }
+ }
+ int ref_total_total = (ref_total[LAST2_FRAME] + ref_total[LAST3_FRAME] +
+ ref_total[GOLDEN_FRAME] + ref_total[BWDREF_FRAME] +
+ ref_total[ALTREF2_FRAME] + ref_total[ALTREF_FRAME]);
+
+ int mask = 0;
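+ // Heuristic: a filter is only a masking candidate if it was picked on
+ // LAST_FRAME less than roughly 1/30 of the time; it is then masked out when
+ // its weighted usage on the other references (20x for LAST2/LAST3/GOLDEN,
+ // 10x for BWDREF/ALTREF2/ALTREF) also stays below their combined total.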
+ for (InterpFilters ifilter = EIGHTTAP_REGULAR; ifilter <= MULTITAP_SHARP;
+ ++ifilter) {
+ int last_score = get_interp_filter_selected(cm, LAST_FRAME, ifilter) * 30;
+ if (ref_total[LAST_FRAME] && last_score <= ref_total[LAST_FRAME]) {
+ int filter_score =
+ get_interp_filter_selected(cm, LAST2_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, LAST3_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, GOLDEN_FRAME, ifilter) * 20 +
+ get_interp_filter_selected(cm, BWDREF_FRAME, ifilter) * 10 +
+ get_interp_filter_selected(cm, ALTREF2_FRAME, ifilter) * 10 +
+ get_interp_filter_selected(cm, ALTREF_FRAME, ifilter) * 10;
+ if (filter_score < ref_total_total) mask |= 1 << ifilter;
+ }
+ }
+ return mask;
+}
+
+static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
+ const YV12_BUFFER_CONFIG *last_picture,
+ hash_table *last_hash_table) {
+ aom_clear_system_state();
+ // Check whether hash-based motion estimation indicates integer-MV
+ // screen content.
+ int k;
+ uint32_t hash_value_1;
+ uint32_t hash_value_2;
+
+ const int block_size = 8;
+ const double threshold_current = 0.8;
+ const double threshold_average = 0.95;
+ const int max_history_size = 32;
+ int T = 0; // total number of blocks
+ int C = 0; // blocks matching the collocated block
+ int S = 0; // smooth blocks that do not match the collocated block
+ int M = 0; // blocks matching some other block
+
+ const int pic_width = cur_picture->y_width;
+ const int pic_height = cur_picture->y_height;
+ for (int i = 0; i + block_size <= pic_height; i += block_size) {
+ for (int j = 0; j + block_size <= pic_width; j += block_size) {
+ const int x_pos = j;
+ const int y_pos = i;
+ int match = 1;
+ T++;
+
+ // Check whether the collocated block matches the current block.
+ uint8_t *p_cur = cur_picture->y_buffer;
+ uint8_t *p_ref = last_picture->y_buffer;
+ int stride_cur = cur_picture->y_stride;
+ int stride_ref = last_picture->y_stride;
+ p_cur += (y_pos * stride_cur + x_pos);
+ p_ref += (y_pos * stride_ref + x_pos);
+
+ if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
+ uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p16_cur[tmpX] != p16_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p16_cur += stride_cur;
+ p16_ref += stride_ref;
+ }
+ } else {
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p_cur[tmpX] != p_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p_cur += stride_cur;
+ p_ref += stride_ref;
+ }
+ }
+
+ if (match) {
+ C++;
+ continue;
+ }
+
+ if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
+ y_pos) ||
+ av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
+ S++;
+ continue;
+ }
+
+ av1_get_block_hash_value(
+ cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
+ block_size, &hash_value_1, &hash_value_2,
+ (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb);
+ // Hashing does not work for highbitdepth currently.
+ // TODO(Roger): Make it work for highbitdepth.
+ if (av1_use_hash_me(&cpi->common)) {
+ if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) {
+ M++;
+ }
+ }
+ }
+ }
+
+ assert(T > 0);
+ double csm_rate = ((double)(C + S + M)) / ((double)(T));
+ double m_rate = ((double)(M)) / ((double)(T));
+
+ cpi->csm_rate_array[cpi->rate_index] = csm_rate;
+ cpi->m_rate_array[cpi->rate_index] = m_rate;
+
+ cpi->rate_index = (cpi->rate_index + 1) % max_history_size;
+ cpi->rate_size++;
+ cpi->rate_size = AOMMIN(cpi->rate_size, max_history_size);
+
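+ // Decision heuristics: require a high match rate on the current frame, then
+ // accept on a perfect collocated match, a sufficiently high historical
+ // average, or a large share of hash matches among the moving blocks.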
+ if (csm_rate < threshold_current) {
+ return 0;
+ }
+
+ if (C == T) {
+ return 1;
+ }
+
+ double csm_average = 0.0;
+ double m_average = 0.0;
+
+ for (k = 0; k < cpi->rate_size; k++) {
+ csm_average += cpi->csm_rate_array[k];
+ m_average += cpi->m_rate_array[k];
+ }
+ csm_average /= cpi->rate_size;
+ m_average /= cpi->rate_size;
+
+ if (csm_average < threshold_average) {
+ return 0;
+ }
+
+ if (M > (T - C - S) / 3) {
+ return 1;
+ }
+
+ if (csm_rate > 0.99 && m_rate > 0.01) {
+ return 1;
+ }
+
+ if (csm_average + m_average > 1.01) {
+ return 1;
+ }
+
+ return 0;
}
-static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
- int skip_adapt,
- unsigned int *frame_flags) {
+// Refresh reference frame buffers according to refresh_frame_flags.
+static void refresh_reference_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ // All buffers are refreshed for shown keyframes and S-frames.
+
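+ // For each slot flagged in refresh_frame_flags, assign_frame_buffer_p is
+ // expected to release the previously held buffer and take a new reference
+ // on cur_frame.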
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ref_frame++) {
+ if (((cm->current_frame.refresh_frame_flags >> ref_frame) & 1) == 1) {
+ assign_frame_buffer_p(&cm->ref_frame_map[ref_frame], cm->cur_frame);
+ }
+ }
+}
+
+static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size,
+ uint8_t *dest) {
AV1_COMMON *const cm = &cpi->common;
SequenceHeader *const seq_params = &cm->seq_params;
CurrentFrame *const current_frame = &cm->current_frame;
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
struct segmentation *const seg = &cm->seg;
- set_ext_overrides(cpi);
- aom_clear_system_state();
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_frame_to_data_rate_time);
+#endif
// frame type has been decided outside of this function call
- cm->cur_frame->intra_only = frame_is_intra_only(cm);
cm->cur_frame->frame_type = current_frame->frame_type;
- // S_FRAMEs are always error resilient
- cm->error_resilient_mode |= frame_is_sframe(cm);
-
cm->large_scale_tile = cpi->oxcf.large_scale_tile;
cm->single_tile_decoding = cpi->oxcf.single_tile_decoding;
@@ -5072,34 +4828,20 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
cm->allow_warped_motion =
cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm);
- // Reset the frame packet stamp index.
- if (current_frame->frame_type == KEY_FRAME && cm->show_frame)
- current_frame->frame_number = 0;
+ cm->last_frame_type = current_frame->frame_type;
+ if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search)
+ cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi);
- // NOTE:
- // (1) Move the setup of the ref_frame_flags upfront as it would be
- // determined by the current frame properties;
- // (2) The setup of the ref_frame_flags applies to both
- // show_existing_frame's
- // and the other cases.
- if (current_frame->frame_number > 0)
- cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+ cpi->two_pass_partition_search = cpi->sf.two_pass_partition_search &&
+ !cpi->partition_search_skippable_frame;
if (encode_show_existing_frame(cm)) {
- // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
- // BWDREF_FRAME in the reference frame buffer.
- if (current_frame->frame_type == KEY_FRAME) {
- cm->reset_decoder_state = 1;
- } else {
- current_frame->frame_type = INTER_FRAME;
- }
- cm->show_frame = 1;
- cpi->frame_flags = *frame_flags;
-
restore_coding_context(cpi);
+ finalize_encoded_frame(cpi);
// Build the bitstream
- if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+ int largest_tile_id = 0; // Output from bitstream: unused here
+ if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
return AOM_CODEC_ERROR;
if (seq_params->frame_id_numbers_present_flag &&
@@ -5112,40 +4854,16 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
cpi->seq_params_locked = 1;
- // Set up frame to show to get ready for stats collection.
- cm->frame_to_show = &cm->cur_frame->buf;
-
- // Update current frame offset.
- current_frame->order_hint = cm->cur_frame->order_hint;
-
#if DUMP_RECON_FRAMES == 1
// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
dump_filtered_recon_frames(cpi);
#endif // DUMP_RECON_FRAMES
- // Update the LAST_FRAME in the reference frame buffer.
- // NOTE:
- // (1) For BWDREF_FRAME as the show_existing_frame, the reference frame
- // update has been done previously when handling the LAST_BIPRED_FRAME
- // right before BWDREF_FRAME (in the display order);
- // (2) For INTNL_OVERLAY as the show_existing_frame, the reference frame
- // update will be done when the following is called, which will
- // exchange
- // the virtual indexes between LAST_FRAME and ALTREF2_FRAME, so that
- // LAST3 will get retired, LAST2 becomes LAST3, LAST becomes LAST2,
- // and
- // ALTREF2_FRAME will serve as the new LAST_FRAME.
- update_reference_frames(cpi);
-
- // Update frame flags
- cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
- cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
- cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
-
- *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
-
- // Update the frame type
- cm->last_frame_type = current_frame->frame_type;
+ // NOTE: Save the new show-frame buffer so that --test-code=warn can verify
+ // that there is no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
+
+ refresh_reference_frames(cpi);
// Since we allocate a spot for the OVERLAY frame in the gf group, we need
// to do post-encoding update accordingly.
@@ -5159,6 +4877,26 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
return AOM_CODEC_OK;
}
+ // Work out whether to use force_integer_mv for this frame.
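+ // A sequence-level force_integer_mv of 2 means "adaptive": apply the
+ // screen-content heuristic in is_integer_mv() against the previous source;
+ // any other value is applied to the frame directly.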
+ if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools &&
+ !frame_is_intra_only(cm)) {
+ if (cpi->common.seq_params.force_integer_mv == 2) {
+ // Adaptive mode: decide based on how the previous encoded frame behaved.
+ if (cpi->unscaled_last_source != NULL) {
+ cm->cur_frame_force_integer_mv =
+ is_integer_mv(cpi, cpi->source, cpi->unscaled_last_source,
+ cpi->previous_hash_table);
+ } else {
+ cpi->common.cur_frame_force_integer_mv = 0;
+ }
+ } else {
+ cpi->common.cur_frame_force_integer_mv =
+ cpi->common.seq_params.force_integer_mv;
+ }
+ } else {
+ cpi->common.cur_frame_force_integer_mv = 0;
+ }
+
// Set default state for segment based loop filter update flags.
cm->lf.mode_ref_delta_update = 0;
@@ -5190,6 +4928,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
current_frame->frame_type != KEY_FRAME) {
if (av1_rc_drop_frame(cpi)) {
av1_rc_postencode_update_drop_frame(cpi);
+ release_scaled_references(cpi);
return AOM_CODEC_OK;
}
}
@@ -5204,7 +4943,6 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
if (seq_params->frame_id_numbers_present_flag) {
/* Non-normative definition of current_frame_id ("frame counter" with
* wraparound) */
- const int frame_id_length = FRAME_ID_LENGTH;
if (cm->current_frame_id == -1) {
int lsb, msb;
/* quasi-random initialization of current_frame_id for a key frame */
@@ -5215,7 +4953,8 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
lsb = cpi->source->y_buffer[0] & 0xff;
msb = cpi->source->y_buffer[1] & 0xff;
}
- cm->current_frame_id = ((msb << 8) + lsb) % (1 << frame_id_length);
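+ // current_frame_id lives in a frame_id_length-bit space, so all updates
+ // are taken modulo 1 << frame_id_length.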
+ cm->current_frame_id =
+ ((msb << 8) + lsb) % (1 << seq_params->frame_id_length);
// S_frame is meant for stitching different streams of different
// resolutions together, so current_frame_id must be the
@@ -5225,8 +4964,8 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37;
} else {
cm->current_frame_id =
- (cm->current_frame_id + 1 + (1 << frame_id_length)) %
- (1 << frame_id_length);
+ (cm->current_frame_id + 1 + (1 << seq_params->frame_id_length)) %
+ (1 << seq_params->frame_id_length);
}
}
@@ -5249,15 +4988,14 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
}
cm->timing_info_present &= !seq_params->reduced_still_picture_hdr;
- if (cpi->sf.recode_loop == DISALLOW_RECODE) {
- if (encode_without_recode_loop(cpi) != AOM_CODEC_OK) return AOM_CODEC_ERROR;
- } else {
- if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK)
- return AOM_CODEC_ERROR;
- }
-
- cm->last_tile_cols = cm->tile_cols;
- cm->last_tile_rows = cm->tile_rows;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, encode_with_recode_loop_time);
+#endif
+ if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_with_recode_loop_time);
+#endif
#ifdef OUTPUT_YUV_SKINMAP
if (cpi->common.current_frame.frame_number > 1) {
@@ -5276,23 +5014,16 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
}
}
- // If the encoder forced a KEY_FRAME decision or if frame is an S_FRAME
- if ((current_frame->frame_type == KEY_FRAME && cm->show_frame) ||
- frame_is_sframe(cm)) {
- cpi->refresh_last_frame = 1;
- }
-
- cm->frame_to_show = &cm->cur_frame->buf;
- cm->frame_to_show->color_primaries = seq_params->color_primaries;
- cm->frame_to_show->transfer_characteristics =
+ cm->cur_frame->buf.color_primaries = seq_params->color_primaries;
+ cm->cur_frame->buf.transfer_characteristics =
seq_params->transfer_characteristics;
- cm->frame_to_show->matrix_coefficients = seq_params->matrix_coefficients;
- cm->frame_to_show->monochrome = seq_params->monochrome;
- cm->frame_to_show->chroma_sample_position =
+ cm->cur_frame->buf.matrix_coefficients = seq_params->matrix_coefficients;
+ cm->cur_frame->buf.monochrome = seq_params->monochrome;
+ cm->cur_frame->buf.chroma_sample_position =
seq_params->chroma_sample_position;
- cm->frame_to_show->color_range = seq_params->color_range;
- cm->frame_to_show->render_width = cm->render_width;
- cm->frame_to_show->render_height = cm->render_height;
+ cm->cur_frame->buf.color_range = seq_params->color_range;
+ cm->cur_frame->buf.render_width = cm->render_width;
+ cm->cur_frame->buf.render_height = cm->render_height;
// TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
// off.
@@ -5313,26 +5044,31 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
}
// TODO(debargha): Fix mv search range on encoder side
- // aom_extend_frame_inner_borders(cm->frame_to_show, av1_num_planes(cm));
- aom_extend_frame_borders(cm->frame_to_show, av1_num_planes(cm));
+ // aom_extend_frame_inner_borders(&cm->cur_frame->buf, av1_num_planes(cm));
+ aom_extend_frame_borders(&cm->cur_frame->buf, av1_num_planes(cm));
#ifdef OUTPUT_YUV_REC
- aom_write_one_yuv_frame(cm, cm->frame_to_show);
+ aom_write_one_yuv_frame(cm, &cm->cur_frame->buf);
#endif
+ finalize_encoded_frame(cpi);
// Build the bitstream
- if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+ int largest_tile_id = 0; // Output from pack_bitstream
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, av1_pack_bitstream_final_time);
+#endif
+ if (av1_pack_bitstream(cpi, dest, size, &largest_tile_id) != AOM_CODEC_OK)
return AOM_CODEC_ERROR;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, av1_pack_bitstream_final_time);
+#endif
cpi->seq_params_locked = 1;
- if (skip_adapt) return AOM_CODEC_OK;
-
+ // Update the reference frame ids of the slots this frame will overwrite
if (seq_params->frame_id_numbers_present_flag) {
- int i;
- // Update reference frame id values based on the value of refresh_frame_mask
- for (i = 0; i < REF_FRAMES; i++) {
- if ((cpi->refresh_frame_mask >> i) & 1) {
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if ((current_frame->refresh_frame_flags >> i) & 1) {
cm->ref_frame_id[i] = cm->current_frame_id;
}
}
@@ -5347,7 +5083,7 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
if (cm->seg.update_map) {
update_reference_segmentation_map(cpi);
} else if (cm->last_frame_seg_map) {
- memcpy(cm->current_frame_seg_map, cm->last_frame_seg_map,
+ memcpy(cm->cur_frame->seg_map, cm->last_frame_seg_map,
cm->mi_cols * cm->mi_rows * sizeof(uint8_t));
}
}
@@ -5356,41 +5092,60 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
release_scaled_references(cpi);
}
- update_reference_frames(cpi);
+ // NOTE: Save the new show-frame buffer so that --test-code=warn can verify
+ // that there is no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf = cm->cur_frame;
+
+ refresh_reference_frames(cpi);
#if CONFIG_ENTROPY_STATS
av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts);
#endif // CONFIG_ENTROPY_STATS
if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
- *cm->fc = cpi->tile_data[cm->largest_tile_id].tctx;
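+ // For backward context updates the decoder adopts the entropy state of one
+ // signaled tile (here the largest); copy it so the saved context matches
+ // what the decoder will use.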
+ *cm->fc = cpi->tile_data[largest_tile_id].tctx;
av1_reset_cdf_symbol_counters(cm->fc);
}
+ if (!cm->large_scale_tile) {
+ cm->cur_frame->frame_context = *cm->fc;
+ }
+#define EXT_TILE_DEBUG 0
+#if EXT_TILE_DEBUG
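+ // Debug aid: dump the final frame contexts to ./fcNNN, one file per frame.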
+ if (cm->large_scale_tile && oxcf->pass == 2) {
+ char fn[20] = "./fc";
+ fn[4] = current_frame->frame_number / 100 + '0';
+ fn[5] = (current_frame->frame_number % 100) / 10 + '0';
+ fn[6] = (current_frame->frame_number % 10) + '0';
+ fn[7] = '\0';
+ av1_print_frame_contexts(cm->fc, fn);
+ }
+#endif // EXT_TILE_DEBUG
+#undef EXT_TILE_DEBUG
- if (cpi->refresh_golden_frame == 1)
- cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
- else
- cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
-
- if (cpi->refresh_alt_ref_frame == 1)
- cpi->frame_flags |= FRAMEFLAGS_ALTREF;
- else
- cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, encode_frame_to_data_rate_time);
- if (cpi->refresh_bwd_ref_frame == 1)
- cpi->frame_flags |= FRAMEFLAGS_BWDREF;
- else
- cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+ // Print out timing information.
+ int i;
+ fprintf(stderr, "\n Frame number: %d, Frame type: %s, Show Frame: %d\n",
+ cm->current_frame.frame_number,
+ get_frame_type_enum(cm->current_frame.frame_type), cm->show_frame);
+ for (i = 0; i < kTimingComponents; i++) {
+ cpi->component_time[i] += cpi->frame_component_time[i];
+ fprintf(stderr, " %s: %" PRId64 " us (total: %" PRId64 " us)\n",
+ get_component_name(i), cpi->frame_component_time[i],
+ cpi->component_time[i]);
+ cpi->frame_component_time[i] = 0;
+ }
+#endif
cm->last_frame_type = current_frame->frame_type;
av1_rc_postencode_update(cpi, *size);
- if (current_frame->frame_type == KEY_FRAME) {
- // Tell the caller that the frame was coded as a key frame
- *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
- } else {
- *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+ // Store encoded frame's hash table for is_integer_mv() next time
+ if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
+ cpi->previous_hash_table = &cm->cur_frame->hash_table;
}
// Clear the one shot update flags for segmentation map and mode/ref loop
@@ -5414,114 +5169,62 @@ static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
++current_frame->frame_number;
}
- // NOTE: Shall not refer to any frame not used as reference.
- if (cm->is_reference_frame) {
- // keep track of the last coded dimensions
- cm->last_width = cm->width;
- cm->last_height = cm->height;
- }
-
return AOM_CODEC_OK;
}
-static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
- // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
- // differently here for rc->avg_frame_bandwidth.
- if (cpi->common.show_frame || cpi->rc.is_bwd_ref_frame) {
- if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
- cpi->common.current_frame.frame_type == KEY_FRAME) {
- // If this is a show_existing_frame with a source other than altref,
- // or if it is not a displayed forward keyframe, the keyframe update
- // counters were incremented when it was originally encoded.
- cpi->rc.frames_since_key++;
- cpi->rc.frames_to_key--;
- }
- }
-}
-
-static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
- // TODO(weitinglin): Updating this counter for is_frame_droppable
- // is a work-around to handle the condition when a frame is dropped.
- // We should fix the cpi->common.show_frame flag
- // instead of checking the other condition to update the counter properly.
- if (cpi->common.show_frame || is_frame_droppable(cpi)) {
- // Decrement count down till next gf
- if (cpi->rc.frames_till_gf_update_due > 0)
- cpi->rc.frames_till_gf_update_due--;
- }
-}
-
-static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) {
- // Increment the gf group index ready for the next frame. If this is
- // a show_existing_frame with a source other than altref, or if it is not
- // a displayed forward keyframe, the index was incremented when it was
- // originally encoded.
- if (!cpi->common.show_existing_frame || cpi->rc.is_src_frame_alt_ref ||
- cpi->common.current_frame.frame_type == KEY_FRAME) {
- ++cpi->twopass.gf_group.index;
- }
-}
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+ const EncodeFrameInput *const frame_input,
+ const EncodeFrameParams *const frame_params,
+ EncodeFrameResults *const frame_results) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
-static void update_rc_counts(AV1_COMP *cpi) {
- update_keyframe_counters(cpi);
- update_frames_till_gf_update(cpi);
- if (cpi->oxcf.pass == 2) update_twopass_gf_group_index(cpi);
-}
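+ // Copy the per-frame encode decisions made by the caller into the encoder
+ // and common state before encoding starts.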
+ cpi->unscaled_source = frame_input->source;
+ cpi->source = frame_input->source;
+ cpi->unscaled_last_source = frame_input->last_source;
+
+ current_frame->refresh_frame_flags = frame_params->refresh_frame_flags;
+ cm->error_resilient_mode = frame_params->error_resilient_mode;
+ cm->primary_ref_frame = frame_params->primary_ref_frame;
+ cm->current_frame.frame_type = frame_params->frame_type;
+ cm->show_frame = frame_params->show_frame;
+ cpi->ref_frame_flags = frame_params->ref_frame_flags;
+ cpi->speed = frame_params->speed;
+ cm->show_existing_frame = frame_params->show_existing_frame;
+ cpi->existing_fb_idx_to_show = frame_params->existing_fb_idx_to_show;
+
+ memcpy(cm->remapped_ref_idx, frame_params->remapped_ref_idx,
+ REF_FRAMES * sizeof(*cm->remapped_ref_idx));
+
+ cpi->refresh_last_frame = frame_params->refresh_last_frame;
+ cpi->refresh_golden_frame = frame_params->refresh_golden_frame;
+ cpi->refresh_bwd_ref_frame = frame_params->refresh_bwd_ref_frame;
+ cpi->refresh_alt2_ref_frame = frame_params->refresh_alt2_ref_frame;
+ cpi->refresh_alt_ref_frame = frame_params->refresh_alt_ref_frame;
-static void set_additional_frame_flags(AV1_COMMON *const cm,
- unsigned int *frame_flags) {
- if (frame_is_intra_only(cm)) *frame_flags |= FRAMEFLAGS_INTRAONLY;
- if (frame_is_sframe(cm)) *frame_flags |= FRAMEFLAGS_SWITCH;
- if (cm->error_resilient_mode) *frame_flags |= FRAMEFLAGS_ERROR_RESILIENT;
-}
+ if (current_frame->frame_type == KEY_FRAME && cm->show_frame)
+ current_frame->frame_number = 0;
-static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
- int skip_adapt, unsigned int *frame_flags) {
- if (cpi->oxcf.rc_mode == AOM_CBR) {
- av1_rc_get_one_pass_cbr_params(cpi);
+ if (cm->show_existing_frame) {
+ current_frame->order_hint = cm->cur_frame->order_hint;
} else {
- av1_rc_get_one_pass_vbr_params(cpi);
- }
- if (encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags) !=
- AOM_CODEC_OK) {
- return AOM_CODEC_ERROR;
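+ // order_hint is transmitted in (order_hint_bits_minus_1 + 1) bits, so it
+ // wraps modulo that range.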
+ current_frame->order_hint =
+ current_frame->frame_number + frame_params->order_offset;
+ current_frame->order_hint %=
+ (1 << (cm->seq_params.order_hint_info.order_hint_bits_minus_1 + 1));
}
- set_additional_frame_flags(&cpi->common, frame_flags);
-
- update_rc_counts(cpi);
- check_show_existing_frame(cpi);
- return AOM_CODEC_OK;
-}
-
-static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
- unsigned int *frame_flags) {
-#if CONFIG_MISMATCH_DEBUG
- mismatch_move_frame_idx_w();
-#endif
-#if TXCOEFF_COST_TIMER
- AV1_COMMON *cm = &cpi->common;
- cm->txcoeff_cost_timer = 0;
- cm->txcoeff_cost_count = 0;
-#endif
- if (encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags) !=
- AOM_CODEC_OK) {
+ if (cpi->oxcf.pass == 1) {
+ av1_first_pass(cpi, frame_input->ts_duration);
+ } else if (cpi->oxcf.pass == 0 || cpi->oxcf.pass == 2) {
+ if (encode_frame_to_data_rate(cpi, &frame_results->size, dest) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+ } else {
return AOM_CODEC_ERROR;
}
- set_additional_frame_flags(&cpi->common, frame_flags);
-#if TXCOEFF_COST_TIMER
- cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
- fprintf(stderr,
- "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
- "in us\n",
- cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
- cm->cum_txcoeff_cost_timer);
-#endif
-
- av1_twopass_postencode_update(cpi);
- update_rc_counts(cpi);
- check_show_existing_frame(cpi);
return AOM_CODEC_OK;
}
@@ -5564,7 +5267,6 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
int64_t end_time) {
AV1_COMMON *const cm = &cpi->common;
const SequenceHeader *const seq_params = &cm->seq_params;
- struct aom_usec_timer timer;
int res = 0;
const int subsampling_x = sd->subsampling_x;
const int subsampling_y = sd->subsampling_y;
@@ -5572,8 +5274,10 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+#if CONFIG_INTERNAL_STATS
+ struct aom_usec_timer timer;
aom_usec_timer_start(&timer);
-
+#endif
#if CONFIG_DENOISE
if (cpi->oxcf.noise_level > 0)
if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size,
@@ -5584,9 +5288,10 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
use_highbitdepth, frame_flags))
res = -1;
+#if CONFIG_INTERNAL_STATS
aom_usec_timer_mark(&timer);
cpi->time_receive_data += aom_usec_timer_elapsed(&timer);
-
+#endif
if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
(subsampling_x != 1 || subsampling_y != 1)) {
aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
@@ -5610,133 +5315,6 @@ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
return res;
}
-static void adjust_frame_rate(AV1_COMP *cpi,
- const struct lookahead_entry *source) {
- int64_t this_duration;
- int step = 0;
-
- if (source->ts_start == cpi->first_time_stamp_ever) {
- this_duration = source->ts_end - source->ts_start;
- step = 1;
- } else {
- int64_t last_duration =
- cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
-
- this_duration = source->ts_end - cpi->last_end_time_stamp_seen;
-
- // do a step update if the duration changes by 10%
- if (last_duration)
- step = (int)((this_duration - last_duration) * 10 / last_duration);
- }
-
- if (this_duration) {
- if (step) {
- av1_new_framerate(cpi, 10000000.0 / this_duration);
- } else {
- // Average this frame's rate into the last second's average
- // frame rate. If we haven't seen 1 second yet, then average
- // over the whole interval seen.
- const double interval = AOMMIN(
- (double)(source->ts_end - cpi->first_time_stamp_ever), 10000000.0);
- double avg_duration = 10000000.0 / cpi->framerate;
- avg_duration *= (interval - avg_duration + this_duration);
- avg_duration /= interval;
-
- av1_new_framerate(cpi, 10000000.0 / avg_duration);
- }
- }
- cpi->last_time_stamp_seen = source->ts_start;
- cpi->last_end_time_stamp_seen = source->ts_end;
-}
-
-// Returns 0 if this is not an alt ref else the offset of the source frame
-// used as the arf midpoint.
-static int get_arf_src_index(AV1_COMP *cpi) {
- RATE_CONTROL *const rc = &cpi->rc;
- int arf_src_index = 0;
- if (is_altref_enabled(cpi)) {
- if (cpi->oxcf.pass == 2) {
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
- arf_src_index = gf_group->arf_src_offset[gf_group->index];
- }
- } else if (rc->source_alt_ref_pending) {
- arf_src_index = rc->frames_till_gf_update_due;
- }
- }
- return arf_src_index;
-}
-
-static int get_brf_src_index(AV1_COMP *cpi) {
- int brf_src_index = 0;
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-
- // TODO(zoeliu): We need to add the check on the -bwd_ref command line setup
- // flag.
- if (gf_group->bidir_pred_enabled[gf_group->index]) {
- if (cpi->oxcf.pass == 2) {
- if (gf_group->update_type[gf_group->index] == BRF_UPDATE)
- brf_src_index = gf_group->brf_src_offset[gf_group->index];
- } else {
- // TODO(zoeliu): To re-visit the setup for this scenario
- brf_src_index = cpi->rc.bipred_group_interval - 1;
- }
- }
-
- return brf_src_index;
-}
-
-// Returns 0 if this is not an alt ref else the offset of the source frame
-// used as the arf midpoint.
-static int get_arf2_src_index(AV1_COMP *cpi) {
- int arf2_src_index = 0;
- if (is_altref_enabled(cpi) && cpi->num_extra_arfs) {
- if (cpi->oxcf.pass == 2) {
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
- arf2_src_index = gf_group->arf_src_offset[gf_group->index];
- }
- }
- }
- return arf2_src_index;
-}
-
-static void check_src_altref(AV1_COMP *cpi,
- const struct lookahead_entry *source) {
- RATE_CONTROL *const rc = &cpi->rc;
-
- // If pass == 2, the parameters set here will be reset in
- // av1_rc_get_second_pass_params()
-
- if (cpi->oxcf.pass == 2) {
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- rc->is_src_frame_alt_ref =
- (gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE) ||
- (gf_group->update_type[gf_group->index] == OVERLAY_UPDATE);
- rc->is_src_frame_ext_arf =
- gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE;
- } else {
- rc->is_src_frame_alt_ref =
- cpi->alt_ref_source && (source == cpi->alt_ref_source);
- }
-
- if (rc->is_src_frame_alt_ref) {
- // Current frame is an ARF overlay frame.
- cpi->alt_ref_source = NULL;
-
- if (rc->is_src_frame_ext_arf && !cpi->common.show_existing_frame) {
- // For INTNL_OVERLAY, when show_existing_frame == 0, they do need to
- // refresh the LAST_FRAME, i.e. LAST3 gets retired, LAST2 becomes LAST3,
- // LAST becomes LAST2, and INTNL_OVERLAY becomes LAST.
- cpi->refresh_last_frame = 1;
- } else {
- // Don't refresh the last buffer for an ARF overlay frame. It will
- // become the GF so preserve last as an alternative prediction option.
- cpi->refresh_last_frame = 0;
- }
- }
-}
-
#if CONFIG_INTERNAL_STATS
extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
const unsigned char *img2, int img2_pitch,
@@ -5768,7 +5346,7 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
}
if (cm->show_frame) {
const YV12_BUFFER_CONFIG *orig = cpi->source;
- const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf;
double y, u, v, frame_all;
cpi->count++;
@@ -5843,738 +5421,31 @@ static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {
}
}
#endif // CONFIG_INTERNAL_STATS
-
-static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
- const YV12_BUFFER_CONFIG *last_picture,
- hash_table *last_hash_table) {
- aom_clear_system_state();
- // check use hash ME
- int k;
- uint32_t hash_value_1;
- uint32_t hash_value_2;
-
- const int block_size = 8;
- const double threshold_current = 0.8;
- const double threshold_average = 0.95;
- const int max_history_size = 32;
- int T = 0; // total block
- int C = 0; // match with collocated block
- int S = 0; // smooth region but not match with collocated block
- int M = 0; // match with other block
-
- const int pic_width = cur_picture->y_width;
- const int pic_height = cur_picture->y_height;
- for (int i = 0; i + block_size <= pic_height; i += block_size) {
- for (int j = 0; j + block_size <= pic_width; j += block_size) {
- const int x_pos = j;
- const int y_pos = i;
- int match = 1;
- T++;
-
- // check whether collocated block match with current
- uint8_t *p_cur = cur_picture->y_buffer;
- uint8_t *p_ref = last_picture->y_buffer;
- int stride_cur = cur_picture->y_stride;
- int stride_ref = last_picture->y_stride;
- p_cur += (y_pos * stride_cur + x_pos);
- p_ref += (y_pos * stride_ref + x_pos);
-
- if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
- uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
- uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
- for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
- for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
- if (p16_cur[tmpX] != p16_ref[tmpX]) {
- match = 0;
- }
- }
- p16_cur += stride_cur;
- p16_ref += stride_ref;
- }
- } else {
- for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
- for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
- if (p_cur[tmpX] != p_ref[tmpX]) {
- match = 0;
- }
- }
- p_cur += stride_cur;
- p_ref += stride_ref;
- }
- }
-
- if (match) {
- C++;
- continue;
- }
-
- if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
- y_pos) ||
- av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
- S++;
- continue;
- }
-
- av1_get_block_hash_value(
- cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
- block_size, &hash_value_1, &hash_value_2,
- (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb);
- // Hashing does not work for highbitdepth currently.
- // TODO(Roger): Make it work for highbitdepth.
- if (av1_use_hash_me(&cpi->common)) {
- if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) {
- M++;
- }
- }
- }
- }
-
- assert(T > 0);
- double csm_rate = ((double)(C + S + M)) / ((double)(T));
- double m_rate = ((double)(M)) / ((double)(T));
-
- cpi->csm_rate_array[cpi->rate_index] = csm_rate;
- cpi->m_rate_array[cpi->rate_index] = m_rate;
-
- cpi->rate_index = (cpi->rate_index + 1) % max_history_size;
- cpi->rate_size++;
- cpi->rate_size = AOMMIN(cpi->rate_size, max_history_size);
-
- if (csm_rate < threshold_current) {
- return 0;
- }
-
- if (C == T) {
- return 1;
- }
-
- double csm_average = 0.0;
- double m_average = 0.0;
-
- for (k = 0; k < cpi->rate_size; k++) {
- csm_average += cpi->csm_rate_array[k];
- m_average += cpi->m_rate_array[k];
- }
- csm_average /= cpi->rate_size;
- m_average /= cpi->rate_size;
-
- if (csm_average < threshold_average) {
- return 0;
- }
-
- if (M > (T - C - S) / 3) {
- return 1;
- }
-
- if (csm_rate > 0.99 && m_rate > 0.01) {
- return 1;
- }
-
- if (csm_average + m_average > 1.01) {
- return 1;
- }
-
- return 0;
-}
-
-// Code for temporal dependency model
-typedef struct GF_PICTURE {
- YV12_BUFFER_CONFIG *frame;
- int ref_frame[7];
-} GF_PICTURE;
-
-static void init_gop_frames(AV1_COMP *cpi, GF_PICTURE *gf_picture,
- const GF_GROUP *gf_group, int *tpl_group_frames) {
- AV1_COMMON *cm = &cpi->common;
- const SequenceHeader *const seq_params = &cm->seq_params;
- int frame_idx = 0;
- int i;
- int gld_index = -1;
- int alt_index = -1;
- int lst_index = -1;
- int extend_frame_count = 0;
- int pframe_qindex = cpi->tpl_stats[2].base_qindex;
-
- RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
- int recon_frame_index[INTER_REFS_PER_FRAME + 1] = { -1, -1, -1, -1,
- -1, -1, -1, -1 };
-
- // TODO(jingning): To be used later for gf frame type parsing.
- (void)gf_group;
-
- for (i = 0; i < FRAME_BUFFERS && frame_idx < INTER_REFS_PER_FRAME + 1; ++i) {
- if (frame_bufs[i].ref_count == 0) {
- alloc_frame_mvs(cm, i);
- if (aom_realloc_frame_buffer(
- &frame_bufs[i].buf, cm->width, cm->height,
- seq_params->subsampling_x, seq_params->subsampling_y,
- seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
- cm->byte_alignment, NULL, NULL, NULL))
- aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
- "Failed to allocate frame buffer");
-
- recon_frame_index[frame_idx] = i;
- ++frame_idx;
- }
- }
-
- for (i = 0; i < INTER_REFS_PER_FRAME + 1; ++i) {
- assert(recon_frame_index[i] >= 0);
- cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
- }
-
- *tpl_group_frames = 0;
-
- // Initialize Golden reference frame.
- gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
- for (i = 0; i < 7; ++i) gf_picture[0].ref_frame[i] = -1;
- gld_index = 0;
- ++*tpl_group_frames;
-
- // Initialize ARF frame
- gf_picture[1].frame = cpi->source;
- gf_picture[1].ref_frame[0] = gld_index;
- gf_picture[1].ref_frame[1] = lst_index;
- gf_picture[1].ref_frame[2] = alt_index;
- // TODO(yuec) Need to figure out the full AV1 reference model
- for (i = 3; i < 7; ++i) gf_picture[1].ref_frame[i] = -1;
- alt_index = 1;
- ++*tpl_group_frames;
-
- // Initialize P frames
- for (frame_idx = 2; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
- struct lookahead_entry *buf =
- av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
-
- if (buf == NULL) break;
-
- gf_picture[frame_idx].frame = &buf->img;
- gf_picture[frame_idx].ref_frame[0] = gld_index;
- gf_picture[frame_idx].ref_frame[1] = lst_index;
- gf_picture[frame_idx].ref_frame[2] = alt_index;
- for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
-
- ++*tpl_group_frames;
- lst_index = frame_idx;
-
- if (frame_idx == cpi->rc.baseline_gf_interval + 1) break;
- }
-
- gld_index = frame_idx;
- lst_index = AOMMAX(0, frame_idx - 1);
- alt_index = -1;
- ++frame_idx;
-
- // Extend two frames outside the current gf group.
- for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
- struct lookahead_entry *buf =
- av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
-
- if (buf == NULL) break;
-
- cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
-
- gf_picture[frame_idx].frame = &buf->img;
- gf_picture[frame_idx].ref_frame[0] = gld_index;
- gf_picture[frame_idx].ref_frame[1] = lst_index;
- gf_picture[frame_idx].ref_frame[2] = alt_index;
- for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
- lst_index = frame_idx;
- ++*tpl_group_frames;
- ++extend_frame_count;
- }
-}
-
-static void init_tpl_stats(AV1_COMP *cpi) {
- int frame_idx;
- for (frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
- TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
- memset(tpl_frame->tpl_stats_ptr, 0,
- tpl_frame->height * tpl_frame->width *
- sizeof(*tpl_frame->tpl_stats_ptr));
- tpl_frame->is_valid = 0;
- }
-}
-
-static uint32_t motion_compensated_prediction(AV1_COMP *cpi, ThreadData *td,
- uint8_t *cur_frame_buf,
- uint8_t *ref_frame_buf,
- int stride, BLOCK_SIZE bsize,
- int mi_row, int mi_col) {
- AV1_COMMON *cm = &cpi->common;
- MACROBLOCK *const x = &td->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
- MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
- const SEARCH_METHODS search_method = NSTEP;
- int step_param;
- int sadpb = x->sadperbit16;
- uint32_t bestsme = UINT_MAX;
- int distortion;
- uint32_t sse;
- int cost_list[5];
- const MvLimits tmp_mv_limits = x->mv_limits;
-
- MV best_ref_mv1 = { 0, 0 };
- MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
- best_ref_mv1_full.col = best_ref_mv1.col >> 3;
- best_ref_mv1_full.row = best_ref_mv1.row >> 3;
-
- // Setup frame pointers
- x->plane[0].src.buf = cur_frame_buf;
- x->plane[0].src.stride = stride;
- xd->plane[0].pre[0].buf = ref_frame_buf;
- xd->plane[0].pre[0].stride = stride;
-
- step_param = mv_sf->reduce_first_step_size;
- step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
-
- av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
-
- av1_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
- search_method, 0, sadpb, cond_cost_list(cpi, cost_list),
- &best_ref_mv1, INT_MAX, 0, (MI_SIZE * mi_col),
- (MI_SIZE * mi_row), 0);
-
- /* restore UMV window */
- x->mv_limits = tmp_mv_limits;
-
- const int pw = block_size_wide[bsize];
- const int ph = block_size_high[bsize];
- bestsme = cpi->find_fractional_mv_step(
- x, cm, mi_row, mi_col, &best_ref_mv1, cpi->common.allow_high_precision_mv,
- x->errorperbit, &cpi->fn_ptr[bsize], 0, mv_sf->subpel_iters_per_step,
- cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL,
- 0, 0, pw, ph, 1, 1);
-
- return bestsme;
-}
-
-static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
- int ref_pos_col, int block, BLOCK_SIZE bsize) {
- int width = 0, height = 0;
- int bw = 4 << mi_size_wide_log2[bsize];
- int bh = 4 << mi_size_high_log2[bsize];
-
- switch (block) {
- case 0:
- width = grid_pos_col + bw - ref_pos_col;
- height = grid_pos_row + bh - ref_pos_row;
- break;
- case 1:
- width = ref_pos_col + bw - grid_pos_col;
- height = grid_pos_row + bh - ref_pos_row;
- break;
- case 2:
- width = grid_pos_col + bw - ref_pos_col;
- height = ref_pos_row + bh - grid_pos_row;
- break;
- case 3:
- width = ref_pos_col + bw - grid_pos_col;
- height = ref_pos_row + bh - grid_pos_row;
- break;
- default: assert(0);
- }
-
- return width * height;
-}
-
-static int round_floor(int ref_pos, int bsize_pix) {
- int round;
- if (ref_pos < 0)
- round = -(1 + (-ref_pos - 1) / bsize_pix);
- else
- round = ref_pos / bsize_pix;
-
- return round;
-}
-
-static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
- BLOCK_SIZE bsize, int stride,
- const TplDepStats *src_stats) {
- const int mi_height = mi_size_high[bsize];
- const int mi_width = mi_size_wide[bsize];
- int idx, idy;
-
- int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
- int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
-
- TplDepStats *tpl_ptr;
-
- intra_cost = AOMMAX(1, intra_cost);
- inter_cost = AOMMAX(1, inter_cost);
-
- for (idy = 0; idy < mi_height; ++idy) {
- tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col];
- for (idx = 0; idx < mi_width; ++idx) {
- tpl_ptr->intra_cost = intra_cost;
- tpl_ptr->inter_cost = inter_cost;
- tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
- tpl_ptr->ref_frame_index = src_stats->ref_frame_index;
- tpl_ptr->mv.as_int = src_stats->mv.as_int;
- ++tpl_ptr;
- }
- }
-}
-
-static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
- int mi_row, int mi_col, const BLOCK_SIZE bsize) {
- TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
- TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
- MV mv = tpl_stats->mv.as_mv;
- int mv_row = mv.row >> 3;
- int mv_col = mv.col >> 3;
-
- int ref_pos_row = mi_row * MI_SIZE + mv_row;
- int ref_pos_col = mi_col * MI_SIZE + mv_col;
-
- const int bw = 4 << mi_size_wide_log2[bsize];
- const int bh = 4 << mi_size_high_log2[bsize];
- const int mi_height = mi_size_high[bsize];
- const int mi_width = mi_size_wide[bsize];
- const int pix_num = bw * bh;
-
- // top-left grid block location in pixels
- int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
- int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
- int block;
-
- for (block = 0; block < 4; ++block) {
- int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
- int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
-
- if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
- grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
- int overlap_area = get_overlap_area(
- grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
- int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
- int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
-
- int64_t mc_flow = tpl_stats->mc_dep_cost -
- (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
- tpl_stats->intra_cost;
-
- int idx, idy;
-
- for (idy = 0; idy < mi_height; ++idy) {
- for (idx = 0; idx < mi_width; ++idx) {
- TplDepStats *des_stats =
- &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
- (ref_mi_col + idx)];
-
- des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
- des_stats->mc_ref_cost +=
- ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) /
- pix_num;
- assert(overlap_area >= 0);
- }
- }
- }
- }
-}
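// The propagation above computes, in integer arithmetic,
//   mc_flow = mc_dep_cost * (1 - inter_cost / intra_cost),
// the share of this block's dependency cost not already explained by its
// own inter coding. A worked example: with intra_cost = 100,
// inter_cost = 40 and mc_dep_cost = 200,
//   mc_flow = 200 - (200 * 40) / 100 = 120,
// and a destination block covering overlap_area of the pix_num reference
// pixels accumulates (120 * overlap_area) / pix_num.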
-
-static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
- int mi_row, int mi_col, const BLOCK_SIZE bsize) {
- int idx, idy;
- const int mi_height = mi_size_high[bsize];
- const int mi_width = mi_size_wide[bsize];
-
- for (idy = 0; idy < mi_height; ++idy) {
- for (idx = 0; idx < mi_width; ++idx) {
- TplDepStats *tpl_ptr =
- &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
- tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
- BLOCK_4X4);
- }
- }
-}
-
-static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
- tran_low_t *qcoeff, tran_low_t *dqcoeff,
- TX_SIZE tx_size, int64_t *recon_error,
- int64_t *sse) {
- const struct macroblock_plane *const p = &x->plane[plane];
- const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
- uint16_t eob;
- int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
- const int shift = tx_size == TX_32X32 ? 0 : 2;
-
- av1_quantize_fp_32x32(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX,
- p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, dqcoeff,
- p->dequant_QTX, &eob, scan_order->scan,
- scan_order->iscan);
-
- *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
- *recon_error = AOMMAX(*recon_error, 1);
-
- *sse = (*sse) >> shift;
- *sse = AOMMAX(*sse, 1);
-}
-
-static void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
- TX_SIZE tx_size) {
- switch (tx_size) {
- case TX_8X8: aom_hadamard_8x8(src_diff, bw, coeff); break;
- case TX_16X16: aom_hadamard_16x16(src_diff, bw, coeff); break;
- case TX_32X32: aom_hadamard_32x32(src_diff, bw, coeff); break;
- default: assert(0);
- }
-}
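// A note on intent (an interpretation): the Hadamard transform appears to
// serve as a cheap stand-in for the codec's real forward transform;
// aom_satd() on its output approximates the residual's coding cost well
// enough for the TPL model's relative intra/inter comparisons without
// paying for a full DCT.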
-
-static void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
- struct scale_factors *sf, GF_PICTURE *gf_picture,
- int frame_idx, int16_t *src_diff, tran_low_t *coeff,
- tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
- int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
- YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
- int64_t *recon_error, int64_t *sse,
- TplDepStats *tpl_stats) {
- AV1_COMMON *cm = &cpi->common;
- ThreadData *td = &cpi->td;
-
- const int bw = 4 << mi_size_wide_log2[bsize];
- const int bh = 4 << mi_size_high_log2[bsize];
- const int pix_num = bw * bh;
- int best_rf_idx = -1;
- int_mv best_mv;
- int64_t best_inter_cost = INT64_MAX;
- int64_t inter_cost;
- int rf_idx;
- const InterpFilters kernel =
- av1_make_interp_filters(EIGHTTAP_REGULAR, EIGHTTAP_REGULAR);
-
- int64_t best_intra_cost = INT64_MAX;
- int64_t intra_cost;
- PREDICTION_MODE mode;
- int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
- MB_MODE_INFO mi_above, mi_left;
-
- memset(tpl_stats, 0, sizeof(*tpl_stats));
-
- xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
- xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
- xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
- xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
- xd->above_mbmi = (mi_row > 0) ? &mi_above : NULL;
- xd->left_mbmi = (mi_col > 0) ? &mi_left : NULL;
-
- // Intra prediction search
- for (mode = DC_PRED; mode <= PAETH_PRED; ++mode) {
- uint8_t *src, *dst;
- int src_stride, dst_stride;
-
- src = xd->cur_buf->y_buffer + mb_y_offset;
- src_stride = xd->cur_buf->y_stride;
-
- dst = &predictor[0];
- dst_stride = bw;
-
- xd->mi[0]->sb_type = bsize;
- xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-
- av1_predict_intra_block(
- cm, xd, block_size_wide[bsize], block_size_high[bsize], tx_size, mode,
- 0, 0, FILTER_INTRA_MODES, src, src_stride, dst, dst_stride, 0, 0, 0);
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- aom_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
- dst_stride, xd->bd);
- } else {
- aom_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
- dst_stride);
- }
-
- wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-
- intra_cost = aom_satd(coeff, pix_num);
-
- if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
- }
-
- // Motion compensated prediction
- best_mv.as_int = 0;
-
- (void)mb_y_offset;
- // Motion estimation column boundary
- x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
- x->mv_limits.col_max =
- ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND);
-
- for (rf_idx = 0; rf_idx < 7; ++rf_idx) {
- if (ref_frame[rf_idx] == NULL) continue;
-
- motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
- ref_frame[rf_idx]->y_buffer + mb_y_offset,
- xd->cur_buf->y_stride, bsize, mi_row, mi_col);
-
-    // TODO(jingning): High bit-depth is not yet supported in the next three
-    // steps.
- ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
- WarpTypesAllowed warp_types;
- memset(&warp_types, 0, sizeof(WarpTypesAllowed));
-
- av1_build_inter_predictor(
- ref_frame[rf_idx]->y_buffer + mb_y_offset, ref_frame[rf_idx]->y_stride,
- &predictor[0], bw, &x->best_mv.as_mv, sf, bw, bh, &conv_params, kernel,
- &warp_types, mi_col * MI_SIZE, mi_row * MI_SIZE, 0, 0, MV_PRECISION_Q3,
- mi_col * MI_SIZE, mi_row * MI_SIZE, xd, 0);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- aom_highbd_subtract_block(
- bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
- xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
- } else {
- aom_subtract_block(bh, bw, src_diff, bw,
- xd->cur_buf->y_buffer + mb_y_offset,
- xd->cur_buf->y_stride, &predictor[0], bw);
- }
- wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-
- inter_cost = aom_satd(coeff, pix_num);
- if (inter_cost < best_inter_cost) {
- best_rf_idx = rf_idx;
- best_inter_cost = inter_cost;
- best_mv.as_int = x->best_mv.as_int;
- get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error,
- sse);
- }
- }
- best_intra_cost = AOMMAX(best_intra_cost, 1);
- best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
- tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
- tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
- tpl_stats->mc_dep_cost = tpl_stats->intra_cost + tpl_stats->mc_flow;
-
- tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
- tpl_stats->mv.as_int = best_mv.as_int;
-}
-
-static void mc_flow_dispenser(AV1_COMP *cpi, GF_PICTURE *gf_picture,
- int frame_idx) {
- TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
- YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
- YV12_BUFFER_CONFIG *ref_frame[7] = {
- NULL, NULL, NULL, NULL, NULL, NULL, NULL
- };
-
- AV1_COMMON *cm = &cpi->common;
- struct scale_factors sf;
- int rdmult, idx;
- ThreadData *td = &cpi->td;
- MACROBLOCK *x = &td->mb;
- MACROBLOCKD *xd = &x->e_mbd;
- int mi_row, mi_col;
-
- DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
- DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]);
- uint8_t *predictor;
- DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
- DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
- DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]);
- DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
-
- const BLOCK_SIZE bsize = BLOCK_32X32;
- const TX_SIZE tx_size = max_txsize_lookup[bsize];
- const int mi_height = mi_size_high[bsize];
- const int mi_width = mi_size_wide[bsize];
- int64_t recon_error, sse;
-
- // Setup scaling factor
- av1_setup_scale_factors_for_frame(
- &sf, this_frame->y_crop_width, this_frame->y_crop_height,
- this_frame->y_crop_width, this_frame->y_crop_height);
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- predictor = CONVERT_TO_BYTEPTR(predictor16);
- else
- predictor = predictor8;
-
- // Prepare reference frame pointers. If any reference frame slot is
-  // unavailable, the pointer will be set to NULL.
- for (idx = 0; idx < 7; ++idx) {
- int rf_idx = gf_picture[frame_idx].ref_frame[idx];
- if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
- }
-
- xd->mi = cm->mi_grid_visible;
- xd->mi[0] = cm->mi;
- xd->cur_buf = this_frame;
-
- // Get rd multiplier set up.
- rdmult = (int)av1_compute_rd_mult(cpi, tpl_frame->base_qindex);
- if (rdmult < 1) rdmult = 1;
- set_error_per_bit(&cpi->td.mb, rdmult);
- av1_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex);
-
- tpl_frame->is_valid = 1;
-
- cm->base_qindex = tpl_frame->base_qindex;
- av1_frame_init_quantizer(cpi);
-
- for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
- // Motion estimation row boundary
- x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
- x->mv_limits.row_max =
- (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * AOM_INTERP_EXTEND);
- for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
- TplDepStats tpl_stats;
- mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, src_diff, coeff,
- qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size,
- ref_frame, predictor, &recon_error, &sse, &tpl_stats);
-
- // Motion flow dependency dispenser.
- tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
- tpl_frame->stride, &tpl_stats);
-
- tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
- bsize);
- }
- }
-}
-
-static void setup_tpl_stats(AV1_COMP *cpi) {
- GF_PICTURE gf_picture[MAX_LAG_BUFFERS];
- const GF_GROUP *gf_group = &cpi->twopass.gf_group;
- int tpl_group_frames = 0;
- int frame_idx;
-
- init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames);
-
- init_tpl_stats(cpi);
-
- // Backward propagation from tpl_group_frames to 1.
- for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx)
- mc_flow_dispenser(cpi, gf_picture, frame_idx);
-}
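// A note on the loop order: because frames are visited from the end of the
// GF group toward its start, by the time mc_flow_dispenser() runs for
// frame_idx every later frame that may reference it has (presumably)
// already deposited its mc_flow into frame_idx's stats, so each block's
// mc_dep_cost is complete before being propagated further back.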
-
int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
size_t *size, uint8_t *dest, int64_t *time_stamp,
int64_t *time_end, int flush,
const aom_rational_t *timebase) {
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
AV1_COMMON *const cm = &cpi->common;
- CurrentFrame *const current_frame = &cm->current_frame;
- const int num_planes = av1_num_planes(cm);
- BufferPool *const pool = cm->buffer_pool;
- RATE_CONTROL *const rc = &cpi->rc;
- struct aom_usec_timer cmptimer;
- YV12_BUFFER_CONFIG *force_src_buffer = NULL;
- struct lookahead_entry *last_source = NULL;
- struct lookahead_entry *source = NULL;
- int arf_src_index;
- int brf_src_index;
- int i;
#if CONFIG_BITSTREAM_DEBUG
assert(cpi->oxcf.max_threads == 0 &&
"bitstream debug tool does not support multithreading");
bitstream_queue_record_write();
- bitstream_queue_set_frame_write(current_frame->frame_number * 2 +
+ bitstream_queue_set_frame_write(cm->current_frame.frame_number * 2 +
cm->show_frame);
#endif
+  // Indicates whether to use adaptive quantize_b rather than the
+  // traditional version.
+ cm->use_quant_b_adapt = cpi->oxcf.quant_b_adapt;
+
cm->showable_frame = 0;
+ *size = 0;
+#if CONFIG_INTERNAL_STATS
+ struct aom_usec_timer cmptimer;
aom_usec_timer_start(&cmptimer);
-
+#endif
set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0);
// Normal defaults
@@ -6584,387 +5455,42 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
if (oxcf->large_scale_tile)
cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
-  // Default reference buffer update config.
- av1_configure_buffer_updates_firstpass(cpi, LF_UPDATE);
-
// Initialize fields related to forward keyframes
cpi->no_show_kf = 0;
- cm->reset_decoder_state = 0;
-
-  // Don't allow a show_existing_frame to coincide with an error-resilient
-  // frame or an S-frame. An exception is made for a keyframe, since it
- // does not depend on any previous frames. We must make this exception here
- // because of the use of show_existing_frame with forward coded keyframes.
- struct lookahead_entry *lookahead_src = NULL;
- if (current_frame->frame_number > 0)
- lookahead_src = av1_lookahead_peek(cpi->lookahead, 0);
-
- int use_show_existing = 1;
- if (lookahead_src != NULL) {
- const int is_error_resilient =
- cpi->oxcf.error_resilient_mode ||
- (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
- const int is_s_frame = cpi->oxcf.s_frame_mode ||
- (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
- const int is_key_frame =
- (rc->frames_to_key == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY);
- use_show_existing = !(is_error_resilient || is_s_frame) || is_key_frame;
- }
-
- if (oxcf->pass == 2 && cm->show_existing_frame && use_show_existing) {
- // Manage the source buffer and flush out the source frame that has been
- // coded already; Also get prepared for PSNR calculation if needed.
- if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) {
- *size = 0;
- return -1;
- }
- av1_apply_encoding_flags(cpi, source->flags);
- cpi->source = &source->img;
-    // TODO(zoeliu): Determine whether the frame rate needs to be adjusted.
- *time_stamp = source->ts_start;
- *time_end = source->ts_end;
-
- // We need to adjust frame rate for an overlay frame
- if (cpi->rc.is_src_frame_alt_ref) adjust_frame_rate(cpi, source);
-
- // Find a free buffer for the new frame, releasing the reference
- // previously held.
- if (cm->new_fb_idx != INVALID_IDX) {
- --pool->frame_bufs[cm->new_fb_idx].ref_count;
- }
-
- cm->cur_frame = NULL;
- cm->new_fb_idx = get_free_fb(cm);
- if (cm->new_fb_idx == INVALID_IDX) return -1;
- cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
-
- // Clear down mmx registers
- aom_clear_system_state();
-
- // Start with a 0 size frame.
- *size = 0;
-
- // We need to update the gf_group for show_existing overlay frame
- if (cpi->rc.is_src_frame_alt_ref) av1_rc_get_second_pass_params(cpi);
-
- if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK)
- return AOM_CODEC_ERROR;
-
- if (cpi->b_calculate_psnr) generate_psnr_packet(cpi);
-
-#if CONFIG_INTERNAL_STATS
- compute_internal_stats(cpi, (int)(*size));
-#endif // CONFIG_INTERNAL_STATS
-
- // Clear down mmx registers
- aom_clear_system_state();
-
- cm->show_existing_frame = 0;
- return 0;
- }
-
- // Should we encode an arf frame.
- arf_src_index = get_arf_src_index(cpi);
- if (arf_src_index) {
- for (i = 0; i <= arf_src_index; ++i) {
- struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
- // Avoid creating an alt-ref if there's a forced keyframe pending.
- if (e == NULL) {
- break;
- } else if (e->flags == AOM_EFLAG_FORCE_KF) {
- arf_src_index = 0;
- flush = 1;
- break;
- }
- }
- }
-
- if (arf_src_index) {
- assert(arf_src_index <= rc->frames_to_key);
-
- if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
- cm->showable_frame = 1;
- cpi->alt_ref_source = source;
- // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
- if (arf_src_index == rc->frames_to_key) {
- // Skip temporal filtering and mark as intra_only if we have a fwd_kf
- const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
- int which_arf = gf_group->arf_update_idx[gf_group->index];
- cpi->is_arf_filter_off[which_arf] = 1;
- cpi->no_show_kf = 1;
- } else {
- if (oxcf->arnr_max_frames > 0) {
- // Produce the filtered ARF frame.
- av1_temporal_filter(cpi, arf_src_index);
- aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
- force_src_buffer = &cpi->alt_ref_buffer;
- }
- }
- cm->show_frame = 0;
- current_frame->intra_only = 0;
-
- if (oxcf->pass < 2) {
-        // In the second pass, the buffer update configuration will be set
-        // in av1_rc_get_second_pass_params().
- av1_configure_buffer_updates_firstpass(cpi, ARF_UPDATE);
- }
- }
- rc->source_alt_ref_pending = 0;
- }
-
- // Should we encode an arf2 frame.
- arf_src_index = get_arf2_src_index(cpi);
- if (arf_src_index) {
- for (i = 0; i <= arf_src_index; ++i) {
- struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
- // Avoid creating an alt-ref if there's a forced keyframe pending.
- if (e == NULL) {
- break;
- } else if (e->flags == AOM_EFLAG_FORCE_KF) {
- arf_src_index = 0;
- flush = 1;
- break;
- }
- }
- }
-
- if (arf_src_index) {
- assert(arf_src_index <= rc->frames_to_key);
-
- if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
- cm->showable_frame = 1;
- cpi->alt_ref_source = source;
-
- if (oxcf->arnr_max_frames > 0) {
- // Produce the filtered ARF frame.
- av1_temporal_filter(cpi, arf_src_index);
- aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
- force_src_buffer = &cpi->alt_ref_buffer;
- }
-
- cm->show_frame = 0;
- current_frame->intra_only = 0;
-
- if (oxcf->pass < 2) {
-        // In the second pass, the buffer update configuration will be set
-        // in av1_rc_get_second_pass_params().
- av1_configure_buffer_updates_firstpass(cpi, INTNL_ARF_UPDATE);
- }
- }
- rc->source_alt_ref_pending = 0;
- }
-
- rc->is_bwd_ref_frame = 0;
- brf_src_index = get_brf_src_index(cpi);
- if (brf_src_index) {
- assert(brf_src_index <= rc->frames_to_key);
- if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) {
- cm->showable_frame = 1;
- cm->show_frame = 0;
- current_frame->intra_only = 0;
-
- if (oxcf->pass < 2) {
-      // In the second pass, the buffer update configuration will be set
-      // in av1_rc_get_second_pass_params().
- av1_configure_buffer_updates_firstpass(cpi, BIPRED_UPDATE);
- }
- }
- }
- if (!source) {
- // Get last frame source.
- if (current_frame->frame_number > 0) {
- if ((last_source = av1_lookahead_peek(cpi->lookahead, -1)) == NULL)
- return -1;
- }
- if (current_frame->frame_number > 0) assert(last_source != NULL);
- // Read in the source frame.
- source = av1_lookahead_pop(cpi->lookahead, flush);
-
- if (source != NULL) {
- cm->show_frame = 1;
- current_frame->intra_only = 0;
-
- // Check to see if the frame should be encoded as an arf overlay.
- check_src_altref(cpi, source);
- }
- }
- if (source) {
- cpi->unscaled_source = cpi->source =
- force_src_buffer ? force_src_buffer : &source->img;
- cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
+ if (assign_cur_frame_new_fb(cm) == NULL) return AOM_CODEC_ERROR;
- *time_stamp = source->ts_start;
- *time_end = source->ts_end;
- av1_apply_encoding_flags(cpi, source->flags);
- *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
-
- } else {
- *size = 0;
- if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
- av1_end_first_pass(cpi); /* get last stats packet */
- cpi->twopass.first_pass_done = 1;
- }
+ const int result = av1_encode_strategy(cpi, size, dest, frame_flags,
+ time_stamp, time_end, timebase, flush);
+ if (result != AOM_CODEC_OK && result != -1) {
+ return AOM_CODEC_ERROR;
+ } else if (result == -1) {
+ // Returning -1 indicates no frame encoded; more input is required
return -1;
}
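// A minimal caller-side sketch (an assumption, not part of this change)
// of how the contract above can be consumed:
//   int res = av1_get_compressed_data(cpi, &flags, &size, buf, &pts,
//                                     &pts_end, flush, timebase);
//   if (res == -1)                { /* need more input; queue a frame */ }
//   else if (res != AOM_CODEC_OK) { /* hard error; abort */ }
//   else if (size > 0)            { /* emit 'size' bytes from buf */ }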
-
- if (source->ts_start < cpi->first_time_stamp_ever) {
- cpi->first_time_stamp_ever = source->ts_start;
- cpi->last_end_time_stamp_seen = source->ts_start;
- }
-
- // Clear down mmx registers
- aom_clear_system_state();
-
- // adjust frame rates based on timestamps given
- if (cm->show_frame) adjust_frame_rate(cpi, source);
-
- // Find a free buffer for the new frame, releasing the reference previously
- // held.
- if (cm->new_fb_idx != INVALID_IDX) {
- --pool->frame_bufs[cm->new_fb_idx].ref_count;
- }
- cm->new_fb_idx = get_free_fb(cm);
-
- if (cm->new_fb_idx == INVALID_IDX) return -1;
-
- cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
- // Retain the RF_LEVEL for the current newly coded frame.
- cm->cur_frame->frame_rf_level =
- cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
-
- cm->cur_frame->buf.buf_8bit_valid = 0;
-
- if (cpi->film_grain_table) {
- cm->seq_params.film_grain_params_present = aom_film_grain_table_lookup(
- cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
- &cm->film_grain_params);
- }
- cm->cur_frame->film_grain_params_present =
- cm->seq_params.film_grain_params_present;
-
-  // Only one operating point is supported for now.
- const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp);
- if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;
- cpi->common.frame_presentation_time = (uint32_t)pts64;
-
- // Start with a 0 size frame.
- *size = 0;
-
- cpi->frame_flags = *frame_flags;
-
- if (oxcf->pass == 2) {
- av1_rc_get_second_pass_params(cpi);
- } else if (oxcf->pass == 1) {
- setup_frame_size(cpi);
- }
-
- if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) {
- for (i = 0; i < INTER_REFS_PER_FRAME; ++i)
- cpi->scaled_ref_idx[i] = INVALID_IDX;
- }
-
- cm->using_qmatrix = cpi->oxcf.using_qm;
- cm->min_qmlevel = cpi->oxcf.qm_minlevel;
- cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
-
- if (cm->seq_params.frame_id_numbers_present_flag && *time_stamp == 0) {
- cpi->common.current_frame_id = -1;
- }
-
- cpi->cur_poc++;
- if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools &&
- !frame_is_intra_only(cm)) {
- if (cpi->common.seq_params.force_integer_mv == 2) {
- struct lookahead_entry *previous_entry =
- av1_lookahead_peek(cpi->lookahead, cpi->previous_index);
- if (!previous_entry)
- cpi->common.cur_frame_force_integer_mv = 0;
- else
- cpi->common.cur_frame_force_integer_mv = is_integer_mv(
- cpi, cpi->source, &previous_entry->img, cpi->previous_hash_table);
- } else {
- cpi->common.cur_frame_force_integer_mv =
- cpi->common.seq_params.force_integer_mv;
- }
- } else {
- cpi->common.cur_frame_force_integer_mv = 0;
- }
-
- if (cpi->twopass.gf_group.index == 1 && cpi->oxcf.enable_tpl_model) {
- set_frame_size(cpi, cm->width, cm->height);
- setup_tpl_stats(cpi);
- }
-
- if (oxcf->pass == 1) {
- cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf);
- av1_first_pass(cpi, source);
- } else if (oxcf->pass == 2) {
- if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK)
- return AOM_CODEC_ERROR;
- } else {
- // One pass encode
- if (Pass0Encode(cpi, size, dest, 0, frame_flags) != AOM_CODEC_OK)
- return AOM_CODEC_ERROR;
- }
- if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
- cpi->previous_hash_table = &cm->cur_frame->hash_table;
- {
- int l;
- for (l = -MAX_PRE_FRAMES; l < cpi->lookahead->max_sz; l++) {
- if ((cpi->lookahead->buf + l) == source) {
- cpi->previous_index = l;
- break;
- }
- }
-
- if (l == cpi->lookahead->max_sz) {
- aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
- "Failed to find last frame original buffer");
- }
- }
- }
-
- if (!cm->large_scale_tile) {
- cm->cur_frame->frame_context = *cm->fc;
- }
-
-#define EXT_TILE_DEBUG 0
-#if EXT_TILE_DEBUG
- if (cm->large_scale_tile && oxcf->pass == 2) {
- char fn[20] = "./fc";
- fn[4] = current_frame->frame_number / 100 + '0';
- fn[5] = (current_frame->frame_number % 100) / 10 + '0';
- fn[6] = (current_frame->frame_number % 10) + '0';
- fn[7] = '\0';
- av1_print_frame_contexts(cm->fc, fn);
- }
-#endif // EXT_TILE_DEBUG
-#undef EXT_TILE_DEBUG
-
- cm->showable_frame = !cm->show_frame && cm->showable_frame;
-
- // No frame encoded, or frame was dropped, release scaled references.
- if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
- release_scaled_references(cpi);
- }
-
- if (*size > 0) {
- cpi->droppable = is_frame_droppable(cpi);
- }
-
+#if CONFIG_INTERNAL_STATS
aom_usec_timer_mark(&cmptimer);
cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
-
- if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
- generate_psnr_packet(cpi);
+#endif
+ if (cpi->b_calculate_psnr) {
+ if (cm->show_existing_frame || (oxcf->pass != 1 && cm->show_frame)) {
+ generate_psnr_packet(cpi);
+ }
+ }
+ if (cpi->keep_level_stats && oxcf->pass != 1)
+ av1_update_level_info(cpi, *size, *time_stamp, *time_end);
#if CONFIG_INTERNAL_STATS
if (oxcf->pass != 1) {
compute_internal_stats(cpi, (int)(*size));
}
#endif // CONFIG_INTERNAL_STATS
+#if CONFIG_SPEED_STATS
+ if (cpi->oxcf.pass != 1 && !cm->show_existing_frame) {
+ cpi->tx_search_count += cpi->td.mb.tx_search_count;
+ cpi->td.mb.tx_search_count = 0;
+ }
+#endif // CONFIG_SPEED_STATS
aom_clear_system_state();
@@ -6977,8 +5503,8 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
return -1;
} else {
int ret;
- if (cm->frame_to_show) {
- *dest = *cm->frame_to_show;
+ if (cm->cur_frame != NULL) {
+ *dest = cm->cur_frame->buf;
dest->y_width = cm->width;
dest->y_height = cm->height;
dest->uv_width = cm->width >> cm->seq_params.subsampling_x;
@@ -6993,10 +5519,9 @@ int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
}
int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
- if (cpi->last_show_frame_buf_idx == INVALID_IDX) return -1;
+ if (cpi->last_show_frame_buf == NULL) return -1;
- *frame =
- cpi->common.buffer_pool->frame_bufs[cpi->last_show_frame_buf_idx].buf;
+ *frame = cpi->last_show_frame_buf->buf;
return 0;
}
@@ -7148,7 +5673,14 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
upd ^= AOM_ALT2_FLAG;
}
- av1_update_reference(cpi, upd);
+ cpi->ext_refresh_last_frame = (upd & AOM_LAST_FLAG) != 0;
+ cpi->ext_refresh_golden_frame = (upd & AOM_GOLD_FLAG) != 0;
+ cpi->ext_refresh_alt_ref_frame = (upd & AOM_ALT_FLAG) != 0;
+ cpi->ext_refresh_bwd_ref_frame = (upd & AOM_BWD_FLAG) != 0;
+ cpi->ext_refresh_alt2_ref_frame = (upd & AOM_ALT2_FLAG) != 0;
+ cpi->ext_refresh_frame_flags_pending = 1;
+ } else {
+ cpi->ext_refresh_frame_flags_pending = 0;
}
cpi->ext_use_ref_frame_mvs = cpi->oxcf.allow_ref_frame_mvs &
@@ -7164,15 +5696,6 @@ void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
}
}
-int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n) {
- return n * TICKS_PER_SEC * timebase->num / timebase->den;
-}
-
-int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) {
- const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
- return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
-}
-
aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
if (!cpi) return NULL;
@@ -7189,7 +5712,7 @@ aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
if (payload_offset + sequence_header_size > sizeof(header_buf)) return NULL;
memmove(&header_buf[payload_offset], &header_buf[0], sequence_header_size);
- if (write_obu_header(OBU_SEQUENCE_HEADER, 0, &header_buf[0]) !=
+ if (av1_write_obu_header(cpi, OBU_SEQUENCE_HEADER, 0, &header_buf[0]) !=
obu_header_size) {
return NULL;
}
diff --git a/libaom/av1/encoder/encoder.h b/libaom/av1/encoder/encoder.h
index 1ff2ef7..bf02394 100644
--- a/libaom/av1/encoder/encoder.h
+++ b/libaom/av1/encoder/encoder.h
@@ -12,6 +12,7 @@
#ifndef AOM_AV1_ENCODER_ENCODER_H_
#define AOM_AV1_ENCODER_ENCODER_H_
+#include <stdbool.h>
#include <stdio.h>
#include "config/aom_config.h"
@@ -24,11 +25,14 @@
#include "av1/common/onyxc_int.h"
#include "av1/common/resize.h"
#include "av1/common/timing.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
#include "av1/encoder/aq_cyclicrefresh.h"
#include "av1/encoder/av1_quantize.h"
#include "av1/encoder/context_tree.h"
#include "av1/encoder/encodemb.h"
#include "av1/encoder/firstpass.h"
+#include "av1/encoder/level.h"
#include "av1/encoder/lookahead.h"
#include "av1/encoder/mbgraph.h"
#include "av1/encoder/mcomp.h"
@@ -36,6 +40,7 @@
#include "av1/encoder/rd.h"
#include "av1/encoder/speed_features.h"
#include "av1/encoder/tokenize.h"
+#include "av1/encoder/block.h"
#if CONFIG_INTERNAL_STATS
#include "aom_dsp/ssim.h"
@@ -59,36 +64,33 @@ typedef struct {
FRAME_CONTEXT fc;
} CODING_CONTEXT;
-typedef enum {
- // regular inter frame
- REGULAR_FRAME = 0,
- // alternate reference frame
- ARF_FRAME = 1,
- // overlay frame
- OVERLAY_FRAME = 2,
- // golden frame
- GLD_FRAME = 3,
- // backward reference frame
- BRF_FRAME = 4,
- // extra alternate reference frame
- EXT_ARF_FRAME = 5,
+enum {
+ REGULAR_FRAME, // regular inter frame
+ ARF_FRAME, // alternate reference frame
+ OVERLAY_FRAME, // overlay frame
+ GLD_FRAME, // golden frame
+ BRF_FRAME, // backward reference frame
+ INTERNAL_ARF_FRAME, // internal alternate reference frame
FRAME_CONTEXT_INDEXES
-} FRAME_CONTEXT_INDEX;
+} UENUM1BYTE(FRAME_CONTEXT_INDEX);
-typedef enum {
+enum {
NORMAL = 0,
FOURFIVE = 1,
THREEFIVE = 2,
ONETWO = 3
-} AOM_SCALING;
+} UENUM1BYTE(AOM_SCALING);
-typedef enum {
+enum {
// Good Quality Fast Encoding. The encoder balances quality with the amount of
// time it takes to encode the output. Speed setting controls how fast.
- GOOD
-} MODE;
+ GOOD,
+ // Realtime Fast Encoding. Will force some restrictions on bitrate
+ // constraints.
+ REALTIME
+} UENUM1BYTE(MODE);
-typedef enum {
+enum {
FRAMEFLAGS_KEY = 1 << 0,
FRAMEFLAGS_GOLDEN = 1 << 1,
FRAMEFLAGS_BWDREF = 1 << 2,
@@ -97,46 +99,62 @@ typedef enum {
FRAMEFLAGS_INTRAONLY = 1 << 4,
FRAMEFLAGS_SWITCH = 1 << 5,
FRAMEFLAGS_ERROR_RESILIENT = 1 << 6,
-} FRAMETYPE_FLAGS;
+} UENUM1BYTE(FRAMETYPE_FLAGS);
-typedef enum {
+enum {
NO_AQ = 0,
VARIANCE_AQ = 1,
COMPLEXITY_AQ = 2,
CYCLIC_REFRESH_AQ = 3,
AQ_MODE_COUNT // This should always be the last member of the enum
-} AQ_MODE;
-typedef enum {
+} UENUM1BYTE(AQ_MODE);
+enum {
NO_DELTA_Q = 0,
DELTA_Q_ONLY = 1,
DELTA_Q_LF = 2,
DELTAQ_MODE_COUNT // This should always be the last member of the enum
-} DELTAQ_MODE;
+} UENUM1BYTE(DELTAQ_MODE);
-typedef enum {
+enum {
RESIZE_NONE = 0, // No frame resizing allowed.
RESIZE_FIXED = 1, // All frames are coded at the specified scale.
RESIZE_RANDOM = 2, // All frames are coded at a random scale.
RESIZE_MODES
-} RESIZE_MODE;
+} UENUM1BYTE(RESIZE_MODE);
+
+enum {
+ SUPERRES_NONE, // No frame superres allowed.
+ SUPERRES_FIXED, // All frames are coded at the specified scale,
+ // and super-resolved.
+ SUPERRES_RANDOM, // All frames are coded at a random scale,
+ // and super-resolved.
+ SUPERRES_QTHRESH, // Superres scale for a frame is determined based on
+ // q_index.
+ SUPERRES_AUTO, // Automatically select superres for appropriate frames.
+ SUPERRES_MODES
+} UENUM1BYTE(SUPERRES_MODE);
typedef enum {
- SUPERRES_NONE = 0, // No frame superres allowed
- SUPERRES_FIXED = 1, // All frames are coded at the specified scale,
- // and super-resolved.
- SUPERRES_RANDOM = 2, // All frames are coded at a random scale,
- // and super-resolved.
- SUPERRES_QTHRESH = 3, // Superres scale for a frame is determined based on
- // q_index
- SUPERRES_MODES
-} SUPERRES_MODE;
+ kInvalid = 0,
+ kLowSadLowSumdiff = 1,
+ kLowSadHighSumdiff = 2,
+ kHighSadLowSumdiff = 3,
+ kHighSadHighSumdiff = 4,
+ kLowVarHighSumdiff = 5,
+ kVeryHighSad = 6,
+} CONTENT_STATE_SB;
+
+enum {
+ SS_CFG_SRC = 0,
+ SS_CFG_LOOKAHEAD = 1,
+ SS_CFG_TOTAL = 2
+} UENUM1BYTE(SS_CFG_OFFSET);
typedef struct TplDepStats {
int64_t intra_cost;
int64_t inter_cost;
int64_t mc_flow;
int64_t mc_dep_cost;
- int64_t mc_ref_cost;
int ref_frame_index;
int_mv mv;
@@ -153,6 +171,12 @@ typedef struct TplDepFrame {
int base_qindex;
} TplDepFrame;
+typedef enum {
+ COST_UPD_SB,
+ COST_UPD_SBROW,
+ COST_UPD_TILE,
+} COST_UPDATE_TYPE;
+
#define TPL_DEP_COST_SCALE_LOG2 4
typedef struct AV1EncoderConfig {
@@ -215,6 +239,7 @@ typedef struct AV1EncoderConfig {
DELTAQ_MODE deltaq_mode;
int enable_cdef;
int enable_restoration;
+ int enable_obmc;
int disable_trellis_quant;
int using_qm;
int qm_y;
@@ -274,6 +299,7 @@ typedef struct AV1EncoderConfig {
int min_gf_interval;
int max_gf_interval;
+ int gf_max_pyr_height;
int row_mt;
int tile_columns;
@@ -288,11 +314,6 @@ typedef struct AV1EncoderConfig {
int max_threads;
aom_fixed_buf_t two_pass_stats_in;
- struct aom_codec_pkt_list *output_pkt_list;
-
-#if CONFIG_FP_MB_STATS
- aom_fixed_buf_t firstpass_mb_stats_in;
-#endif
aom_tune_metric tuning;
aom_tune_content content;
@@ -304,15 +325,12 @@ typedef struct AV1EncoderConfig {
int color_range;
int render_width;
int render_height;
- aom_timing_info_type_t timing_info_type;
int timing_info_present;
aom_timing_info_t timing_info;
int decoder_model_info_present_flag;
int display_model_info_present_flag;
int buffer_removal_time_present;
aom_dec_model_info_t buffer_model;
- aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
- aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
int film_grain_test_vector;
const char *film_grain_table_filename;
@@ -320,18 +338,44 @@ typedef struct AV1EncoderConfig {
aom_superblock_size_t superblock_size;
unsigned int large_scale_tile;
unsigned int single_tile_decoding;
- int monochrome;
+ uint8_t monochrome;
unsigned int full_still_picture_hdr;
int enable_dual_filter;
unsigned int motion_vector_unit_test;
const cfg_options_t *cfg;
+ int enable_rect_partitions;
+ int enable_ab_partitions;
+ int enable_1to4_partitions;
+ int min_partition_size;
+ int max_partition_size;
+ int enable_intra_edge_filter;
+ int enable_tx64;
+ int tx_size_search_method;
+ int enable_flip_idtx;
int enable_order_hint;
- int enable_jnt_comp;
+ int enable_dist_wtd_comp;
int enable_ref_frame_mvs;
+ unsigned int max_reference_frames;
+ int enable_reduced_reference_set;
unsigned int allow_ref_frame_mvs;
+ int enable_masked_comp;
+ int enable_onesided_comp;
+ int enable_interintra_comp;
+ int enable_smooth_interintra;
+ int enable_diff_wtd_comp;
+ int enable_interinter_wedge;
+ int enable_interintra_wedge;
+ int enable_global_motion;
int enable_warped_motion;
int allow_warped_motion;
+ int enable_filter_intra;
+ int enable_smooth_intra;
+ int enable_paeth_intra;
+ int enable_cfl_intra;
int enable_superres;
+ int enable_palette;
+ int enable_intrabc;
+ int enable_angle_delta;
unsigned int save_as_annexb;
#if CONFIG_DENOISE
@@ -341,6 +385,18 @@ typedef struct AV1EncoderConfig {
unsigned int chroma_subsampling_x;
unsigned int chroma_subsampling_y;
+ int reduced_tx_type_set;
+ int use_intra_dct_only;
+ int use_inter_dct_only;
+ int use_intra_default_tx_only;
+ int quant_b_adapt;
+ COST_UPDATE_TYPE coeff_cost_upd_freq;
+ COST_UPDATE_TYPE mode_cost_upd_freq;
+ int border_in_pixels;
+ AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+ // Bit mask to specify which tier each of the 32 possible operating points
+ // conforms to.
+ unsigned int tier_mask;
} AV1EncoderConfig;
static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
@@ -397,7 +453,7 @@ typedef struct FRAME_COUNTS {
unsigned int interintra[BLOCK_SIZE_GROUPS][2];
unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
unsigned int wedge_interintra[BLOCK_SIZES_ALL][2];
- unsigned int compound_type[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1];
+ unsigned int compound_type[BLOCK_SIZES_ALL][MASKED_COMPOUND_TYPES];
unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES];
unsigned int obmc[BLOCK_SIZES_ALL][2];
unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
@@ -433,7 +489,6 @@ typedef struct FRAME_COUNTS {
[SWITCHABLE_FILTERS];
} FRAME_COUNTS;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
typedef struct {
@@ -467,8 +522,12 @@ typedef struct inter_modes_info {
int64_t sse_arr[MAX_INTER_MODES];
int64_t est_rd_arr[MAX_INTER_MODES];
RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+ bool true_rd_arr[MAX_INTER_MODES];
+ uint8_t blk_skip_arr[MAX_INTER_MODES][MAX_MIB_SIZE * MAX_MIB_SIZE];
+ RD_STATS rd_cost_arr[MAX_INTER_MODES];
+ RD_STATS rd_cost_y_arr[MAX_INTER_MODES];
+ RD_STATS rd_cost_uv_arr[MAX_INTER_MODES];
} InterModesInfo;
-#endif
// Encoder row synchronization
typedef struct AV1RowMTSyncData {
@@ -491,16 +550,13 @@ typedef struct AV1RowMTInfo {
typedef struct TileDataEnc {
TileInfo tile_info;
int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
- int mode_map[BLOCK_SIZES_ALL][MAX_MODES];
int m_search_count;
int ex_search_count;
CFL_CTX cfl;
DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
- DECLARE_ALIGNED(16, FRAME_CONTEXT, backup_tctx);
+ FRAME_CONTEXT *row_ctx;
uint8_t allow_update_cdf;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];
-#endif
AV1RowMTSync row_mt_sync;
AV1RowMTInfo row_mt_info;
} TileDataEnc;
@@ -535,9 +591,7 @@ typedef struct ThreadData {
tran_low_t *tree_coeff_buf[MAX_MB_PLANE];
tran_low_t *tree_qcoeff_buf[MAX_MB_PLANE];
tran_low_t *tree_dqcoeff_buf[MAX_MB_PLANE];
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
InterModesInfo *inter_modes_info;
-#endif
uint32_t *hash_value_buffer[2][2];
int32_t *wsrc_buf;
int32_t *mask_buf;
@@ -560,13 +614,13 @@ typedef struct ActiveMap {
#if CONFIG_INTERNAL_STATS
// types of stats
-typedef enum {
+enum {
STAT_Y,
STAT_U,
STAT_V,
STAT_ALL,
NUM_STAT_TYPES // This should always be the last member of the enum
-} StatType;
+} UENUM1BYTE(StatType);
typedef struct IMAGE_STAT {
double stat[NUM_STAT_TYPES];
@@ -579,10 +633,83 @@ typedef struct {
YV12_BUFFER_CONFIG buf;
} EncRefCntBuffer;
-typedef struct TileBufferEnc {
- uint8_t *data;
- size_t size;
-} TileBufferEnc;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+typedef struct PartitionStats {
+ int partition_decisions[6][EXT_PARTITION_TYPES];
+ int partition_attempts[6][EXT_PARTITION_TYPES];
+ int64_t partition_times[6][EXT_PARTITION_TYPES];
+
+ int partition_redo;
+} PartitionStats;
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+#include "aom_ports/aom_timer.h"
+// Adjust the following to add new components.
+enum {
+ encode_frame_to_data_rate_time,
+ encode_with_recode_loop_time,
+ loop_filter_time,
+ cdef_time,
+ loop_restoration_time,
+ av1_pack_bitstream_final_time,
+ av1_encode_frame_time,
+ av1_compute_global_motion_time,
+ av1_setup_motion_field_time,
+ encode_sb_time,
+ first_partition_search_pass_time,
+ rd_pick_partition_time,
+ rd_pick_sb_modes_time,
+ av1_rd_pick_intra_mode_sb_time,
+ av1_rd_pick_inter_mode_sb_time,
+ handle_intra_mode_time,
+ handle_inter_mode_time,
+ do_tx_search_time,
+ handle_newmv_time,
+ compound_type_rd_time,
+ interpolation_filter_search_time,
+ motion_mode_rd_time,
+ kTimingComponents,
+} UENUM1BYTE(TIMING_COMPONENT);
+
+static INLINE char const *get_component_name(int index) {
+ switch (index) {
+ case encode_frame_to_data_rate_time:
+ return "encode_frame_to_data_rate_time";
+ case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
+ case loop_filter_time: return "loop_filter_time";
+ case cdef_time: return "cdef_time";
+ case loop_restoration_time: return "loop_restoration_time";
+ case av1_pack_bitstream_final_time: return "av1_pack_bitstream_final_time";
+ case av1_encode_frame_time: return "av1_encode_frame_time";
+ case av1_compute_global_motion_time:
+ return "av1_compute_global_motion_time";
+ case av1_setup_motion_field_time: return "av1_setup_motion_field_time";
+ case encode_sb_time: return "encode_sb_time";
+ case first_partition_search_pass_time:
+ return "first_partition_search_pass_time";
+ case rd_pick_partition_time: return "rd_pick_partition_time";
+ case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
+ case av1_rd_pick_intra_mode_sb_time:
+ return "av1_rd_pick_intra_mode_sb_time";
+ case av1_rd_pick_inter_mode_sb_time:
+ return "av1_rd_pick_inter_mode_sb_time";
+ case handle_intra_mode_time: return "handle_intra_mode_time";
+ case handle_inter_mode_time: return "handle_inter_mode_time";
+ case do_tx_search_time: return "do_tx_search_time";
+ case handle_newmv_time: return "handle_newmv_time";
+ case compound_type_rd_time: return "compound_type_rd_time";
+ case interpolation_filter_search_time:
+ return "interpolation_filter_search_time";
+ case motion_mode_rd_time: return "motion_mode_rd_time";
+ default: assert(0);
+ }
+ return "error";
+}
+#endif
+
+// The maximum number of internal ARFs, excluding ALTREF_FRAME.
+#define MAX_INTERNAL_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
typedef struct AV1_COMP {
QUANTS quants;
@@ -597,7 +724,6 @@ typedef struct AV1_COMP {
struct lookahead_entry *alt_ref_source;
int no_show_kf;
- int optimize_speed_feature;
int optimize_seg_arr[MAX_SEGMENTS];
YV12_BUFFER_CONFIG *source;
@@ -612,37 +738,20 @@ typedef struct AV1_COMP {
// For a still frame, this flag is set to 1 to skip partition search.
int partition_search_skippable_frame;
+  // The following item corresponds to the two_pass_partition_search speed
+  // feature.
+ int two_pass_partition_search;
+
double csm_rate_array[32];
double m_rate_array[32];
int rate_size;
int rate_index;
hash_table *previous_hash_table;
int previous_index;
- int cur_poc; // DebugInfo
unsigned int row_mt;
- int scaled_ref_idx[INTER_REFS_PER_FRAME];
-
- // For encoder, we have a two-level mapping from reference frame type to the
- // corresponding buffer in the buffer pool:
- // * 'remapped_ref_idx[i - 1]' maps reference type ‘i’ (range: LAST_FRAME ...
- // EXTREF_FRAME) to a remapped index ‘j’ (in range: 0 ... REF_FRAMES - 1)
- // * Later, 'cm->ref_frame_map[j]' maps the remapped index ‘j’ to actual index
- // of the buffer in the buffer pool ‘cm->buffer_pool.frame_bufs’.
- //
- // LAST_FRAME, ..., EXTREF_FRAME
- // | |
- // v v
- // remapped_ref_idx[LAST_FRAME - 1], ..., remapped_ref_idx[EXTREF_FRAME - 1]
- // | |
- // v v
- // ref_frame_map[], ..., ref_frame_map[]
- //
- // Note: INTRA_FRAME always refers to the current frame, so there's no need to
- // have a remapped index for the same.
- int remapped_ref_idx[REF_FRAMES];
+ RefCntBuffer *scaled_ref_buf[INTER_REFS_PER_FRAME];
- int last_show_frame_buf_idx; // last show frame buffer index
+ RefCntBuffer *last_show_frame_buf; // last show frame buffer
// refresh_*_frame are boolean flags. If 'refresh_xyz_frame' is true, then
// after the current frame is encoded, the XYZ reference frame gets refreshed
@@ -661,14 +770,11 @@ typedef struct AV1_COMP {
int refresh_alt2_ref_frame;
int refresh_alt_ref_frame;
-#if USE_SYMM_MULTI_LAYER
- // When true, a new rule for backward (future) reference frames is in effect:
- // - BWDREF_FRAME is always the closest future frame available
- // - ALTREF2_FRAME is always the 2nd closest future frame available
- // - 'refresh_bwd_ref_frame' flag is used for updating both the BWDREF_FRAME
- // and ALTREF2_FRAME. ('refresh_alt2_ref_frame' flag is irrelevant).
- int new_bwdref_update_rule;
-#endif
+ // For each type of reference frame, this contains the index of a reference
+ // frame buffer for a reference frame of the same type. We use this to
+ // choose our primary reference frame (which is the most recent reference
+ // frame of the same type as the current frame).
+ int fb_of_context_type[REF_FRAMES];
int ext_refresh_frame_flags_pending;
int ext_refresh_last_frame;
@@ -707,12 +813,6 @@ typedef struct AV1_COMP {
RATE_CONTROL rc;
double framerate;
- // Relevant for an inter frame.
- // - Index '0' corresponds to the values for the currently coded frame.
- // - Indices LAST_FRAME ... EXTREF_FRAMES are used to store values for all the
- // possible inter reference frames.
- int interp_filter_selected[REF_FRAMES + 1][SWITCHABLE];
-
struct aom_codec_pkt_list *output_pkt_list;
MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
@@ -721,12 +821,14 @@ typedef struct AV1_COMP {
int ref_frame_flags;
int ext_ref_frame_flags;
+ // speed is passed as a per-frame parameter into the encoder
+ int speed;
+ // sf contains fine-grained config set internally based on speed
SPEED_FEATURES sf;
unsigned int max_mv_magnitude;
int mv_step_param;
- int allow_comp_inter_inter;
int all_one_sided_refs;
uint8_t *segmentation_map;
@@ -737,13 +839,10 @@ typedef struct AV1_COMP {
fractional_mv_step_fp *find_fractional_mv_step;
av1_diamond_search_fn_t diamond_search_sad;
aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+
+#if CONFIG_INTERNAL_STATS
uint64_t time_receive_data;
uint64_t time_compress_data;
- uint64_t time_pick_lpf;
- uint64_t time_encode_sb_row;
-
-#if CONFIG_FP_MB_STATS
- int use_fp_mb_stats;
#endif
TWO_PASS twopass;
@@ -779,6 +878,9 @@ typedef struct AV1_COMP {
Metrics metrics;
#endif
int b_calculate_psnr;
+#if CONFIG_SPEED_STATS
+ unsigned int tx_search_count;
+#endif // CONFIG_SPEED_STATS
int droppable;
@@ -796,23 +898,21 @@ typedef struct AV1_COMP {
int resize_pending_width;
int resize_pending_height;
- int frame_flags;
-
- search_site_config ss_cfg;
+  // ss_cfg[SS_CFG_LOOKAHEAD] : used in the following cases
+ // -> temporal filtering
+ // -> intrabc
+ // ss_cfg[SS_CFG_SRC] : used everywhere except above mentioned cases
+ search_site_config ss_cfg[SS_CFG_TOTAL];
TileDataEnc *tile_data;
int allocated_tiles; // Keep track of memory allocated for tiles.
TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
- unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS];
TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
- TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
-
int resize_state;
int resize_avg_qp;
int resize_buffer_underflow;
- int resize_count;
// Sequence parameters have been transmitted already and locked
// or not. Once locked av1_change_config cannot change the seq
@@ -822,19 +922,24 @@ typedef struct AV1_COMP {
// VARIANCE_AQ segment map refresh
int vaq_refresh;
+ // VAR_BASED_PARTITION thresholds
+ // 0 - threshold_128x128; 1 - threshold_64x64;
+ // 2 - threshold_32x32; 3 - threshold_16x16;
+ // 4 - vbp_threshold_8x8;
+ int64_t vbp_thresholds[5];
+ int64_t vbp_threshold_minmax;
+ int64_t vbp_threshold_sad;
+ int64_t vbp_threshold_copy;
+ BLOCK_SIZE vbp_bsize_min;
+
// Multi-threading
int num_workers;
AVxWorker *workers;
struct EncWorkerData *tile_thr_data;
- int refresh_frame_mask;
int existing_fb_idx_to_show;
- int is_arf_filter_off[MAX_EXT_ARFS + 1];
- int num_extra_arfs;
- int arf_pos_in_gf[MAX_EXT_ARFS + 1];
- int arf_pos_for_ovrly[MAX_EXT_ARFS + 1];
+ int is_arf_filter_off[MAX_INTERNAL_ARFS + 1];
int global_motion_search_done;
- tran_low_t *tcoeff_buf[MAX_MB_PLANE];
- int extra_arf_allowed;
+ int internal_altref_allowed;
// A flag to indicate if intrabc is ever used in current frame.
int intrabc_used;
int dv_cost[2][MV_VALS];
@@ -842,10 +947,16 @@ typedef struct AV1_COMP {
int dv_joint_cost[MV_JOINTS];
int has_lossless_segment;
- // For frame refs short signaling:
- // A mapping of each reference frame from its encoder side value to the
- // decoder side value obtained following the short signaling procedure.
- int ref_conv[REF_FRAMES];
+ // Factors to control gating of compound type selection based on best
+ // approximate rd so far
+ int max_comp_type_rd_threshold_mul;
+ int max_comp_type_rd_threshold_div;
+
+ unsigned int tx_domain_dist_threshold;
+
+ // Factor to control R-D optimization of coeffs based on block
+ // mse.
+ unsigned int coeff_opt_dist_threshold;
AV1LfSync lf_row_sync;
AV1LrSync lr_row_sync;
@@ -865,8 +976,72 @@ typedef struct AV1_COMP {
#if CONFIG_MULTITHREAD
pthread_mutex_t *row_mt_mutex_;
#endif
+  // Set if screen content mode is set or relevant tools are enabled.
+ int is_screen_content_type;
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+ PartitionStats partition_stats;
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  // component_time[] is initialized to zero when the encoder starts.
+ uint64_t component_time[kTimingComponents];
+ struct aom_usec_timer component_timer[kTimingComponents];
+  // frame_component_time[] is initialized to zero at the beginning of each frame.
+ uint64_t frame_component_time[kTimingComponents];
+#endif
+
+ // The following data are for AV1 bitstream levels.
+ AV1_LEVEL target_seq_level_idx[MAX_NUM_OPERATING_POINTS];
+ int keep_level_stats;
+ AV1LevelInfo level_info[MAX_NUM_OPERATING_POINTS];
+ // Count the number of OBU_FRAME and OBU_FRAME_HEADER for level calculation.
+ int frame_header_count;
+ FrameWindowBuffer frame_window_buffer;
} AV1_COMP;
+typedef struct {
+ YV12_BUFFER_CONFIG *source;
+ YV12_BUFFER_CONFIG *last_source;
+ int64_t ts_duration;
+} EncodeFrameInput;
+
+// EncodeFrameParams contains per-frame encoding parameters decided upon by
+// av1_encode_strategy() and passed down to av1_encode()
+struct EncodeFrameParams {
+ int error_resilient_mode;
+ FRAME_TYPE frame_type;
+ int primary_ref_frame;
+ int order_offset;
+ int show_frame;
+ int refresh_frame_flags;
+
+ int show_existing_frame;
+ int existing_fb_idx_to_show;
+
+ // Bitmask of which reference buffers may be referenced by this frame
+ int ref_frame_flags;
+
+ // Reference buffer assignment for this frame.
+ int remapped_ref_idx[REF_FRAMES];
+
+ // Flags which determine which reference buffers are refreshed by this frame
+ int refresh_last_frame;
+ int refresh_golden_frame;
+ int refresh_bwd_ref_frame;
+ int refresh_alt2_ref_frame;
+ int refresh_alt_ref_frame;
+
+ // Speed level to use for this frame: Bigger number means faster.
+ int speed;
+};
+typedef struct EncodeFrameParams EncodeFrameParams;
+
+// EncodeFrameResults contains information about the result of encoding a
+// single frame
+typedef struct {
+ size_t size; // Size of resulting bitstream
+} EncodeFrameResults;
+
// Must not be called more than once.
void av1_initialize_enc(void);
@@ -887,6 +1062,11 @@ int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
int64_t *time_end, int flush,
const aom_rational_t *timebase);
+int av1_encode(AV1_COMP *const cpi, uint8_t *const dest,
+ const EncodeFrameInput *const frame_input,
+ const EncodeFrameParams *const frame_params,
+ EncodeFrameResults *const frame_results);
+
int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
@@ -897,12 +1077,12 @@ aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags);
-void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags);
-
int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+void av1_set_frame_size(AV1_COMP *cpi, int width, int height);
+
int av1_update_entropy(AV1_COMP *cpi, int update);
int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
@@ -916,26 +1096,23 @@ int av1_get_quantizer(struct AV1_COMP *cpi);
int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
-int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n);
-int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n);
+// AV1 uses 10,000,000 ticks per second as its timestamp unit.
+#define TICKS_PER_SEC 10000000LL
-static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
- return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
- (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+static INLINE int64_t timebase_units_to_ticks(const aom_rational_t *timebase,
+ int64_t n) {
+ return n * TICKS_PER_SEC * timebase->num / timebase->den;
}
-static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi,
- MV_REFERENCE_FRAME ref_frame) {
- return (ref_frame >= LAST_FRAME)
- ? cpi->remapped_ref_idx[ref_frame - LAST_FRAME]
- : INVALID_IDX;
+static INLINE int64_t ticks_to_timebase_units(const aom_rational_t *timebase,
+ int64_t n) {
+ const int64_t round = TICKS_PER_SEC * timebase->num / 2 - 1;
+ return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC;
}
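// A worked example: with a 30 fps timebase { num = 1, den = 30 }, one
// timebase unit converts as
//   timebase_units_to_ticks(tb, 1) = 1 * 10000000 * 1 / 30 = 333333 ticks
// and the rounding constant makes the reverse conversion a round trip:
//   ticks_to_timebase_units(tb, 333333)
//     = (333333 * 30 + 4999999) / 1 / 10000000 = 1.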
-static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi,
- MV_REFERENCE_FRAME ref_frame) {
- const AV1_COMMON *const cm = &cpi->common;
- const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
- return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+ return frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
}
// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
@@ -944,33 +1121,37 @@ static INLINE int av1_use_hash_me(const AV1_COMMON *const cm) {
}
static INLINE hash_table *av1_get_ref_frame_hash_map(
- const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
- const AV1_COMMON *const cm = &cpi->common;
- const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
- return buf_idx != INVALID_IDX
- ? &cm->buffer_pool->frame_bufs[buf_idx].hash_table
- : NULL;
+ const AV1_COMMON *cm, MV_REFERENCE_FRAME ref_frame) {
+ const int map_idx = get_ref_frame_map_idx(cm, ref_frame);
+ RefCntBuffer *buf =
+ (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : NULL;
+ return buf ? &buf->hash_table : NULL;
}
-static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
- const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
- const AV1_COMMON *const cm = &cpi->common;
- const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
- return buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf
- : NULL;
+static INLINE const YV12_BUFFER_CONFIG *get_ref_frame_yv12_buf(
+ const AV1_COMMON *const cm, MV_REFERENCE_FRAME ref_frame) {
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ return buf != NULL ? &buf->buf : NULL;
}
-static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) {
- AV1_COMMON *const cm = &cpi->common;
+static INLINE int enc_is_ref_frame_buf(const AV1_COMMON *const cm,
+ const RefCntBuffer *const frame_buf) {
MV_REFERENCE_FRAME ref_frame;
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
- if (buf_idx == INVALID_IDX) continue;
- if (frame_buf == &cm->buffer_pool->frame_bufs[buf_idx]) break;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame);
+ if (buf == NULL) continue;
+ if (frame_buf == buf) break;
}
return (ref_frame <= ALTREF_FRAME);
}
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, RefCntBuffer *buf) {
+ assert(buf != NULL);
+ ensure_mv_buffer(buf, cm);
+ buf->width = cm->width;
+ buf->height = cm->height;
+}
+
// Token buffer is only used for palette tokens.
static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
int sb_size_log2,
@@ -1026,10 +1207,10 @@ static INLINE int is_altref_enabled(const AV1_COMP *const cpi) {
static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
MV_REFERENCE_FRAME ref0,
MV_REFERENCE_FRAME ref1) {
- xd->block_refs[0] =
- &cm->current_frame.frame_refs[ref0 >= LAST_FRAME ? ref0 - LAST_FRAME : 0];
- xd->block_refs[1] =
- &cm->current_frame.frame_refs[ref1 >= LAST_FRAME ? ref1 - LAST_FRAME : 0];
+ xd->block_ref_scale_factors[0] =
+ get_ref_scale_factors_const(cm, ref0 >= LAST_FRAME ? ref0 : 1);
+ xd->block_ref_scale_factors[1] =
+ get_ref_scale_factors_const(cm, ref1 >= LAST_FRAME ? ref1 : 1);
}
static INLINE int get_chessboard_index(int frame_index) {
@@ -1042,6 +1223,8 @@ static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
void av1_new_framerate(AV1_COMP *cpi, double framerate);
+void av1_setup_frame_size(AV1_COMP *cpi);
+
#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
// Returns 1 if a frame is scaled and 0 otherwise.
@@ -1062,6 +1245,48 @@ static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
cm->current_frame.frame_type == KEY_FRAME);
}
+// Lighter version of set_offsets that only sets the mode info
+// pointers.
+static INLINE void set_mode_info_offsets(const AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int idx_str = xd->mi_stride * mi_row + mi_col;
+ xd->mi = cm->mi_grid_visible + idx_str;
+ xd->mi[0] = cm->mi + idx_str;
+ x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+}
+
+// Check to see if the given partition size is allowed for a specified number
+// of mi block rows and columns remaining in the image.
+// If not, then return the largest allowed partition size.
+static INLINE BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+ int cols_left, int *bh, int *bw) {
+ int int_size = (int)bsize;
+ if (rows_left <= 0 || cols_left <= 0) {
+ return AOMMIN(bsize, BLOCK_8X8);
+ } else {
+ for (; int_size > 0; int_size -= 3) {
+ *bh = mi_size_high[int_size];
+ *bw = mi_size_wide[int_size];
+ if ((*bh <= rows_left) && (*bw <= cols_left)) {
+ break;
+ }
+ }
+ }
+ return (BLOCK_SIZE)int_size;
+}
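// A usage sketch (assuming the AV1 BLOCK_SIZE enum, where square sizes sit
// 3 apart, hence the int_size -= 3 step): at a frame edge with
// rows_left = 4 and cols_left = 16 (in mi units), a request for
// BLOCK_64X64 shrinks 64x64 -> 32x32 -> 16x16 until the block fits:
//   int bh, bw;
//   BLOCK_SIZE sz = find_partition_size(BLOCK_64X64, 4, 16, &bh, &bw);
//   // sz == BLOCK_16X16, bh == 4, bw == 4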
+
+static const uint8_t av1_ref_frame_flag_list[REF_FRAMES] = { 0,
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };
+
// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
// function, the memory must be freed by the caller. Both the buf member of the
@@ -1073,6 +1298,80 @@ static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
// field.
aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi);
+#if CONFIG_COLLECT_PARTITION_STATS == 2
+static INLINE void av1_print_partition_stats(PartitionStats *part_stats) {
+ FILE *f = fopen("partition_stats.csv", "w");
+ if (!f) {
+ return;
+ }
+
+ fprintf(f, "bsize,redo,");
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "decision_%d,", part);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "attempt_%d,", part);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "time_%d,", part);
+ }
+ fprintf(f, "\n");
+
+ const int bsizes[6] = { 128, 64, 32, 16, 8, 4 };
+
+ for (int bsize_idx = 0; bsize_idx < 6; bsize_idx++) {
+ fprintf(f, "%d,%d,", bsizes[bsize_idx], part_stats->partition_redo);
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "%d,", part_stats->partition_decisions[bsize_idx][part]);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "%d,", part_stats->partition_attempts[bsize_idx][part]);
+ }
+ for (int part = 0; part < EXT_PARTITION_TYPES; part++) {
+ fprintf(f, "%ld,", part_stats->partition_times[bsize_idx][part]);
+ }
+ fprintf(f, "\n");
+ }
+ fclose(f);
+}
+
+static INLINE int av1_get_bsize_idx_for_part_stats(BLOCK_SIZE bsize) {
+ assert(bsize == BLOCK_128X128 || bsize == BLOCK_64X64 ||
+ bsize == BLOCK_32X32 || bsize == BLOCK_16X16 || bsize == BLOCK_8X8 ||
+ bsize == BLOCK_4X4);
+ switch (bsize) {
+ case BLOCK_128X128: return 0;
+ case BLOCK_64X64: return 1;
+ case BLOCK_32X32: return 2;
+ case BLOCK_16X16: return 3;
+ case BLOCK_8X8: return 4;
+ case BLOCK_4X4: return 5;
+ default: assert(0 && "Invalid bsize for partition_stats."); return -1;
+ }
+}
+#endif
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+static INLINE void start_timing(AV1_COMP *cpi, int component) {
+ aom_usec_timer_start(&cpi->component_timer[component]);
+}
+static INLINE void end_timing(AV1_COMP *cpi, int component) {
+ aom_usec_timer_mark(&cpi->component_timer[component]);
+ cpi->frame_component_time[component] +=
+ aom_usec_timer_elapsed(&cpi->component_timer[component]);
+}
+static INLINE char const *get_frame_type_enum(int type) {
+ switch (type) {
+ case 0: return "KEY_FRAME";
+ case 1: return "INTER_FRAME";
+ case 2: return "INTRA_ONLY_FRAME";
+ case 3: return "S_FRAME";
+ default: assert(0);
+ }
+ return "error";
+}
+#endif
+
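/* Hypothetical call site for the helpers above; `encode_frame_time` is an
 * illustrative component id, not a name taken from this change: */
#if CONFIG_COLLECT_COMPONENT_TIMING
  start_timing(cpi, encode_frame_time);
  /* ... the stage being measured ... */
  end_timing(cpi, encode_frame_time);
  /* cpi->frame_component_time[encode_frame_time] now holds the accumulated
   * microseconds for this frame. */
#endif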
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/libaom/av1/encoder/encodetxb.c b/libaom/av1/encoder/encodetxb.c
index a0c6ec1..37f4bb9 100644
--- a/libaom/av1/encoder/encodetxb.c
+++ b/libaom/av1/encoder/encodetxb.c
@@ -76,21 +76,12 @@ void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); }
void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
int mi_row, int mi_col) {
const AV1_COMMON *const cm = &cpi->common;
- const int num_planes = av1_num_planes(cm);
int mib_size_log2 = cm->seq_params.mib_size_log2;
int stride = (cm->mi_cols >> mib_size_log2) + 1;
int offset = (mi_row >> mib_size_log2) * stride + (mi_col >> mib_size_log2);
- CB_COEFF_BUFFER *coeff_buf = &cpi->coeff_buffer_base[offset];
- const int txb_offset = x->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ x->mbmi_ext->cb_coef_buff = &cpi->coeff_buffer_base[offset];
+ x->mbmi_ext->cb_offset = x->cb_offset;
assert(x->cb_offset < (1 << num_pels_log2_lookup[cm->seq_params.sb_size]));
- for (int plane = 0; plane < num_planes; ++plane) {
- x->mbmi_ext->tcoeff[plane] = coeff_buf->tcoeff[plane] + x->cb_offset;
- x->mbmi_ext->eobs[plane] = coeff_buf->eobs[plane] + txb_offset;
- x->mbmi_ext->txb_skip_ctx[plane] =
- coeff_buf->txb_skip_ctx[plane] + txb_offset;
- x->mbmi_ext->dc_sign_ctx[plane] =
- coeff_buf->dc_sign_ctx[plane] + txb_offset;
- }
}
static void write_golomb(aom_writer *w, int level) {
@@ -284,20 +275,16 @@ static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
return av1_cost_literal(1);
}
-static INLINE int get_br_cost(tran_low_t abs_qc, int ctx,
- const int *coeff_lps) {
- const tran_low_t min_level = 1 + NUM_BASE_LEVELS;
- const tran_low_t max_level = 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE;
- (void)ctx;
- if (abs_qc >= min_level) {
- if (abs_qc >= max_level) {
- return coeff_lps[COEFF_BASE_RANGE]; // COEFF_BASE_RANGE * cost0;
- } else {
- return coeff_lps[(abs_qc - min_level)]; // * cost0 + cost1;
- }
- }
- return 0;
-}
+static const int golomb_bits_cost[32] = {
+ 0, 512, 512 * 3, 512 * 3, 512 * 5, 512 * 5, 512 * 5, 512 * 5,
+ 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7, 512 * 7,
+ 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9,
+ 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9, 512 * 9
+};
+static const int golomb_cost_diff[32] = {
+ 0, 512, 512 * 2, 0, 512 * 2, 0, 0, 0, 512 * 2, 0, 0, 0, 0, 0, 0, 0,
+ 512 * 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
static INLINE int get_golomb_cost(int abs_qc) {
if (abs_qc >= 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE) {
@@ -308,6 +295,32 @@ static INLINE int get_golomb_cost(int abs_qc) {
return 0;
}
+static INLINE int get_br_cost_with_diff(tran_low_t level, const int *coeff_lps,
+ int *diff) {
+ const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ int golomb_bits = 0;
+ if (level <= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS)
+ *diff += coeff_lps[base_range + COEFF_BASE_RANGE + 1];
+
+ if (level >= COEFF_BASE_RANGE + 1 + NUM_BASE_LEVELS) {
+ int r = level - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+ if (r < 32) {
+ golomb_bits = golomb_bits_cost[r];
+ *diff += golomb_cost_diff[r];
+ } else {
+ golomb_bits = get_golomb_cost(level);
+ *diff += (r & (r - 1)) == 0 ? 1024 : 0;
+ }
+ }
+
+ return coeff_lps[base_range] + golomb_bits;
+}
+
+static INLINE int get_br_cost(tran_low_t level, const int *coeff_lps) {
+ const int base_range = AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ return coeff_lps[base_range] + get_golomb_cost(level);
+}
+
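/* The two tables above are Exp-Golomb bit counts scaled by the 512-per-bit
 * cost unit (1 << AV1_PROB_COST_SHIFT): a residual r >= 1 codes in
 * 2 * floor(log2(r)) + 1 bits.  A self-check sketch under that assumption: */
#include <assert.h>
static void check_golomb_tables(void) {
  int prev = 0;
  for (int r = 1; r < 32; ++r) {
    int msb = 0;
    for (int v = r; v > 1; v >>= 1) ++msb;       /* floor(log2(r)) */
    const int cost = 512 * (2 * msb + 1);        /* bits times cost-per-bit */
    assert(cost == golomb_bits_cost[r]);
    assert(cost - prev == golomb_cost_diff[r]);  /* 1024 bumps at powers of 2,
                                                    matching the (r & (r - 1))
                                                    fallback below */
    prev = cost;
  }
}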
static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
const int is_eob, const TxbInfo *const txb_info,
const LV_MAP_COEFF_COST *const txb_costs,
@@ -331,8 +344,7 @@ static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
if (abs_qc > NUM_BASE_LEVELS) {
const int ctx =
get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class);
- cost += get_br_cost(abs_qc, ctx, txb_costs->lps_cost[ctx]);
- cost += get_golomb_cost(abs_qc);
+ cost += get_br_cost(abs_qc, txb_costs->lps_cost[ctx]);
}
}
return cost;
@@ -464,8 +476,6 @@ void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
const int stride = width + TX_PAD_HOR;
uint8_t *ls = levels;
- memset(levels - TX_PAD_TOP * stride, 0,
- sizeof(*levels) * TX_PAD_TOP * stride);
memset(levels + stride * height, 0,
sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
@@ -554,14 +564,15 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
break;
}
- if (k_eob_offset_bits[eob_pt] > 0) {
+ const int eob_offset_bits = k_eob_offset_bits[eob_pt];
+ if (eob_offset_bits > 0) {
const int eob_ctx = eob_pt - 3;
- int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+ int eob_shift = eob_offset_bits - 1;
int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
aom_write_symbol(w, bit,
ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2);
- for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) {
- eob_shift = k_eob_offset_bits[eob_pt] - 1 - i;
+ for (int i = 1; i < eob_offset_bits; i++) {
+ eob_shift = eob_offset_bits - 1 - i;
bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
aom_write_bit(w, bit);
}
@@ -588,12 +599,11 @@ void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
// level is above 1.
const int base_range = level - 1 - NUM_BASE_LEVELS;
const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ aom_cdf_prob *cdf =
+ ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx];
for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
- aom_write_symbol(
- w, k,
- ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx],
- BR_CDF_SIZE);
+ aom_write_symbol(w, k, cdf, BR_CDF_SIZE);
if (k < BR_CDF_SIZE - 1) break;
}
}
@@ -628,10 +638,18 @@ static void write_coeffs_txb_wrap(const AV1_COMMON *cm, MACROBLOCK *x,
aom_writer *w, int plane, int block,
int blk_row, int blk_col, TX_SIZE tx_size) {
MACROBLOCKD *xd = &x->e_mbd;
- tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
- uint16_t eob = x->mbmi_ext->eobs[plane][block];
- TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
- x->mbmi_ext->dc_sign_ctx[plane][block] };
+ const int txb_offset =
+ x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ tran_low_t *tcoeff_txb =
+ x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset;
+ uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset;
+ uint8_t *txb_skip_ctx_txb =
+ x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset;
+ int *dc_sign_ctx_txb =
+ x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset;
+ tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block);
+ uint16_t eob = eob_txb[block];
+ TXB_CTX txb_ctx = { txb_skip_ctx_txb[block], dc_sign_ctx_txb[block] };
av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, eob,
&txb_ctx);
}
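/* Offset units in the wrapper above (assuming TX_SIZE_W_MIN == TX_SIZE_H_MIN
 * == 4): cb_offset counts pels from the superblock start and so indexes the
 * per-pel tcoeff planes directly, while eobs/txb_skip_ctx/dc_sign_ctx keep
 * one entry per minimum 4x4 transform block, hence the division by
 * TX_SIZE_W_MIN * TX_SIZE_H_MIN (= 16). */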
@@ -745,7 +763,8 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
- const int(*lps_cost)[COEFF_BASE_RANGE + 1] = coeff_costs->lps_cost;
+ const int(*lps_cost)[COEFF_BASE_RANGE + 1 + COEFF_BASE_RANGE + 1] =
+ coeff_costs->lps_cost;
int c = eob - 1;
{
const int pos = scan[c];
@@ -758,11 +777,8 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
if (v) {
// sign bit cost
if (level > NUM_BASE_LEVELS) {
- const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
- const int base_range =
- AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
- cost += lps_cost[ctx][base_range];
- cost += get_golomb_cost(level);
+ const int ctx = get_br_ctx_eob(pos, bwl, tx_class);
+ cost += get_br_cost(level, lps_cost[ctx]);
}
if (c) {
cost += av1_cost_literal(1);
@@ -774,7 +790,7 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
}
}
}
- const int(*base_cost)[4] = coeff_costs->base_cost;
+ const int(*base_cost)[8] = coeff_costs->base_cost;
for (c = eob - 2; c >= 1; --c) {
const int pos = scan[c];
const int coeff_ctx = coeff_contexts[pos];
@@ -786,10 +802,7 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
cost += av1_cost_literal(1);
if (level > NUM_BASE_LEVELS) {
const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
- const int base_range =
- AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
- cost += lps_cost[ctx][base_range];
- cost += get_golomb_cost(level);
+ cost += get_br_cost(level, lps_cost[ctx]);
}
}
cost += cost0;
@@ -809,10 +822,7 @@ static AOM_FORCE_INLINE int warehouse_efficients_txb(
cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
if (level > NUM_BASE_LEVELS) {
const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
- const int base_range =
- AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
- cost += lps_cost[ctx][base_range];
- cost += get_golomb_cost(level);
+ cost += get_br_cost(level, lps_cost[ctx]);
}
}
}
@@ -1284,20 +1294,47 @@ static int hbt_create_hashes(TxbInfo *txb_info,
txb_eob_costs, p, block, fast_mode, rate_cost);
}
-static AOM_FORCE_INLINE int get_coeff_cost_simple(
+static AOM_FORCE_INLINE int get_two_coeff_cost_simple(
int ci, tran_low_t abs_qc, int coeff_ctx,
const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class,
- const uint8_t *levels) {
+ const uint8_t *levels, int *cost_low) {
// this simple version assumes the coeff's scan_idx is not DC (scan_idx != 0)
// and not the last (scan_idx != eob - 1)
assert(ci > 0);
int cost = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+ int diff = 0;
+ if (abs_qc <= 3) diff = txb_costs->base_cost[coeff_ctx][abs_qc + 4];
if (abs_qc) {
cost += av1_cost_literal(1);
if (abs_qc > NUM_BASE_LEVELS) {
const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
- cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
- cost += get_golomb_cost(abs_qc);
+ int brcost_diff = 0;
+ cost += get_br_cost_with_diff(abs_qc, txb_costs->lps_cost[br_ctx],
+ &brcost_diff);
+ diff += brcost_diff;
+ }
+ }
+ *cost_low = cost - diff;
+
+ return cost;
+}
+
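/* Note on the widened cost rows (an inference, not stated in this hunk): the
 * base_cost rows grow from 4 to 8 entries in this change, and entries [4..7]
 * appear to cache cost(level) - cost(level - 1) deltas (including the
 * sign-bit term for the level 1 -> 0 drop), so the rate of the "low"
 * candidate abs_qc - 1 falls out of one extra table read instead of a second
 * full cost evaluation. */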
+static INLINE int get_coeff_cost_eob(int ci, tran_low_t abs_qc, int sign,
+ int coeff_ctx, int dc_sign_ctx,
+ const LV_MAP_COEFF_COST *txb_costs,
+ int bwl, TX_CLASS tx_class) {
+ int cost = 0;
+ cost += txb_costs->base_eob_cost[coeff_ctx][AOMMIN(abs_qc, 3) - 1];
+ if (abs_qc != 0) {
+ if (ci == 0) {
+ cost += txb_costs->dc_sign_cost[dc_sign_ctx][sign];
+ } else {
+ cost += av1_cost_literal(1);
+ }
+ if (abs_qc > NUM_BASE_LEVELS) {
+ int br_ctx;
+ br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
+ cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
}
}
return cost;
@@ -1322,9 +1359,12 @@ static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
cost += av1_cost_literal(1);
}
if (abs_qc > NUM_BASE_LEVELS) {
- const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
- cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
- cost += get_golomb_cost(abs_qc);
+ int br_ctx;
+ if (is_last)
+ br_ctx = get_br_ctx_eob(ci, bwl, tx_class);
+ else
+ br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
+ cost += get_br_cost(abs_qc, txb_costs->lps_cost[br_ctx]);
}
}
return cost;
@@ -1368,13 +1408,23 @@ static INLINE void update_coeff_general(
const int64_t rd = RDCOST(rdmult, rate, dist);
tran_low_t qc_low, dqc_low;
- get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
- const tran_low_t abs_qc_low = abs_qc - 1;
- const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
- const int rate_low =
- get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
- dc_sign_ctx, txb_costs, bwl, tx_class, levels);
- const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+ tran_low_t abs_qc_low;
+ int64_t dist_low, rd_low;
+ int rate_low;
+ if (abs_qc == 1) {
+ abs_qc_low = qc_low = dqc_low = 0;
+ dist_low = dist0;
+ rate_low = txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ abs_qc_low = abs_qc - 1;
+ dist_low = get_coeff_dist(tqc, dqc_low, shift);
+ rate_low =
+ get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+ }
+
+ rd_low = RDCOST(rdmult, rate_low, dist_low);
if (rd_low < rd) {
qcoeff[ci] = qc_low;
dqcoeff[ci] = dqc_low;
@@ -1408,28 +1458,28 @@ static AOM_FORCE_INLINE void update_coeff_simple(
*accu_rate += txb_costs->base_cost[coeff_ctx][0];
} else {
const tran_low_t abs_qc = abs(qc);
- const tran_low_t tqc = tcoeff[ci];
- const tran_low_t dqc = dqcoeff[ci];
- const int rate = get_coeff_cost_simple(ci, abs_qc, coeff_ctx, txb_costs,
- bwl, tx_class, levels);
- if (abs(dqc) < abs(tqc)) {
+ const tran_low_t abs_tqc = abs(tcoeff[ci]);
+ const tran_low_t abs_dqc = abs(dqcoeff[ci]);
+ int rate_low = 0;
+ const int rate = get_two_coeff_cost_simple(
+ ci, abs_qc, coeff_ctx, txb_costs, bwl, tx_class, levels, &rate_low);
+ if (abs_dqc < abs_tqc) {
*accu_rate += rate;
return;
}
- const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+
+ const int64_t dist = get_coeff_dist(abs_tqc, abs_dqc, shift);
const int64_t rd = RDCOST(rdmult, rate, dist);
- const int sign = (qc < 0) ? 1 : 0;
- tran_low_t qc_low, dqc_low;
- get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
const tran_low_t abs_qc_low = abs_qc - 1;
- const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
- const int rate_low = get_coeff_cost_simple(
- ci, abs_qc_low, coeff_ctx, txb_costs, bwl, tx_class, levels);
+ const tran_low_t abs_dqc_low = (abs_qc_low * dqv) >> shift;
+ const int64_t dist_low = get_coeff_dist(abs_tqc, abs_dqc_low, shift);
const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+
if (rd_low < rd) {
- qcoeff[ci] = qc_low;
- dqcoeff[ci] = dqc_low;
+ const int sign = (qc < 0) ? 1 : 0;
+ qcoeff[ci] = (-sign ^ abs_qc_low) + sign;
+ dqcoeff[ci] = (-sign ^ abs_dqc_low) + sign;
levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
*accu_rate += rate_low;
} else {
@@ -1438,6 +1488,36 @@ static AOM_FORCE_INLINE void update_coeff_simple(
}
}
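/* The (-sign ^ v) + sign expressions above are the usual branchless
 * conditional negate: sign == 0 leaves v unchanged, while sign == 1 yields
 * (~v) + 1 == -v in two's complement. */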
+static INLINE void update_coeff_eob_fast(int *eob, int shift,
+ const int16_t *dequant_ptr,
+ const int16_t *scan,
+ const tran_low_t *coeff_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr) {
+  // TODO(sarahparker): make this work for aomqm
+ int eob_out = *eob;
+ int zbin[2] = { dequant_ptr[0] + ROUND_POWER_OF_TWO(dequant_ptr[0] * 70, 7),
+ dequant_ptr[1] + ROUND_POWER_OF_TWO(dequant_ptr[1] * 70, 7) };
+
+ for (int i = *eob - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (((abs_coeff << (1 + shift)) < zbin[rc != 0]) || (qcoeff == 0)) {
+ eob_out--;
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ } else {
+ break;
+ }
+ }
+
+ *eob = eob_out;
+}
+
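/* Threshold math for the fast path above (arithmetic restated, not new
 * behavior): zbin = dequant * (1 + 70/128) ~= 1.547 * dequant, and the test
 *   (abs_coeff << (1 + shift)) < zbin
 * is equivalent to abs_coeff * 2^shift < ~0.773 * dequant, so trailing
 * coefficients that a ~0.77 dead-zone would quantize to zero are trimmed and
 * the eob shrinks accordingly. */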
static AOM_FORCE_INLINE void update_coeff_eob(
int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height,
@@ -1467,40 +1547,42 @@ static AOM_FORCE_INLINE void update_coeff_eob(
int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
tran_low_t qc_low, dqc_low;
- get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
- const tran_low_t abs_qc_low = abs_qc - 1;
- const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
- const int rate_low =
- get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx,
- txb_costs, bwl, tx_class, levels);
- const int64_t rd_low =
- RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+ tran_low_t abs_qc_low;
+ int64_t dist_low, rd_low;
+ int rate_low;
+ if (abs_qc == 1) {
+ abs_qc_low = 0;
+ dqc_low = qc_low = 0;
+ dist_low = 0;
+ rate_low = txb_costs->base_cost[coeff_ctx][0];
+ rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist);
+ } else {
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ abs_qc_low = abs_qc - 1;
+ dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
+ rate_low =
+ get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx,
+ dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+ rd_low = RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+ }
int lower_level_new_eob = 0;
const int new_eob = si + 1;
- uint8_t tmp_levels[3];
- for (int ni = 0; ni < *nz_num; ++ni) {
- const int last_ci = nz_ci[ni];
- tmp_levels[ni] = levels[get_padded_idx(last_ci, bwl)];
- levels[get_padded_idx(last_ci, bwl)] = 0;
- }
-
- const int coeff_ctx_new_eob = get_lower_levels_ctx_general(
- 1, si, bwl, height, levels, ci, tx_size, tx_class);
+ const int coeff_ctx_new_eob = get_lower_levels_ctx_eob(bwl, height, si);
const int new_eob_cost =
get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
int rate_coeff_eob =
- new_eob_cost + get_coeff_cost_general(1, ci, abs_qc, sign,
- coeff_ctx_new_eob, dc_sign_ctx,
- txb_costs, bwl, tx_class, levels);
+ new_eob_cost + get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx_new_eob,
+ dc_sign_ctx, txb_costs, bwl,
+ tx_class);
int64_t dist_new_eob = dist;
int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);
if (abs_qc_low > 0) {
const int rate_coeff_eob_low =
- new_eob_cost +
- get_coeff_cost_general(1, ci, abs_qc_low, sign, coeff_ctx_new_eob,
- dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+ new_eob_cost + get_coeff_cost_eob(ci, abs_qc_low, sign,
+ coeff_ctx_new_eob, dc_sign_ctx,
+ txb_costs, bwl, tx_class);
const int64_t dist_new_eob_low = dist_low;
const int64_t rd_new_eob_low =
RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
@@ -1522,7 +1604,7 @@ static AOM_FORCE_INLINE void update_coeff_eob(
if (sharpness == 0 && rd_new_eob < rd) {
for (int ni = 0; ni < *nz_num; ++ni) {
int last_ci = nz_ci[ni];
- // levels[get_padded_idx(last_ci, bwl)] = 0;
+ levels[get_padded_idx(last_ci, bwl)] = 0;
qcoeff[last_ci] = 0;
dqcoeff[last_ci] = 0;
}
@@ -1532,10 +1614,6 @@ static AOM_FORCE_INLINE void update_coeff_eob(
*accu_dist = dist_new_eob;
lower_level = lower_level_new_eob;
} else {
- for (int ni = 0; ni < *nz_num; ++ni) {
- const int last_ci = nz_ci[ni];
- levels[get_padded_idx(last_ci, bwl)] = tmp_levels[ni];
- }
*accu_rate += rate;
*accu_dist += dist;
}
@@ -1575,35 +1653,44 @@ static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
int block, TX_SIZE tx_size, TX_TYPE tx_type,
const TXB_CTX *const txb_ctx, int *rate_cost,
- int sharpness) {
- const AV1_COMMON *cm = &cpi->common;
+ int sharpness, int fast_mode) {
MACROBLOCKD *xd = &x->e_mbd;
- const PLANE_TYPE plane_type = get_plane_type(plane);
- const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
- const TX_CLASS tx_class = tx_type_to_class[tx_type];
- const MB_MODE_INFO *mbmi = xd->mi[0];
- const struct macroblock_plane *p = &x->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
+ const struct macroblock_plane *p = &x->plane[plane];
+ const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ const int shift = av1_get_tx_scale(tx_size);
+ int eob = p->eobs[block];
+ const int16_t *dequant = p->dequant_QTX;
tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
- const int16_t *dequant = p->dequant_QTX;
+
+ if (fast_mode) {
+ update_coeff_eob_fast(&eob, shift, dequant, scan, tcoeff, qcoeff, dqcoeff);
+ p->eobs[block] = eob;
+ if (eob == 0) {
+ *rate_cost = av1_cost_skip_txb(x, txb_ctx, plane, tx_size);
+ return eob;
+ }
+ }
+
+ const AV1_COMMON *cm = &cpi->common;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const MB_MODE_INFO *mbmi = xd->mi[0];
const int bwl = get_txb_bwl(tx_size);
const int width = get_txb_wide(tx_size);
const int height = get_txb_high(tx_size);
assert(width == (1 << bwl));
const int is_inter = is_inter_block(mbmi);
- const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
- const int16_t *scan = scan_order->scan;
const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
const int eob_multi_size = txsize_log2_minus4[tx_size];
const LV_MAP_EOB_COST *txb_eob_costs =
&x->eob_costs[eob_multi_size][plane_type];
- const int shift = av1_get_tx_scale(tx_size);
- const int64_t rdmult =
- ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
- 2) >>
+ const int rshift =
(sharpness +
(cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
? 7 - mbmi->segment_id
@@ -1612,17 +1699,21 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
cpi->oxcf.deltaq_mode > NO_DELTA_Q && x->sb_energy_level < 0
? (3 - x->sb_energy_level)
: 0));
+ const int64_t rdmult =
+ (((int64_t)x->rdmult *
+ (plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8)))) +
+ 2) >>
+ rshift;
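/* Note on the (int64_t) widening above: with 12-bit input the
 * (2 * (xd->bd - 8)) scaling is a shift of 8, so the 32-bit product
 * x->rdmult * plane_rd_mult << 8 could overflow before the right shift;
 * doing the multiply in 64 bits keeps rdmult exact. */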
uint8_t levels_buf[TX_PAD_2D];
uint8_t *const levels = set_levels(levels_buf, width);
- av1_txb_init_levels(qcoeff, width, height, levels);
+ if (eob > 1) av1_txb_init_levels(qcoeff, width, height, levels);
// TODO(angirbird): check iqmatrix
const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
- int eob = p->eobs[block];
const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
int accu_rate = eob_cost;
int64_t accu_dist = 0;
@@ -1642,11 +1733,10 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
--si;
} else {
assert(abs_qc == 1);
- const int coeff_ctx = get_lower_levels_ctx_general(
- 1, si, bwl, height, levels, ci, tx_size, tx_class);
- accu_rate += get_coeff_cost_general(1, ci, abs_qc, sign, coeff_ctx,
- txb_ctx->dc_sign_ctx, txb_costs, bwl,
- tx_class, levels);
+ const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, si);
+ accu_rate +=
+ get_coeff_cost_eob(ci, abs_qc, sign, coeff_ctx, txb_ctx->dc_sign_ctx,
+ txb_costs, bwl, tx_class);
const tran_low_t tqc = tcoeff[ci];
const tran_low_t dqc = dqcoeff[ci];
const int64_t dist = get_coeff_dist(tqc, dqc, shift);
@@ -1657,7 +1747,7 @@ int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \
case tx_class_literal: \
- for (; si >= 0 && nz_num <= max_nz_num; --si) { \
+ for (; si >= 0 && nz_num <= max_nz_num && !fast_mode; --si) { \
update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \
tx_size, tx_class_literal, bwl, height, \
txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
@@ -1750,7 +1840,8 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
const int shift = av1_get_tx_scale(tx_size);
const int64_t rdmult =
- ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
+ (((int64_t)x->rdmult * plane_rd_mult[is_inter][plane_type]
+ << (2 * (xd->bd - 8))) +
2) >>
2;
uint8_t levels_buf[TX_PAD_2D];
@@ -1763,10 +1854,9 @@ int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
assert(width == (1 << bwl));
const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
TxbInfo txb_info = {
- qcoeff, levels, dqcoeff, tcoeff, dequant, shift,
- tx_size, txs_ctx, tx_type, bwl, width, height,
- eob, seg_eob, scan_order, txb_ctx, rdmult, &cm->coeff_ctx_table,
- iqmatrix, tx_type_cost,
+ qcoeff, levels, dqcoeff, tcoeff, dequant, shift, tx_size,
+ txs_ctx, tx_type, bwl, width, height, eob, seg_eob,
+ scan_order, txb_ctx, rdmult, iqmatrix, tx_type_cost,
};
// Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls
@@ -1918,15 +2008,22 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
2);
}
- x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx;
- x->mbmi_ext->eobs[plane][block] = eob;
+ const int txb_offset =
+ x->mbmi_ext->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+ uint16_t *eob_txb = x->mbmi_ext->cb_coef_buff->eobs[plane] + txb_offset;
+ uint8_t *txb_skip_ctx_txb =
+ x->mbmi_ext->cb_coef_buff->txb_skip_ctx[plane] + txb_offset;
+ txb_skip_ctx_txb[block] = txb_ctx.txb_skip_ctx;
+ eob_txb[block] = eob;
if (eob == 0) {
av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row);
return;
}
- tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+ tran_low_t *tcoeff_txb =
+ x->mbmi_ext->cb_coef_buff->tcoeff[plane] + x->mbmi_ext->cb_offset;
+ tran_low_t *tcoeff = BLOCK_OFFSET(tcoeff_txb, block);
const int segment_id = mbmi->segment_id;
const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
@@ -2019,7 +2116,9 @@ void av1_update_and_record_txb_context(int plane, int block, int blk_row,
#endif // CONFIG_ENTROPY_STATS
if (allow_update_cdf)
update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
- x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
+ int *dc_sign_ctx_txb =
+ x->mbmi_ext->cb_coef_buff->dc_sign_ctx[plane] + txb_offset;
+ dc_sign_ctx_txb[block] = dc_sign_ctx;
}
const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob);
diff --git a/libaom/av1/encoder/encodetxb.h b/libaom/av1/encoder/encodetxb.h
index 4ee41ce..0682590 100644
--- a/libaom/av1/encoder/encodetxb.h
+++ b/libaom/av1/encoder/encodetxb.h
@@ -42,7 +42,6 @@ typedef struct TxbInfo {
const SCAN_ORDER *scan_order;
TXB_CTX *txb_ctx;
int64_t rdmult;
- const LV_MAP_CTX_TABLE *coeff_ctx_table;
const qm_val_t *iqmatrix;
int tx_type_cost;
} TxbInfo;
@@ -79,7 +78,7 @@ void hbt_destroy();
int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
int block, TX_SIZE tx_size, TX_TYPE tx_type,
const TXB_CTX *const txb_ctx, int *rate_cost,
- int sharpness);
+ int sharpness, int fast_mode);
// These numbers are empirically obtained.
static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
diff --git a/libaom/av1/encoder/ethread.c b/libaom/av1/encoder/ethread.c
index a3fb93e..c8c2107 100644
--- a/libaom/av1/encoder/ethread.c
+++ b/libaom/av1/encoder/ethread.c
@@ -164,10 +164,7 @@ void av1_row_mt_sync_mem_alloc(AV1RowMTSync *row_mt_sync, AV1_COMMON *cm,
aom_malloc(sizeof(*row_mt_sync->cur_col) * rows));
// Set up nsync.
- if (cm->seq_params.mib_size_log2 == 4)
- row_mt_sync->sync_range = 2;
- else
- row_mt_sync->sync_range = 1;
+ row_mt_sync->sync_range = 1;
}
// Deallocate row based multi-threading synchronization related mutex and data
@@ -239,26 +236,34 @@ static void switch_tile_and_get_next_job(AV1_COMP *const cpi, int *cur_tile_id,
int tile_index = tile_row * tile_cols + tile_col;
TileDataEnc *this_tile = &cpi->tile_data[tile_index];
AV1RowMTInfo *row_mt_info = &this_tile->row_mt_info;
- int num_mis_to_encode =
- this_tile->tile_info.mi_row_end - row_mt_info->current_mi_row;
-
- // Tile to be processed by this thread is selected on the basis of
- // availability of jobs:
- // 1) If jobs are available, tile to be processed is chosen on the
- // basis of minimum number of threads working for that tile. If two or
- // more tiles have same number of threads working for them, then the tile
- // with maximum number of jobs available will be chosen.
- // 2) If no jobs are available, then end_of_frame is reached.
- if (num_mis_to_encode > 0) {
- int num_threads_working = row_mt_info->num_threads_working;
- if (num_threads_working < min_num_threads_working) {
- min_num_threads_working = num_threads_working;
- max_mis_to_encode = 0;
- }
- if (num_threads_working == min_num_threads_working &&
- num_mis_to_encode > max_mis_to_encode) {
- tile_id = tile_index;
- max_mis_to_encode = num_mis_to_encode;
+ int num_sb_rows_in_tile =
+ av1_get_sb_rows_in_tile(cm, this_tile->tile_info);
+ int num_sb_cols_in_tile =
+ av1_get_sb_cols_in_tile(cm, this_tile->tile_info);
+ int theoretical_limit_on_threads =
+ AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
+ int num_threads_working = row_mt_info->num_threads_working;
+ if (num_threads_working < theoretical_limit_on_threads) {
+ int num_mis_to_encode =
+ this_tile->tile_info.mi_row_end - row_mt_info->current_mi_row;
+
+      // The tile to be processed by this thread is selected on the basis of
+      // job availability:
+      // 1) If jobs are available, the tile is chosen on the basis of the
+      // minimum number of threads working on it. If two or more tiles have
+      // the same number of threads working on them, the tile with the
+      // maximum number of jobs available is chosen.
+      // 2) If no jobs are available, end_of_frame has been reached.
+ if (num_mis_to_encode > 0) {
+ if (num_threads_working < min_num_threads_working) {
+ min_num_threads_working = num_threads_working;
+ max_mis_to_encode = 0;
+ }
+ if (num_threads_working == min_num_threads_working &&
+ num_mis_to_encode > max_mis_to_encode) {
+ tile_id = tile_index;
+ max_mis_to_encode = num_mis_to_encode;
+ }
}
}
}
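/* Why AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile) bounds the
 * useful thread count (reasoning restated): with sync_range == 1 each SB row
 * trails the row above by two SB columns (it needs its top-right neighbor
 * finished), so a tile 7 SBs wide keeps at most (7 + 1) >> 1 == 4 rows in
 * flight -- row 0 at col 6, row 1 at col 4, row 2 at col 2, row 3 at col 0 --
 * and never more rows than the tile actually has. */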
@@ -313,9 +318,14 @@ static int enc_row_mt_worker_hook(void *arg1, void *unused) {
td->mb.e_mbd.tile_ctx = td->tctx;
td->mb.tile_pb_ctx = &this_tile->tctx;
- td->mb.backup_tile_ctx = &this_tile->backup_tctx;
- if (current_mi_row == this_tile->tile_info.mi_row_start)
+ if (this_tile->allow_update_cdf) {
+ td->mb.row_ctx = this_tile->row_ctx;
+ if (current_mi_row == this_tile->tile_info.mi_row_start)
+ memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+ } else {
memcpy(td->mb.e_mbd.tile_ctx, &this_tile->tctx, sizeof(FRAME_CONTEXT));
+ }
+
av1_init_above_context(cm, &td->mb.e_mbd, tile_row);
// Disable exhaustive search speed features for row based multi-threading of
@@ -356,10 +366,8 @@ static int enc_worker_hook(void *arg1, void *unused) {
TileDataEnc *const this_tile =
&cpi->tile_data[tile_row * cm->tile_cols + tile_col];
- thread_data->td->tctx = &this_tile->tctx;
- thread_data->td->mb.e_mbd.tile_ctx = thread_data->td->tctx;
- thread_data->td->mb.tile_pb_ctx = thread_data->td->tctx;
- thread_data->td->mb.backup_tile_ctx = &this_tile->backup_tctx;
+ thread_data->td->mb.e_mbd.tile_ctx = &this_tile->tctx;
+ thread_data->td->mb.tile_pb_ctx = &this_tile->tctx;
av1_encode_tile(cpi, thread_data->td, tile_row, tile_col);
}
@@ -386,7 +394,7 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
}
#endif
- for (int i = 0; i < num_workers; i++) {
+ for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *const worker = &cpi->workers[i];
EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
@@ -397,7 +405,7 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
thread_data->cpi = cpi;
thread_data->thread_id = i;
- if (i < num_workers - 1) {
+ if (i > 0) {
// Allocate thread data.
CHECK_MEM_ERROR(cm, thread_data->td,
aom_memalign(32, sizeof(*thread_data->td)));
@@ -421,11 +429,9 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
(int32_t *)aom_memalign(
16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
CHECK_MEM_ERROR(cm, thread_data->td->inter_modes_info,
(InterModesInfo *)aom_malloc(
sizeof(*thread_data->td->inter_modes_info)));
-#endif
for (int x = 0; x < 2; x++)
for (int y = 0; y < 2; y++)
@@ -478,14 +484,14 @@ static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
static void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
const AVxWorkerInterface *const winterface = aom_get_worker_interface();
// Encode a frame
- for (int i = 0; i < num_workers; i++) {
+ for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *const worker = &cpi->workers[i];
EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
// Set the starting tile for each thread.
thread_data->start = i;
- if (i == cpi->num_workers - 1)
+ if (i == 0)
winterface->execute(worker);
else
winterface->launch(worker);
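/* Ordering note (based on the AVxWorker contract): launch() starts a worker
 * thread while execute() runs the hook synchronously on the caller, so
 * counting down and execute()-ing worker 0 last means the main thread only
 * takes its own share of work after all helper threads are already running. */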
@@ -497,7 +503,7 @@ static void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
int had_error = 0;
// Encoding ends.
- for (int i = 0; i < num_workers; i++) {
+ for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *const worker = &cpi->workers[i];
had_error |= !winterface->sync(worker);
}
@@ -508,22 +514,25 @@ static void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
}
static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) {
- for (int i = 0; i < num_workers; i++) {
+ for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *const worker = &cpi->workers[i];
EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
cpi->intrabc_used |= thread_data->td->intrabc_used;
// Accumulate counters.
- if (i < cpi->num_workers - 1) {
+ if (i > 0) {
av1_accumulate_frame_counts(&cpi->counts, thread_data->td->counts);
accumulate_rd_opt(&cpi->td, thread_data->td);
cpi->td.mb.txb_split_count += thread_data->td->mb.txb_split_count;
+#if CONFIG_SPEED_STATS
+ cpi->td.mb.tx_search_count += thread_data->td->mb.tx_search_count;
+#endif // CONFIG_SPEED_STATS
}
}
}
static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
int num_workers) {
- for (int i = 0; i < num_workers; i++) {
+ for (int i = num_workers - 1; i >= 0; i--) {
AVxWorker *const worker = &cpi->workers[i];
EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
@@ -541,9 +550,7 @@ static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
thread_data->td->mb.inter_modes_info = thread_data->td->inter_modes_info;
-#endif
for (int x = 0; x < 2; x++) {
for (int y = 0; y < 2; y++) {
memcpy(thread_data->td->hash_value_buffer[x][y],
@@ -560,7 +567,7 @@ static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
}
- if (i < num_workers - 1) {
+ if (i > 0) {
thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
for (int j = 0; j < 2; ++j) {
@@ -617,7 +624,7 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
const int tile_rows = cm->tile_rows;
MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
int num_workers = 0;
- int total_num_sb_rows = 0;
+ int total_num_threads_row_mt = 0;
int max_sb_rows = 0;
if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
@@ -632,11 +639,19 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
TileDataEnc *tile_data = &cpi->tile_data[row * cm->tile_cols + col];
int num_sb_rows_in_tile =
av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
- total_num_sb_rows += num_sb_rows_in_tile;
+ int num_sb_cols_in_tile =
+ av1_get_sb_cols_in_tile(cm, tile_data->tile_info);
+ total_num_threads_row_mt +=
+ AOMMIN((num_sb_cols_in_tile + 1) >> 1, num_sb_rows_in_tile);
max_sb_rows = AOMMAX(max_sb_rows, num_sb_rows_in_tile);
}
}
- num_workers = AOMMIN(cpi->oxcf.max_threads, total_num_sb_rows);
+  // TODO(ravi.chaudhary@ittiam.com): Currently the share of post-processing
+  // stages in the encoder is quite low, so limiting the number of threads to
+  // the theoretical limit in row-mt does not have much impact on the
+  // post-processing multi-threading stage. Need to revisit this when
+  // post-processing time starts shooting up.
+ num_workers = AOMMIN(cpi->oxcf.max_threads, total_num_threads_row_mt);
if (multi_thread_ctxt->allocated_tile_cols != tile_cols ||
multi_thread_ctxt->allocated_tile_rows != tile_rows ||
@@ -659,9 +674,7 @@ void av1_encode_tiles_row_mt(AV1_COMP *cpi) {
this_tile->row_mt_info.current_mi_row = this_tile->tile_info.mi_row_start;
this_tile->row_mt_info.num_threads_working = 0;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
av1_inter_mode_data_init(this_tile);
-#endif
av1_zero_above_context(cm, &cpi->td.mb.e_mbd,
this_tile->tile_info.mi_col_start,
this_tile->tile_info.mi_col_end, tile_row);
diff --git a/libaom/av1/encoder/firstpass.c b/libaom/av1/encoder/firstpass.c
index 5117c67..f6a0fb2 100644
--- a/libaom/av1/encoder/firstpass.c
+++ b/libaom/av1/encoder/firstpass.c
@@ -36,6 +36,7 @@
#include "av1/encoder/encodemb.h"
#include "av1/encoder/encodemv.h"
#include "av1/encoder/encoder.h"
+#include "av1/encoder/encode_strategy.h"
#include "av1/encoder/extend.h"
#include "av1/encoder/firstpass.h"
#include "av1/encoder/mcomp.h"
@@ -43,63 +44,14 @@
#include "av1/encoder/reconinter_enc.h"
#define OUTPUT_FPF 0
-#define ARF_STATS_OUTPUT 0
-#define GROUP_ADAPTIVE_MAXQ 1
-
-#define BOOST_BREAKOUT 12.5
-#define BOOST_FACTOR 12.5
-#define FACTOR_PT_LOW 0.70
-#define FACTOR_PT_HIGH 0.90
#define FIRST_PASS_Q 10.0
-#define GF_MAX_BOOST 90.0
#define INTRA_MODE_PENALTY 1024
-#define KF_MIN_FRAME_BOOST 80.0
-#define KF_MAX_FRAME_BOOST 128.0
-#define MIN_ARF_GF_BOOST 240
-#define MIN_DECAY_FACTOR 0.01
-#define MIN_KF_BOOST 300 // Minimum boost for non-static KF interval
-#define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval
#define NEW_MV_MODE_PENALTY 32
#define DARK_THRESH 64
-#define DEFAULT_GRP_WEIGHT 1.0
-#define RC_FACTOR_MIN 0.75
-#define RC_FACTOR_MAX 1.75
-#define MIN_FWD_KF_INTERVAL 8
#define NCOUNT_INTRA_THRESH 8192
#define NCOUNT_INTRA_FACTOR 3
-#define NCOUNT_FRAME_II_THRESH 5.0
-
-#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
-
-#if ARF_STATS_OUTPUT
-unsigned int arf_count = 0;
-#endif
-
-// Resets the first pass file to the given position using a relative seek from
-// the current position.
-static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
- p->stats_in = position;
-}
-
-// Read frame stats at an offset from the current position.
-static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
- if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
- (offset < 0 && p->stats_in + offset < p->stats_in_start)) {
- return NULL;
- }
-
- return &p->stats_in[offset];
-}
-
-static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
- if (p->stats_in >= p->stats_in_end) return EOF;
-
- *fps = *p->stats_in;
- ++p->stats_in;
- return 1;
-}
static void output_stats(FIRSTPASS_STATS *stats,
struct aom_codec_pkt_list *pktlist) {
@@ -131,18 +83,7 @@ static void output_stats(FIRSTPASS_STATS *stats,
#endif
}
-#if CONFIG_FP_MB_STATS
-static void output_fpmb_stats(uint8_t *this_frame_mb_stats, int stats_size,
- struct aom_codec_pkt_list *pktlist) {
- struct aom_codec_cx_pkt pkt;
- pkt.kind = AOM_CODEC_FPMB_STATS_PKT;
- pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
- pkt.data.firstpass_mb_stats.sz = stats_size * sizeof(*this_frame_mb_stats);
- aom_codec_pkt_list_add(pktlist, &pkt);
-}
-#endif
-
-static void zero_stats(FIRSTPASS_STATS *section) {
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section) {
section->frame = 0.0;
section->weight = 0.0;
section->intra_error = 0.0;
@@ -195,98 +136,8 @@ static void accumulate_stats(FIRSTPASS_STATS *section,
section->duration += frame->duration;
}
-static void subtract_stats(FIRSTPASS_STATS *section,
- const FIRSTPASS_STATS *frame) {
- section->frame -= frame->frame;
- section->weight -= frame->weight;
- section->intra_error -= frame->intra_error;
- section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
- section->coded_error -= frame->coded_error;
- section->sr_coded_error -= frame->sr_coded_error;
- section->pcnt_inter -= frame->pcnt_inter;
- section->pcnt_motion -= frame->pcnt_motion;
- section->pcnt_second_ref -= frame->pcnt_second_ref;
- section->pcnt_neutral -= frame->pcnt_neutral;
- section->intra_skip_pct -= frame->intra_skip_pct;
- section->inactive_zone_rows -= frame->inactive_zone_rows;
- section->inactive_zone_cols -= frame->inactive_zone_cols;
- section->MVr -= frame->MVr;
- section->mvr_abs -= frame->mvr_abs;
- section->MVc -= frame->MVc;
- section->mvc_abs -= frame->mvc_abs;
- section->MVrv -= frame->MVrv;
- section->MVcv -= frame->MVcv;
- section->mv_in_out_count -= frame->mv_in_out_count;
- section->new_mv_count -= frame->new_mv_count;
- section->count -= frame->count;
- section->duration -= frame->duration;
-}
-
-// Calculate the linear size relative to a baseline of 1080P
-#define BASE_SIZE 2073600.0 // 1920x1080
-static double get_linear_size_factor(const AV1_COMP *cpi) {
- const double this_area = cpi->initial_width * cpi->initial_height;
- return pow(this_area / BASE_SIZE, 0.5);
-}
-
-// Calculate an active area of the image that discounts formatting
-// bars and partially discounts other 0 energy areas.
-#define MIN_ACTIVE_AREA 0.5
-#define MAX_ACTIVE_AREA 1.0
-static double calculate_active_area(const AV1_COMP *cpi,
- const FIRSTPASS_STATS *this_frame) {
- double active_pct;
-
- active_pct =
- 1.0 -
- ((this_frame->intra_skip_pct / 2) +
- ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows));
- return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
-}
-
-// Calculate a modified Error used in distributing bits between easier and
-// harder frames.
-#define ACT_AREA_CORRECTION 0.5
-static double calculate_modified_err(const AV1_COMP *cpi,
- const TWO_PASS *twopass,
- const AV1EncoderConfig *oxcf,
- const FIRSTPASS_STATS *this_frame) {
- const FIRSTPASS_STATS *const stats = &twopass->total_stats;
- const double av_weight = stats->weight / stats->count;
- const double av_err = (stats->coded_error * av_weight) / stats->count;
- double modified_error =
- av_err * pow(this_frame->coded_error * this_frame->weight /
- DOUBLE_DIVIDE_CHECK(av_err),
- oxcf->two_pass_vbrbias / 100.0);
-
- // Correction for active area. Frames with a reduced active area
- // (eg due to formatting bars) have a higher error per mb for the
- // remaining active MBs. The correction here assumes that coding
- // 0.5N blocks of complexity 2X is a little easier than coding N
- // blocks of complexity X.
- modified_error *=
- pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
-
- return fclamp(modified_error, twopass->modified_error_min,
- twopass->modified_error_max);
-}
-
-// This function returns the maximum target rate per frame.
-static int frame_max_bits(const RATE_CONTROL *rc,
- const AV1EncoderConfig *oxcf) {
- int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
- (int64_t)oxcf->two_pass_vbrmax_section) /
- 100;
- if (max_bits < 0)
- max_bits = 0;
- else if (max_bits > rc->max_frame_bandwidth)
- max_bits = rc->max_frame_bandwidth;
-
- return (int)max_bits;
-}
-
void av1_init_first_pass(AV1_COMP *cpi) {
- zero_stats(&cpi->twopass.total_stats);
+ av1_twopass_zero_stats(&cpi->twopass.total_stats);
}
void av1_end_first_pass(AV1_COMP *cpi) {
@@ -380,13 +231,13 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
// Override the default variance function to use MSE.
v_fn_ptr.vf = get_block_variance_fn(bsize);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
}
// Center the initial step/diamond search on best mv.
- tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
- step_param, x->sadperbit16, &num00,
+ tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg[SS_CFG_SRC], &ref_mv_full,
+ &tmp_mv, step_param, x->sadperbit16, &num00,
&v_fn_ptr, ref_mv);
if (tmp_err < INT_MAX)
tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
@@ -407,9 +258,9 @@ static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
if (num00) {
--num00;
} else {
- tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
- step_param + n, x->sadperbit16, &num00,
- &v_fn_ptr, ref_mv);
+ tmp_err = cpi->diamond_search_sad(
+ x, &cpi->ss_cfg[SS_CFG_SRC], &ref_mv_full, &tmp_mv, step_param + n,
+ x->sadperbit16, &num00, &v_fn_ptr, ref_mv);
if (tmp_err < INT_MAX)
tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
if (tmp_err < INT_MAX - new_mv_mode_penalty)
@@ -439,26 +290,7 @@ static BLOCK_SIZE get_bsize(const AV1_COMMON *cm, int mb_row, int mb_col) {
}
static int find_fp_qindex(aom_bit_depth_t bit_depth) {
- int i;
-
- for (i = 0; i < QINDEX_RANGE; ++i)
- if (av1_convert_qindex_to_q(i, bit_depth) >= FIRST_PASS_Q) break;
-
- if (i == QINDEX_RANGE) i--;
-
- return i;
-}
-
-static void set_first_pass_params(AV1_COMP *cpi) {
- AV1_COMMON *const cm = &cpi->common;
- if (!cpi->refresh_alt_ref_frame && (cm->current_frame.frame_number == 0 ||
- (cpi->frame_flags & FRAMEFLAGS_KEY))) {
- cm->current_frame.frame_type = KEY_FRAME;
- } else {
- cm->current_frame.frame_type = INTER_FRAME;
- }
- // Do not use periodic key frames.
- cpi->rc.frames_to_key = INT_MAX;
+ return av1_find_qindex(FIRST_PASS_Q, bit_depth, 0, QINDEX_RANGE - 1);
}
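/* The removed linear scan and av1_find_qindex() should be equivalent here:
 * both return the smallest qindex in [0, QINDEX_RANGE - 1] whose real Q is
 * >= FIRST_PASS_Q (av1_find_qindex does it by binary search; behavior
 * inferred from the deleted loop, not from this hunk). */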
static double raw_motion_error_stdev(int *raw_motion_err_list,
@@ -486,7 +318,7 @@ static double raw_motion_error_stdev(int *raw_motion_err_list,
#define UL_INTRA_THRESH 50
#define INVALID_ROW -1
-void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
+void av1_first_pass(AV1_COMP *cpi, const int64_t ts_duration) {
int mb_row, mb_col;
MACROBLOCK *const x = &cpi->td.mb;
AV1_COMMON *const cm = &cpi->common;
@@ -501,7 +333,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
&cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
int i;
- int recon_yoffset, recon_uvoffset;
+ int recon_yoffset, src_yoffset, recon_uvoffset;
int64_t intra_error = 0;
int64_t frame_avg_wavelet_energy = 0;
int64_t coded_error = 0;
@@ -521,15 +353,15 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
int sum_in_vectors = 0;
MV lastmv = kZeroMv;
TWO_PASS *twopass = &cpi->twopass;
- int recon_y_stride, recon_uv_stride, uv_mb_height;
+ int recon_y_stride, src_y_stride, recon_uv_stride, uv_mb_height;
- YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
- YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ const YV12_BUFFER_CONFIG *const lst_yv12 =
+ get_ref_frame_yv12_buf(cm, LAST_FRAME);
+ const YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME);
YV12_BUFFER_CONFIG *const new_yv12 = &cm->cur_frame->buf;
const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
double intra_factor;
double brightness_factor;
- BufferPool *const pool = cm->buffer_pool;
const int qindex = find_fp_qindex(seq_params->bit_depth);
const int mb_scale = mi_size_wide[BLOCK_16X16];
@@ -542,12 +374,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
assert(new_yv12 != NULL);
assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
-#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- av1_zero_array(cpi->twopass.frame_mb_stats_buf, cpi->initial_mbs);
- }
-#endif
-
+ av1_setup_frame_size(cpi);
aom_clear_system_state();
xd->mi = cm->mi_grid_visible;
@@ -558,7 +385,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
brightness_factor = 0.0;
neutral_count = 0.0;
- set_first_pass_params(cpi);
+ // Do not use periodic key frames.
+ cpi->rc.frames_to_key = INT_MAX;
+
av1_set_quantizer(cm, qindex);
av1_setup_block_planes(&x->e_mbd, seq_params->subsampling_x,
@@ -589,12 +418,11 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
}
av1_init_mv_probs(cm);
- av1_init_lv_map(cm);
av1_initialize_rd_consts(cpi);
// Tiling is ignored in the first pass.
av1_tile_init(&tile, cm, 0, 0);
-
+ src_y_stride = cpi->source->y_stride;
recon_y_stride = new_yv12->y_stride;
recon_uv_stride = new_yv12->uv_stride;
uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
@@ -605,6 +433,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
// Reset above block coeffs.
xd->up_available = (mb_row != 0);
recon_yoffset = (mb_row * recon_y_stride * 16);
+ src_yoffset = (mb_row * src_y_stride * 16);
recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
// Set up limit values for motion vectors to prevent them extending
@@ -620,10 +449,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
double log_intra;
int level_sample;
-#if CONFIG_FP_MB_STATS
- const int mb_index = mb_row * cm->mb_cols + mb_col;
-#endif
-
aom_clear_system_state();
const int idx_str = xd->mi_stride * mb_row * mb_scale + mb_col * mb_scale;
@@ -650,11 +475,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
av1_encode_intra_block_plane(cpi, x, bsize, 0, 0, mb_row * 2, mb_col * 2);
this_error = aom_get_mb_ss(x->plane[0].src_diff);
- // Keep a record of blocks that have almost no intra error residual
- // (i.e. are in effect completely flat and untextured in the intra
- // domain). In natural videos this is uncommon, but it is much more
- // common in animations, graphics and screen content, so may be used
- // as a signal to detect these types of content.
if (this_error < UL_INTRA_THRESH) {
++intra_skip_count;
} else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
@@ -702,21 +522,15 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
// Accumulate the intra error.
intra_error += (int64_t)this_error;
- int stride = x->plane[0].src.stride;
+ const int hbd = is_cur_buf_hbd(xd);
+ const int stride = x->plane[0].src.stride;
uint8_t *buf = x->plane[0].src.buf;
- for (int r8 = 0; r8 < 2; ++r8)
+ for (int r8 = 0; r8 < 2; ++r8) {
for (int c8 = 0; c8 < 2; ++c8) {
- int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
}
-
-#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- // initialization
- cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
}
-#endif
// Set up limit values for motion vectors to prevent them extending
// outside the UMV borders.
@@ -731,7 +545,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
struct buf_2d unscaled_last_source_buf_2d;
xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
motion_error = highbd_get_prediction_error(
bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
} else {
@@ -743,10 +557,10 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
// frame as the reference. Skip the further motion search on
// reconstructed frame if this error is small.
unscaled_last_source_buf_2d.buf =
- cpi->unscaled_last_source->y_buffer + recon_yoffset;
+ cpi->unscaled_last_source->y_buffer + src_yoffset;
unscaled_last_source_buf_2d.stride =
cpi->unscaled_last_source->y_stride;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
raw_motion_error = highbd_get_prediction_error(
bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
} else {
@@ -778,7 +592,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
int gf_motion_error;
xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
gf_motion_error = highbd_get_prediction_error(
bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
} else {
@@ -816,20 +630,6 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
best_ref_mv.row = 0;
best_ref_mv.col = 0;
-#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- // intra predication statistics
- cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
- cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
- cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
- if (this_error > FPMB_ERROR_LARGE_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
- } else if (this_error < FPMB_ERROR_SMALL_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
- }
- }
-#endif
-
if (motion_error <= this_error) {
aom_clear_system_state();
@@ -855,8 +655,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
xd->mi[0]->tx_size = TX_4X4;
xd->mi[0]->ref_frame[0] = LAST_FRAME;
xd->mi[0]->ref_frame[1] = NONE_FRAME;
- av1_build_inter_predictors_sby(cm, xd, mb_row * mb_scale,
- mb_col * mb_scale, NULL, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mb_row * mb_scale,
+ mb_col * mb_scale, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
av1_encode_sby_pass1(cm, x, bsize);
sum_mvr += mv.row;
sum_mvr_abs += abs(mv.row);
@@ -868,50 +669,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
best_ref_mv = mv;
-#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- // inter predication statistics
- cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
- cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
- cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
- if (this_error > FPMB_ERROR_LARGE_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_ERROR_LARGE_MASK;
- } else if (this_error < FPMB_ERROR_SMALL_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_ERROR_SMALL_MASK;
- }
- }
-#endif
-
if (!is_zero_mv(&mv)) {
++mvcount;
-#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- cpi->twopass.frame_mb_stats_buf[mb_index] &=
- ~FPMB_MOTION_ZERO_MASK;
- // check estimated motion direction
- if (mv.col > 0 && mv.col >= abs(mv.row)) {
- // right direction
- cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_MOTION_RIGHT_MASK;
- } else if (mv.row < 0 && abs(mv.row) >= abs(mv.col)) {
- // up direction
- cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_MOTION_UP_MASK;
- } else if (mv.col < 0 && abs(mv.col) >= abs(mv.row)) {
- // left direction
- cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_MOTION_LEFT_MASK;
- } else {
- // down direction
- cpi->twopass.frame_mb_stats_buf[mb_index] |=
- FPMB_MOTION_DOWN_MASK;
- }
- }
-#endif
-
// Non-zero vector, was it different from the last non zero vector?
if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count;
lastmv = mv;
@@ -955,6 +715,7 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
x->plane[2].src.buf += uv_mb_height;
recon_yoffset += 16;
+ src_yoffset += 16;
recon_uvoffset += uv_mb_height;
}
// Adjust to the next row of MBs.
@@ -1039,19 +800,12 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
// TODO(paulwilkins): Handle the case when duration is set to 0, or
// something less than the full time between subsequent values of
// cpi->source_time_stamp.
- fps.duration = (double)(source->ts_end - source->ts_start);
+ fps.duration = (double)ts_duration;
// Don't want to do output stats with a stack variable!
twopass->this_frame_stats = fps;
output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
accumulate_stats(&twopass->total_stats, &fps);
-
-#if CONFIG_FP_MB_STATS
- if (cpi->use_fp_mb_stats) {
- output_fpmb_stats(twopass->frame_mb_stats_buf, cpi->initial_mbs,
- cpi->output_pkt_list);
- }
-#endif
}
// Copy the previous Last Frame back into gf and arf buffers if
@@ -1062,10 +816,9 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
((twopass->this_frame_stats.intra_error /
DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
if (gld_yv12 != NULL) {
- assign_frame_buffer(
- pool->frame_bufs,
- &cm->ref_frame_map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)],
- cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST_FRAME)]);
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)],
+ cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
}
twopass->sr_update_lag = 1;
} else {
@@ -1075,19 +828,16 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
aom_extend_frame_borders(new_yv12, num_planes);
// The frame we just compressed now becomes the last frame.
- assign_frame_buffer(
- pool->frame_bufs,
- &cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST_FRAME)],
- cm->new_fb_idx);
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)], cm->cur_frame);
// Special case for the first frame. Copy into the GF buffer as a second
// reference.
if (current_frame->frame_number == 0 &&
- get_ref_frame_map_idx(cpi, GOLDEN_FRAME) != INVALID_IDX) {
- assign_frame_buffer(
- pool->frame_bufs,
- &cm->ref_frame_map[get_ref_frame_map_idx(cpi, GOLDEN_FRAME)],
- cm->ref_frame_map[get_ref_frame_map_idx(cpi, LAST_FRAME)]);
+ get_ref_frame_map_idx(cm, GOLDEN_FRAME) != INVALID_IDX) {
+ assign_frame_buffer_p(
+ &cm->ref_frame_map[get_ref_frame_map_idx(cm, GOLDEN_FRAME)],
+ cm->ref_frame_map[get_ref_frame_map_idx(cm, LAST_FRAME)]);
}
// Use this to see what the first pass reconstruction looks like.
@@ -1108,2333 +858,3 @@ void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
++current_frame->frame_number;
}
-
-static double calc_correction_factor(double err_per_mb, double err_divisor,
- double pt_low, double pt_high, int q,
- aom_bit_depth_t bit_depth) {
- const double error_term = err_per_mb / err_divisor;
-
- // Adjustment based on actual quantizer to power term.
- const double power_term =
- AOMMIN(av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
-
- // Calculate correction factor.
- if (power_term < 1.0) assert(error_term >= 0.0);
-
- return fclamp(pow(error_term, power_term), 0.05, 5.0);
-}
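-// Worked example for calc_correction_factor() (illustrative numbers only,
-// not taken from a real encode): with err_per_mb = 200.0 and
-// err_divisor = 100.0 the error term is 2.0. If
-// av1_convert_qindex_to_q(q, bit_depth) returns 40.0 and pt_low = 0.7, the
-// power term is AOMMIN(40.0 * 0.01 + 0.7, pt_high) = 1.1 for any
-// pt_high >= 1.1, so the factor is fclamp(pow(2.0, 1.1), 0.05, 5.0),
-// roughly 2.14.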
-
-#define ERR_DIVISOR 100.0
-static int get_twopass_worst_quality(const AV1_COMP *cpi,
- const double section_err,
- double inactive_zone,
- int section_target_bandwidth,
- double group_weight_factor) {
- const RATE_CONTROL *const rc = &cpi->rc;
- const AV1EncoderConfig *const oxcf = &cpi->oxcf;
-
- inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
-
- if (section_target_bandwidth <= 0) {
- return rc->worst_quality; // Highest value allowed
- } else {
- const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
- ? cpi->initial_mbs
- : cpi->common.MBs;
- const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
- const double av_err_per_mb = section_err / active_mbs;
- const double speed_term = 1.0;
- double ediv_size_correction;
- const int target_norm_bits_per_mb =
- (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
- active_mbs;
- int q;
-
- // Larger image formats are expected to be a little harder to code
- // relatively given the same prediction error score. This in part at
- // least relates to the increased size and hence coding overheads of
- // motion vectors. Some account of this is made through adjustment of
- // the error divisor.
- ediv_size_correction =
- AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi)));
- if (ediv_size_correction < 1.0)
- ediv_size_correction = -(1.0 / ediv_size_correction);
- ediv_size_correction *= 4.0;
-
- // Try and pick a max Q that will be high enough to encode the
- // content at the given rate.
- for (q = rc->best_quality; q < rc->worst_quality; ++q) {
- const double factor = calc_correction_factor(
- av_err_per_mb, ERR_DIVISOR - ediv_size_correction, FACTOR_PT_LOW,
- FACTOR_PT_HIGH, q, cpi->common.seq_params.bit_depth);
- const int bits_per_mb = av1_rc_bits_per_mb(
- INTER_FRAME, q, factor * speed_term * group_weight_factor,
- cpi->common.seq_params.bit_depth);
- if (bits_per_mb <= target_norm_bits_per_mb) break;
- }
-
- // Restriction on active max q for constrained quality mode.
- if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level);
- return q;
- }
-}
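-// A sketch of the q search above (illustrative numbers; assumes
-// BPER_MB_NORMBITS == 9 as defined in the rate control code): with
-// section_target_bandwidth = 500000 and active_mbs = 3600,
-// target_norm_bits_per_mb = (500000 << 9) / 3600 = 71111. The loop then
-// walks q up from rc->best_quality and stops at the first (lowest) q whose
-// av1_rc_bits_per_mb() estimate fits within that per-MB budget.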
-
-static void setup_rf_level_maxq(AV1_COMP *cpi) {
- int i;
- RATE_CONTROL *const rc = &cpi->rc;
- for (i = INTER_NORMAL; i < RATE_FACTOR_LEVELS; ++i) {
- int qdelta = av1_frame_type_qdelta(cpi, i, rc->worst_quality);
- rc->rf_level_maxq[i] = AOMMAX(rc->worst_quality + qdelta, rc->best_quality);
- }
-}
-
-void av1_init_second_pass(AV1_COMP *cpi) {
- const AV1EncoderConfig *const oxcf = &cpi->oxcf;
- TWO_PASS *const twopass = &cpi->twopass;
- double frame_rate;
- FIRSTPASS_STATS *stats;
-
- zero_stats(&twopass->total_stats);
- zero_stats(&twopass->total_left_stats);
-
- if (!twopass->stats_in_end) return;
-
- stats = &twopass->total_stats;
-
- *stats = *twopass->stats_in_end;
- twopass->total_left_stats = *stats;
-
- frame_rate = 10000000.0 * stats->count / stats->duration;
- // Each frame can have a different duration, as the frame rate in the source
- // isn't guaranteed to be constant. The frame rate prior to the first frame
- // encoded in the second pass is a guess. However, the sum duration is not.
- // It is calculated based on the actual durations of all frames from the
- // first pass.
- av1_new_framerate(cpi, frame_rate);
- twopass->bits_left =
- (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
-
-  // This variable monitors how far the second ref update is lagging.
- twopass->sr_update_lag = 1;
-
- // Scan the first pass file and calculate a modified total error based upon
- // the bias/power function used to allocate bits.
- {
- const double avg_error =
- stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
- const FIRSTPASS_STATS *s = twopass->stats_in;
- double modified_error_total = 0.0;
- twopass->modified_error_min =
- (avg_error * oxcf->two_pass_vbrmin_section) / 100;
- twopass->modified_error_max =
- (avg_error * oxcf->two_pass_vbrmax_section) / 100;
- while (s < twopass->stats_in_end) {
- modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
- ++s;
- }
- twopass->modified_error_left = modified_error_total;
- }
-
- // Reset the vbr bits off target counters
- cpi->rc.vbr_bits_off_target = 0;
- cpi->rc.vbr_bits_off_target_fast = 0;
-
- cpi->rc.rate_error_estimate = 0;
-
- // Static sequence monitor variables.
- twopass->kf_zeromotion_pct = 100;
- twopass->last_kfgroup_zeromotion_pct = 100;
-
- if (oxcf->resize_mode != RESIZE_NONE) {
- setup_rf_level_maxq(cpi);
- }
-}
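-// Worked example for the setup above (illustrative numbers): with
-// stats->count = 300 frames and stats->duration = 100000000 (10 seconds in
-// the 10MHz timebase implied by the 10000000.0 constant), frame_rate =
-// 10000000.0 * 300 / 100000000 = 30.0 fps; with target_bandwidth = 400000
-// bits/s, bits_left = 10 * 400000 = 4000000 bits.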
-
-#define SR_DIFF_PART 0.0015
-#define MOTION_AMP_PART 0.003
-#define INTRA_PART 0.005
-#define DEFAULT_DECAY_LIMIT 0.75
-#define LOW_SR_DIFF_THRESH 0.1
-#define SR_DIFF_MAX 128.0
-
-static double get_sr_decay_rate(const AV1_COMP *cpi,
- const FIRSTPASS_STATS *frame) {
- const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
- : cpi->common.MBs;
- double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
- double sr_decay = 1.0;
- double modified_pct_inter;
- double modified_pcnt_intra;
- const double motion_amplitude_factor =
- frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
-
- modified_pct_inter = frame->pcnt_inter;
- if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
- (double)NCOUNT_FRAME_II_THRESH) {
- modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
- }
- modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
-
-  if (sr_diff > LOW_SR_DIFF_THRESH) {
- sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX);
- sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
- (MOTION_AMP_PART * motion_amplitude_factor) -
- (INTRA_PART * modified_pcnt_intra);
- }
- return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
-}
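-// Worked example for get_sr_decay_rate() (illustrative numbers): with
-// sr_diff = 20.0, motion_amplitude_factor = 10.0 and
-// modified_pcnt_intra = 5.0, sr_decay = 1.0 - (0.0015 * 20.0) -
-// (0.003 * 10.0) - (0.005 * 5.0) = 0.915, which the final AOMMAX/AOMMIN
-// then floors at AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter).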
-
-// This function gives an estimate of how badly we believe the prediction
-// quality is decaying from frame to frame.
-static double get_zero_motion_factor(const AV1_COMP *cpi,
- const FIRSTPASS_STATS *frame) {
- const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
- double sr_decay = get_sr_decay_rate(cpi, frame);
- return AOMMIN(sr_decay, zero_motion_pct);
-}
-
-#define ZM_POWER_FACTOR 0.75
-
-static double get_prediction_decay_rate(const AV1_COMP *cpi,
- const FIRSTPASS_STATS *next_frame) {
- const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
- const double zero_motion_factor =
- (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
- ZM_POWER_FACTOR));
-
- return AOMMAX(zero_motion_factor,
- (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
-}
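-// Worked example for get_prediction_decay_rate() (illustrative numbers):
-// if pcnt_inter - pcnt_motion = 0.81, the zero motion factor is
-// 0.95 * pow(0.81, 0.75), roughly 0.81. With sr_decay_rate = 0.9 this
-// returns AOMMAX(0.81, 0.9 + 0.1 * 0.81), roughly 0.98, so mostly-static
-// content keeps the estimated decay close to 1.0.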
-
-// Function to test for a condition where a complex transition is followed
-// by a static section. For example in slide shows where there is a fade
-// between slides. This is to help with more optimal kf and gf positioning.
-static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval,
- int still_interval,
- double loop_decay_rate,
- double last_decay_rate) {
- TWO_PASS *const twopass = &cpi->twopass;
- RATE_CONTROL *const rc = &cpi->rc;
-
-  // Break clause to detect very still sections after motion.
-  // For example, a static image after a fade or other transition
-  // instead of a clean scene cut.
- if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 &&
- last_decay_rate < 0.9) {
- int j;
-
-    // Look ahead a few frames to see if the static condition persists...
- for (j = 0; j < still_interval; ++j) {
- const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
- if (stats >= twopass->stats_in_end) break;
-
- if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
- }
-
- // Only if it does do we signal a transition to still.
- return j == still_interval;
- }
-
- return 0;
-}
-
-// This function detects a flash through the high relative pcnt_second_ref
-// score in the frame following a flash frame. The offset passed in should
-// reflect this.
-static int detect_flash(const TWO_PASS *twopass, int offset) {
- const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
-
- // What we are looking for here is a situation where there is a
- // brief break in prediction (such as a flash) but subsequent frames
- // are reasonably well predicted by an earlier (pre flash) frame.
- // The recovery after a flash is indicated by a high pcnt_second_ref
- // compared to pcnt_inter.
- return next_frame != NULL &&
- next_frame->pcnt_second_ref > next_frame->pcnt_inter &&
- next_frame->pcnt_second_ref >= 0.5;
-}
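-// For example (illustrative numbers), a frame with pcnt_second_ref = 0.7
-// and pcnt_inter = 0.4 satisfies both tests above, marking the frame just
-// before it (at the given offset) as a likely flash.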
-
-// Update the motion-related elements of the GF/arf boost calculation.
-static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
- double *mv_in_out,
- double *mv_in_out_accumulator,
- double *abs_mv_in_out_accumulator,
- double *mv_ratio_accumulator) {
- const double pct = stats->pcnt_motion;
-
- // Accumulate Motion In/Out of frame stats.
- *mv_in_out = stats->mv_in_out_count * pct;
- *mv_in_out_accumulator += *mv_in_out;
- *abs_mv_in_out_accumulator += fabs(*mv_in_out);
-
- // Accumulate a measure of how uniform (or conversely how random) the motion
- // field is (a ratio of abs(mv) / mv).
- if (pct > 0.05) {
- const double mvr_ratio =
- fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
- const double mvc_ratio =
- fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
-
- *mv_ratio_accumulator +=
- pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs);
- *mv_ratio_accumulator +=
- pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs);
- }
-}
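-// Worked example for the ratio above (illustrative numbers): with
-// pct = 0.5, stats->mvr_abs = 8.0 and stats->MVr = 2.0, mvr_ratio =
-// 8.0 / 2.0 = 4.0; since 4.0 < 8.0, the accumulator gains 0.5 * 4.0 = 2.0.
-// A large abs(mv) / mv ratio means the row vectors largely cancel out,
-// i.e. the motion field is random rather than uniform.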
-
-#define BASELINE_ERR_PER_MB 1000.0
-static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame,
- double this_frame_mv_in_out, double max_boost) {
- double frame_boost;
- const double lq = av1_convert_qindex_to_q(
- cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth);
- const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
- int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
- : cpi->common.MBs;
-
- // Correct for any inactive region in the image
- num_mbs = (int)AOMMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
-
- // Underlying boost factor is based on inter error ratio.
- frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
- frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
-
-  // Increase boost for frames where new data is coming into the frame
-  // (e.g. zoom out). Slightly reduce boost if there is a net balance of
-  // motion out of the frame (zoom in). The range for this_frame_mv_in_out
-  // is -1.0 to +1.0.
- if (this_frame_mv_in_out > 0.0)
- frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
- // In the extreme case the boost is halved.
- else
- frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
-
- return AOMMIN(frame_boost, max_boost * boost_q_correction);
-}
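-// Worked example for calc_frame_boost() (illustrative numbers): with
-// num_mbs = 3600 and coded_error = 360000.0 the base ratio is
-// (1000.0 * 3600) / 360000.0 = 10.0, scaled by BOOST_FACTOR and by
-// boost_q_correction (lq = 40.0 gives AOMMIN(0.5 + 0.6, 1.5) = 1.1).
-// A this_frame_mv_in_out of +0.25 then raises the boost by 50%, while
-// -0.25 would cut it by 12.5%.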
-
-static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames,
- int *f_boost, int *b_boost) {
- TWO_PASS *const twopass = &cpi->twopass;
- int i;
- double boost_score = 0.0;
- double mv_ratio_accumulator = 0.0;
- double decay_accumulator = 1.0;
- double this_frame_mv_in_out = 0.0;
- double mv_in_out_accumulator = 0.0;
- double abs_mv_in_out_accumulator = 0.0;
- int arf_boost;
- int flash_detected = 0;
-
- // Search forward from the proposed arf/next gf position.
- for (i = 0; i < f_frames; ++i) {
- const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
- if (this_frame == NULL) break;
-
-    // Update the motion-related elements of the boost calculation.
- accumulate_frame_motion_stats(
- this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
- // We want to discount the flash frame itself and the recovery
- // frame that follows as both will have poor scores.
- flash_detected = detect_flash(twopass, i + offset) ||
- detect_flash(twopass, i + offset + 1);
-
- // Accumulate the effect of prediction quality decay.
- if (!flash_detected) {
- decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
- decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
- ? MIN_DECAY_FACTOR
- : decay_accumulator;
- }
-
- boost_score +=
- decay_accumulator *
- calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
- }
-
- *f_boost = (int)boost_score;
-
- // Reset for backward looking loop.
- boost_score = 0.0;
- mv_ratio_accumulator = 0.0;
- decay_accumulator = 1.0;
- this_frame_mv_in_out = 0.0;
- mv_in_out_accumulator = 0.0;
- abs_mv_in_out_accumulator = 0.0;
-
- // Search backward towards last gf position.
- for (i = -1; i >= -b_frames; --i) {
- const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
- if (this_frame == NULL) break;
-
-    // Update the motion-related elements of the boost calculation.
- accumulate_frame_motion_stats(
- this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-
-    // We want to discount the flash frame itself and the recovery
- // frame that follows as both will have poor scores.
- flash_detected = detect_flash(twopass, i + offset) ||
- detect_flash(twopass, i + offset + 1);
-
- // Cumulative effect of prediction quality decay.
- if (!flash_detected) {
- decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
- decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
- ? MIN_DECAY_FACTOR
- : decay_accumulator;
- }
-
- boost_score +=
- decay_accumulator *
- calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
- }
- *b_boost = (int)boost_score;
-
- arf_boost = (*f_boost + *b_boost);
- if (arf_boost < ((b_frames + f_frames) * 20))
- arf_boost = ((b_frames + f_frames) * 20);
- arf_boost = AOMMAX(arf_boost, MIN_ARF_GF_BOOST);
-
- return arf_boost;
-}
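-// For the floor above (illustrative numbers): with f_frames = b_frames = 7,
-// any accumulated boost below (7 + 7) * 20 = 280 is raised to 280 before
-// the final MIN_ARF_GF_BOOST floor is applied.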
-
-// Calculate a section intra ratio used in setting max loop filter.
-static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
- const FIRSTPASS_STATS *end,
- int section_length) {
- const FIRSTPASS_STATS *s = begin;
- double intra_error = 0.0;
- double coded_error = 0.0;
- int i = 0;
-
- while (s < end && i < section_length) {
- intra_error += s->intra_error;
- coded_error += s->coded_error;
- ++s;
- ++i;
- }
-
- return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
-}
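-// E.g. (illustrative numbers) a section with summed intra_error = 4000.0
-// and summed coded_error = 1000.0 yields a section intra ratio of 4.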
-
-// Calculate the total bits to allocate in this GF/ARF group.
-static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
- double gf_group_err) {
- const RATE_CONTROL *const rc = &cpi->rc;
- const TWO_PASS *const twopass = &cpi->twopass;
- const int max_bits = frame_max_bits(rc, &cpi->oxcf);
- int64_t total_group_bits;
-
- // Calculate the bits to be allocated to the group as a whole.
- if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
- total_group_bits = (int64_t)(twopass->kf_group_bits *
- (gf_group_err / twopass->kf_group_error_left));
- } else {
- total_group_bits = 0;
- }
-
- // Clamp odd edge cases.
- total_group_bits = (total_group_bits < 0)
- ? 0
- : (total_group_bits > twopass->kf_group_bits)
- ? twopass->kf_group_bits
- : total_group_bits;
-
- // Clip based on user supplied data rate variability limit.
- if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
- total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
-
- return total_group_bits;
-}
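-// Worked example for calculate_total_gf_group_bits() (illustrative
-// numbers): with kf_group_bits = 1000000, gf_group_err = 30.0 and
-// kf_group_error_left = 120.0, the group is allocated
-// 1000000 * (30.0 / 120.0) = 250000 bits, then clamped to
-// [0, kf_group_bits] and to max_bits * rc->baseline_gf_interval.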
-
-// Calculate the number of extra bits to assign to boosted frames in a group.
-static int calculate_boost_bits(int frame_count, int boost,
- int64_t total_group_bits) {
- int allocation_chunks;
-
- // return 0 for invalid inputs (could arise e.g. through rounding errors)
- if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0;
-
- allocation_chunks = (frame_count * 100) + boost;
-
- // Prevent overflow.
- if (boost > 1023) {
- int divisor = boost >> 10;
- boost /= divisor;
- allocation_chunks /= divisor;
- }
-
- // Calculate the number of extra bits for use in the boosted frame or frames.
- return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks),
- 0);
-}
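-// Worked example for calculate_boost_bits() (illustrative numbers): with
-// frame_count = 8, boost = 400 and total_group_bits = 240000,
-// allocation_chunks = (8 * 100) + 400 = 1200, so the boosted frame(s) get
-// (400 * 240000) / 1200 = 80000 bits. The divisor step above only kicks in
-// once boost exceeds 1023, to guard the later multiply against overflow.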
-
-#if USE_SYMM_MULTI_LAYER
-// #define CHECK_GF_PARAMETER
-#ifdef CHECK_GF_PARAMETER
-void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
- int frame_nums) {
- static const char *update_type_strings[] = {
- "KF_UPDATE", "LF_UPDATE", "GF_UPDATE",
- "ARF_UPDATE", "OVERLAY_UPDATE", "BRF_UPDATE",
- "LAST_BIPRED_UPDATE", "BIPRED_UPDATE", "INTNL_OVERLAY_UPDATE",
- "INTNL_ARF_UPDATE"
- };
- FILE *fid = fopen("GF_PARAMS.txt", "a");
-
- fprintf(fid, "\n{%d}\n", gf_interval);
- for (int i = 0; i <= frame_nums; ++i) {
- fprintf(fid, "%s %d %d %d %d\n",
- update_type_strings[gf_group->update_type[i]],
- gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
- gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
- }
-
- fprintf(fid, "number of nodes in each level: \n");
- for (int i = 0; i < MAX_PYRAMID_LVL; ++i) {
- fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]);
- }
- fprintf(fid, "\n");
- fclose(fid);
-}
-#endif  // CHECK_GF_PARAMETER
-static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
- // Derive rf_level from update_type
- switch (update_type) {
- case LF_UPDATE: return INTER_NORMAL;
- case ARF_UPDATE: return GF_ARF_STD;
- case OVERLAY_UPDATE: return INTER_NORMAL;
- case BRF_UPDATE: return GF_ARF_LOW;
- case LAST_BIPRED_UPDATE: return INTER_NORMAL;
- case BIPRED_UPDATE: return INTER_NORMAL;
- case INTNL_ARF_UPDATE: return GF_ARF_LOW;
- case INTNL_OVERLAY_UPDATE: return INTER_NORMAL;
- default: return INTER_NORMAL;
- }
-}
-
-static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
- int *frame_ind, int arf_ind, int level) {
- if (r - l < 4) {
- while (++l < r) {
- // leaf nodes, not a look-ahead frame
- gf_group->update_type[*frame_ind] = LF_UPDATE;
- gf_group->arf_src_offset[*frame_ind] = 0;
- gf_group->arf_pos_in_gf[*frame_ind] = 0;
- gf_group->arf_update_idx[*frame_ind] = arf_ind;
- gf_group->pyramid_level[*frame_ind] = 0;
- ++gf_group->pyramid_lvl_nodes[0];
- ++(*frame_ind);
- }
- } else {
- int m = (l + r) / 2;
- int arf_pos_in_gf = *frame_ind;
-
- gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
- gf_group->arf_src_offset[*frame_ind] = m - l - 1;
- gf_group->arf_pos_in_gf[*frame_ind] = 0;
- gf_group->arf_update_idx[*frame_ind] = 1; // mark all internal ARF 1
- gf_group->pyramid_level[*frame_ind] = level;
- ++gf_group->pyramid_lvl_nodes[level];
- ++(*frame_ind);
-
- // set parameters for frames displayed before this frame
- set_multi_layer_params(gf_group, l, m, frame_ind, 1, level - 1);
-
-    // for overlay frames, we need to record the positions of their
-    // corresponding arf frames for bit allocation
- gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
- gf_group->arf_src_offset[*frame_ind] = 0;
- gf_group->arf_pos_in_gf[*frame_ind] = arf_pos_in_gf;
- gf_group->arf_update_idx[*frame_ind] = 1;
- gf_group->pyramid_level[*frame_ind] = 0;
- ++(*frame_ind);
-
- // set parameters for frames displayed after this frame
- set_multi_layer_params(gf_group, m, r, frame_ind, arf_ind, level - 1);
- }
-}
-
-static INLINE unsigned char get_pyramid_height(int pyramid_width) {
- assert(pyramid_width <= 16 && pyramid_width >= 4 &&
- "invalid gf interval for pyramid structure");
-
- return pyramid_width > 12 ? 4 : (pyramid_width > 6 ? 3 : 2);
-}
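-// I.e. gf intervals of 4..6 map to a pyramid height of 2, 7..12 to 3, and
-// 13..16 to 4.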
-
-static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
- const int gf_interval) {
- int frame_index = 0;
- gf_group->pyramid_height = get_pyramid_height(gf_interval);
-
- assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL);
-
- av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL);
-
-  // The frame at the beginning of each GF group is a key or overlay frame.
- gf_group->update_type[frame_index] = OVERLAY_UPDATE;
- gf_group->arf_src_offset[frame_index] = 0;
- gf_group->arf_pos_in_gf[frame_index] = 0;
- gf_group->arf_update_idx[frame_index] = 0;
- gf_group->pyramid_level[frame_index] = 0;
- ++frame_index;
-
- // ALT0
- gf_group->update_type[frame_index] = ARF_UPDATE;
- gf_group->arf_src_offset[frame_index] = gf_interval - 1;
- gf_group->arf_pos_in_gf[frame_index] = 0;
- gf_group->arf_update_idx[frame_index] = 0;
- gf_group->pyramid_level[frame_index] = gf_group->pyramid_height;
- ++frame_index;
-
- // set parameters for the rest of the frames
- set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0,
- gf_group->pyramid_height - 1);
- return frame_index;
-}
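-// Hand-traced example for the recursion above (indicative only): with
-// gf_interval = 8, pyramid_height = 3 and the emitted coding order is:
-//   0: OVERLAY (or KEY)       1: ARF (offset 7, level 3)
-//   2: INTNL_ARF (offset 3)   3: INTNL_ARF (offset 1)
-//   4: LF                     5: INTNL_OVERLAY (arf_pos_in_gf = 3)
-//   6: LF                     7: INTNL_OVERLAY (arf_pos_in_gf = 2)
-//   8: INTNL_ARF (offset 1)   9: LF
-//  10: INTNL_OVERLAY (arf_pos_in_gf = 8)
-//  11: LF
-// giving the returned frame_index of 12.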
-
-static void define_customized_gf_group_structure(AV1_COMP *cpi) {
- RATE_CONTROL *const rc = &cpi->rc;
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
- const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
-
- assert(rc->baseline_gf_interval >= 4 &&
- rc->baseline_gf_interval <= MAX_PYRAMID_SIZE);
-
- const int gf_update_frames =
- construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval);
- int frame_index;
-
- cpi->num_extra_arfs = 0;
-
- for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
- // Set unused variables to default values
- gf_group->bidir_pred_enabled[frame_index] = 0;
- gf_group->brf_src_offset[frame_index] = 0;
-
-    // Special handling for the first frame when assigning update_type
- if (frame_index == 0) {
- // For key frames the frame target rate is already set and it
- // is also the golden frame.
- if (key_frame) {
- gf_group->update_type[frame_index] = KF_UPDATE;
- continue;
- }
-
- if (rc->source_alt_ref_active) {
- gf_group->update_type[frame_index] = OVERLAY_UPDATE;
- } else {
- gf_group->update_type[frame_index] = GF_UPDATE;
- }
- } else {
- if (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
- ++cpi->num_extra_arfs;
- }
-
- // Assign rf level based on update type
- gf_group->rf_level[frame_index] =
- update_type_2_rf_level(gf_group->update_type[frame_index]);
- }
-
- // NOTE: We need to configure the frame at the end of the sequence + 1 that
- // will be the start frame for the next group. Otherwise prior to the
- // call to av1_rc_get_second_pass_params() the data will be undefined.
- if (rc->source_alt_ref_pending) {
- gf_group->update_type[frame_index] = OVERLAY_UPDATE;
- gf_group->rf_level[frame_index] = INTER_NORMAL;
- } else {
- gf_group->update_type[frame_index] = GF_UPDATE;
- gf_group->rf_level[frame_index] = GF_ARF_STD;
- }
-
- gf_group->bidir_pred_enabled[frame_index] = 0;
- gf_group->brf_src_offset[frame_index] = 0;
- gf_group->arf_update_idx[frame_index] = 0;
- // This value is only used for INTNL_OVERLAY_UPDATE
- gf_group->arf_pos_in_gf[frame_index] = 0;
-
- // This parameter is useless?
- gf_group->arf_ref_idx[frame_index] = 0;
-#ifdef CHECK_GF_PARAMETER
- check_frame_params(gf_group, rc->baseline_gf_interval, gf_update_frames);
-#endif
-}
-
-// This is an example of how to define a GF structure manually. The function
-// will result in exactly the same GF group structure as
-// define_customized_gf_group_structure() when rc->baseline_gf_interval == 4
-#if USE_MANUAL_GF4_STRUCT
-#define GF_INTERVAL_4 4
-static const unsigned char gf4_multi_layer_params[][GF_FRAME_PARAMS] = {
- {
- // gf_group->index == 0 (Frame 0)
- // It can also be KEY frame. Will assign the proper value
- // in define_gf_group_structure
- OVERLAY_UPDATE, // update_type (default value)
- 0, // arf_src_offset
- 0, // arf_pos_in_gf
- 0 // arf_update_idx
- },
- {
- // gf_group->index == 1 (Frame 4)
- ARF_UPDATE, // update_type
- GF_INTERVAL_4 - 1, // arf_src_offset
- 0, // arf_pos_in_gf
- 0 // arf_update_idx
- },
- {
- // gf_group->index == 2 (Frame 2)
- INTNL_ARF_UPDATE, // update_type
- (GF_INTERVAL_4 >> 1) - 1, // arf_src_offset
- 0, // arf_pos_in_gf
- 0 // arf_update_idx
- },
- {
- // gf_group->index == 3 (Frame 1)
- LAST_BIPRED_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // arf_pos_in_gf
- 0 // arf_update_idx
- },
-
- {
- // gf_group->index == 4 (Frame 2 - OVERLAY)
- INTNL_OVERLAY_UPDATE, // update_type
- 0, // arf_src_offset
- 2, // arf_pos_in_gf
- 0 // arf_update_idx
- },
- {
- // gf_group->index == 5 (Frame 3)
- LF_UPDATE, // update_type
- 0, // arf_src_offset
- 0, // arf_pos_in_gf
- 1 // arf_update_idx
- }
-};
-
-static int define_gf_group_structure_4(AV1_COMP *cpi) {
- RATE_CONTROL *const rc = &cpi->rc;
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
- const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
-
- assert(rc->baseline_gf_interval == GF_INTERVAL_4);
-
- const int gf_update_frames = rc->baseline_gf_interval + 2;
- int frame_index;
-
- for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
- int param_idx = 0;
-
- gf_group->bidir_pred_enabled[frame_index] = 0;
-
- if (frame_index == 0) {
- // gf_group->arf_src_offset[frame_index] = 0;
- gf_group->brf_src_offset[frame_index] = 0;
- gf_group->bidir_pred_enabled[frame_index] = 0;
-
- // For key frames the frame target rate is already set and it
- // is also the golden frame.
- if (key_frame) continue;
-
- gf_group->update_type[frame_index] =
- gf4_multi_layer_params[frame_index][param_idx++];
-
- if (rc->source_alt_ref_active) {
- gf_group->update_type[frame_index] = OVERLAY_UPDATE;
- } else {
- gf_group->update_type[frame_index] = GF_UPDATE;
- }
- param_idx++;
- } else {
- gf_group->update_type[frame_index] =
- gf4_multi_layer_params[frame_index][param_idx++];
- }
-
- // setup other parameters
- gf_group->rf_level[frame_index] =
- update_type_2_rf_level(gf_group->update_type[frame_index]);
-
- // == arf_src_offset ==
- gf_group->arf_src_offset[frame_index] =
- gf4_multi_layer_params[frame_index][param_idx++];
-
- // == arf_pos_in_gf ==
- gf_group->arf_pos_in_gf[frame_index] =
- gf4_multi_layer_params[frame_index][param_idx++];
-
- // == arf_update_idx ==
- gf_group->brf_src_offset[frame_index] =
- gf4_multi_layer_params[frame_index][param_idx];
- }
-
- // NOTE: We need to configure the frame at the end of the sequence + 1 that
- // will be the start frame for the next group. Otherwise prior to the
- // call to av1_rc_get_second_pass_params() the data will be undefined.
- gf_group->arf_update_idx[frame_index] = 0;
- gf_group->arf_ref_idx[frame_index] = 0;
-
- if (rc->source_alt_ref_pending) {
- gf_group->update_type[frame_index] = OVERLAY_UPDATE;
- gf_group->rf_level[frame_index] = INTER_NORMAL;
-
- } else {
- gf_group->update_type[frame_index] = GF_UPDATE;
- gf_group->rf_level[frame_index] = GF_ARF_STD;
- }
-
- gf_group->bidir_pred_enabled[frame_index] = 0;
- gf_group->brf_src_offset[frame_index] = 0;
-
- // This value is only used for INTNL_OVERLAY_UPDATE
- gf_group->arf_pos_in_gf[frame_index] = 0;
-
- return gf_update_frames;
-}
-#endif // USE_MANUAL_GF4_STRUCT
-#endif // USE_SYMM_MULTI_LAYER
-
-static void define_gf_group_structure(AV1_COMP *cpi) {
- RATE_CONTROL *const rc = &cpi->rc;
-
-#if USE_SYMM_MULTI_LAYER
- const int valid_customized_gf_length =
- rc->baseline_gf_interval >= 4 &&
- rc->baseline_gf_interval <= MAX_PYRAMID_SIZE;
-  // use the new structure only if extra_arf is allowed
- if (valid_customized_gf_length && rc->source_alt_ref_pending &&
- cpi->extra_arf_allowed > 0) {
-#if USE_MANUAL_GF4_STRUCT
- if (rc->baseline_gf_interval == 4)
- define_gf_group_structure_4(cpi);
- else
-#endif
- define_customized_gf_group_structure(cpi);
- cpi->new_bwdref_update_rule = 1;
- return;
- } else {
- cpi->new_bwdref_update_rule = 0;
- }
-#endif
-
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
- int i;
- int frame_index = 0;
- const int key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
-
-  // The use of bi-predictive frames is only enabled when the following 3
-  // conditions are met:
- // (1) ALTREF is enabled;
- // (2) The bi-predictive group interval is at least 2; and
- // (3) The bi-predictive group interval is strictly smaller than the
- // golden group interval.
- const int is_bipred_enabled =
- cpi->extra_arf_allowed && rc->source_alt_ref_pending &&
- rc->bipred_group_interval &&
- rc->bipred_group_interval <=
- (rc->baseline_gf_interval - rc->source_alt_ref_pending);
- int bipred_group_end = 0;
- int bipred_frame_index = 0;
-
- const unsigned char ext_arf_interval =
- (unsigned char)(rc->baseline_gf_interval / (cpi->num_extra_arfs + 1) - 1);
- int which_arf = cpi->num_extra_arfs;
- int subgroup_interval[MAX_EXT_ARFS + 1];
- int is_sg_bipred_enabled = is_bipred_enabled;
- int accumulative_subgroup_interval = 0;
-
- // For key frames the frame target rate is already set and it
- // is also the golden frame.
- // === [frame_index == 0] ===
- if (!key_frame) {
- if (rc->source_alt_ref_active) {
- gf_group->update_type[frame_index] = OVERLAY_UPDATE;
- gf_group->rf_level[frame_index] = INTER_NORMAL;
- } else {
- gf_group->update_type[frame_index] = GF_UPDATE;
- gf_group->rf_level[frame_index] = GF_ARF_STD;
- }
- gf_group->arf_update_idx[frame_index] = 0;
- gf_group->arf_ref_idx[frame_index] = 0;
- }
-
- gf_group->bidir_pred_enabled[frame_index] = 0;
- gf_group->brf_src_offset[frame_index] = 0;
-
- frame_index++;
-
- bipred_frame_index++;
-
- // === [frame_index == 1] ===
- if (rc->source_alt_ref_pending) {
- gf_group->update_type[frame_index] = ARF_UPDATE;
- gf_group->rf_level[frame_index] = GF_ARF_STD;
- gf_group->arf_src_offset[frame_index] =
- (unsigned char)(rc->baseline_gf_interval - 1);
-
- gf_group->arf_update_idx[frame_index] = 0;
- gf_group->arf_ref_idx[frame_index] = 0;
-
- gf_group->bidir_pred_enabled[frame_index] = 0;
- gf_group->brf_src_offset[frame_index] = 0;
- // NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames.
-
- // Work out the ARFs' positions in this gf group
-    // NOTE(weitinglin): ALT_REFs are indexed inversely, but coded in display
-    // order (except for the original ARF). In the example of three ALT_REFs,
-    // we index them as: KEY ----- ALT2 ----- ALT1 ----- ALT0
- // but code them in the following order:
- // KEY-ALT0-ALT2 ----- OVERLAY2-ALT1 ----- OVERLAY1 ----- OVERLAY0
- //
- // arf_pos_for_ovrly[]: Position for OVERLAY
- // arf_pos_in_gf[]: Position for ALTREF
- cpi->arf_pos_for_ovrly[0] = frame_index + cpi->num_extra_arfs +
- gf_group->arf_src_offset[frame_index] + 1;
- for (i = 0; i < cpi->num_extra_arfs; ++i) {
- cpi->arf_pos_for_ovrly[i + 1] =
- frame_index + (cpi->num_extra_arfs - i) * (ext_arf_interval + 2);
- subgroup_interval[i] = cpi->arf_pos_for_ovrly[i] -
- cpi->arf_pos_for_ovrly[i + 1] - (i == 0 ? 1 : 2);
- }
- subgroup_interval[cpi->num_extra_arfs] =
- cpi->arf_pos_for_ovrly[cpi->num_extra_arfs] - frame_index -
- (cpi->num_extra_arfs == 0 ? 1 : 2);
-
- ++frame_index;
-
- // Insert an extra ARF
- // === [frame_index == 2] ===
- if (cpi->num_extra_arfs) {
- gf_group->update_type[frame_index] = INTNL_ARF_UPDATE;
- gf_group->rf_level[frame_index] = GF_ARF_LOW;
- gf_group->arf_src_offset[frame_index] = ext_arf_interval;
-
- gf_group->arf_update_idx[frame_index] = which_arf;
- gf_group->arf_ref_idx[frame_index] = 0;
- ++frame_index;
- }
- accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs];
- }
-
- const int normal_frames =
- rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending);
-
- for (i = 0; i < normal_frames; ++i) {
- gf_group->arf_update_idx[frame_index] = which_arf;
- gf_group->arf_ref_idx[frame_index] = which_arf;
-
- // If we are going to have ARFs, check whether we can have BWDREF in this
- // subgroup, and further, whether we can have ARF subgroup which contains
- // the BWDREF subgroup but contained within the GF group:
- //
- // GF group --> ARF subgroup --> BWDREF subgroup
- if (rc->source_alt_ref_pending) {
- is_sg_bipred_enabled =
- is_bipred_enabled &&
- (subgroup_interval[which_arf] > rc->bipred_group_interval);
- }
-
- // NOTE: BIDIR_PRED is only enabled when the length of the bi-predictive
- // frame group interval is strictly smaller than that of the GOLDEN
- // FRAME group interval.
- // TODO(zoeliu): Currently BIDIR_PRED is only enabled when alt-ref is on.
- if (is_sg_bipred_enabled && !bipred_group_end) {
- const int cur_brf_src_offset = rc->bipred_group_interval - 1;
-
- if (bipred_frame_index == 1) {
- // --- BRF_UPDATE ---
- gf_group->update_type[frame_index] = BRF_UPDATE;
- gf_group->rf_level[frame_index] = GF_ARF_LOW;
- gf_group->brf_src_offset[frame_index] = cur_brf_src_offset;
- } else if (bipred_frame_index == rc->bipred_group_interval) {
- // --- LAST_BIPRED_UPDATE ---
- gf_group->update_type[frame_index] = LAST_BIPRED_UPDATE;
- gf_group->rf_level[frame_index] = INTER_NORMAL;
- gf_group->brf_src_offset[frame_index] = 0;
-
- // Reset the bi-predictive frame index.
- bipred_frame_index = 0;
- } else {
- // --- BIPRED_UPDATE ---
- gf_group->update_type[frame_index] = BIPRED_UPDATE;
- gf_group->rf_level[frame_index] = INTER_NORMAL;
- gf_group->brf_src_offset[frame_index] = 0;
- }
- gf_group->bidir_pred_enabled[frame_index] = 1;
-
- bipred_frame_index++;
- // Check whether the next bi-predictive frame group would entirely be
- // included within the current golden frame group.
- // In addition, we need to avoid coding a BRF right before an ARF.
- if (bipred_frame_index == 1 &&
- (i + 2 + cur_brf_src_offset) >= accumulative_subgroup_interval) {
- bipred_group_end = 1;
- }
- } else {
- gf_group->update_type[frame_index] = LF_UPDATE;
- gf_group->rf_level[frame_index] = INTER_NORMAL;
- gf_group->bidir_pred_enabled[frame_index] = 0;
- gf_group->brf_src_offset[frame_index] = 0;
- }
-
- ++frame_index;
-
- // Check if we need to update the ARF.
- if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 &&
- frame_index > cpi->arf_pos_for_ovrly[which_arf]) {
- --which_arf;
- accumulative_subgroup_interval += subgroup_interval[which_arf] + 1;
-
-      // We have reached a new subgroup; reset the bipred_group_end flag.
- bipred_group_end = 0;
- // Insert another extra ARF after the overlay frame
- if (which_arf) {
- gf_group->update_type[frame_index] = INTNL_ARF_UPDATE;
- gf_group->rf_level[frame_index] = GF_ARF_LOW;
- gf_group->arf_src_offset[frame_index] = ext_arf_interval;
-
- gf_group->arf_update_idx[frame_index] = which_arf;
- gf_group->arf_ref_idx[frame_index] = 0;
- ++frame_index;
- }
- }
- }
-
- // NOTE: We need to configure the frame at the end of the sequence + 1 that
- // will be the start frame for the next group. Otherwise prior to the
- // call to av1_rc_get_second_pass_params() the data will be undefined.
- gf_group->arf_update_idx[frame_index] = 0;
- gf_group->arf_ref_idx[frame_index] = 0;
-
- if (rc->source_alt_ref_pending) {
- gf_group->update_type[frame_index] = OVERLAY_UPDATE;
- gf_group->rf_level[frame_index] = INTER_NORMAL;
-
- cpi->arf_pos_in_gf[0] = 1;
- if (cpi->num_extra_arfs) {
-      // Overwrite the update_type for the extra-ARFs' corresponding internal
-      // OVERLAYs: change from LF_UPDATE to INTNL_OVERLAY_UPDATE.
- for (i = cpi->num_extra_arfs; i > 0; --i) {
- cpi->arf_pos_in_gf[i] =
- (i == cpi->num_extra_arfs ? 2 : cpi->arf_pos_for_ovrly[i + 1] + 1);
-
- gf_group->update_type[cpi->arf_pos_for_ovrly[i]] = INTNL_OVERLAY_UPDATE;
- gf_group->rf_level[cpi->arf_pos_for_ovrly[i]] = INTER_NORMAL;
- }
- }
- } else {
- gf_group->update_type[frame_index] = GF_UPDATE;
- gf_group->rf_level[frame_index] = GF_ARF_STD;
- }
-
- gf_group->bidir_pred_enabled[frame_index] = 0;
- gf_group->brf_src_offset[frame_index] = 0;
-}
-
-#if USE_SYMM_MULTI_LAYER
-#define NEW_MULTI_LVL_BOOST_VBR_ALLOC 1
-
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
-#define LEAF_REDUCTION_FACTOR 0.75
-static double lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = {
- { 1.0, 0.0, 0.0 }, { 0.6, 0.4, 0 }, { 0.45, 0.35, 0.20 }
-};
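-// The table above is indexed by [pyramid_height - 2][dist2top]: e.g. a
-// height-4 pyramid redistributes the leaf-level savings as 45/35/20
-// percent across its three internal ARF levels (topmost first), each
-// share then divided by the node count at that level.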
-#endif // NEW_MULTI_LVL_BOOST_VBR_ALLOC
-#endif // USE_SYMM_MULTI_LAYER
-static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
- double group_error, int gf_arf_bits) {
- RATE_CONTROL *const rc = &cpi->rc;
- const AV1EncoderConfig *const oxcf = &cpi->oxcf;
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
- int i;
- int frame_index = 0;
- int key_frame;
- const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
- int64_t total_group_bits = gf_group_bits;
- int ext_arf_boost[MAX_EXT_ARFS];
-
- define_gf_group_structure(cpi);
-
- av1_zero_array(ext_arf_boost, MAX_EXT_ARFS);
-
- key_frame = cpi->common.current_frame.frame_type == KEY_FRAME;
-
- // For key frames the frame target rate is already set and it
- // is also the golden frame.
- // === [frame_index == 0] ===
- if (!key_frame) {
- if (rc->source_alt_ref_active)
- gf_group->bit_allocation[frame_index] = 0;
- else
- gf_group->bit_allocation[frame_index] = gf_arf_bits;
-
- // Step over the golden frame / overlay frame
- FIRSTPASS_STATS frame_stats;
- if (EOF == input_stats(twopass, &frame_stats)) return;
- }
-
- // Deduct the boost bits for arf (or gf if it is not a key frame)
- // from the group total.
- if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
-
- frame_index++;
-
- // Store the bits to spend on the ARF if there is one.
- // === [frame_index == 1] ===
- if (rc->source_alt_ref_pending) {
- gf_group->bit_allocation[frame_index] = gf_arf_bits;
-
- ++frame_index;
-
-    // Skip all the extra-ARFs right after the ARF at the starting segment of
-    // the current GF group.
- if (cpi->num_extra_arfs) {
- while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
- ++frame_index;
- }
- }
-
-#if USE_SYMM_MULTI_LAYER
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
- // Save.
- const int tmp_frame_index = frame_index;
- int budget_reduced_from_leaf_level = 0;
-#endif // NEW_MULTI_LVL_BOOST_VBR_ALLOC
-#endif // USE_SYMM_MULTI_LAYER
-
- // Allocate bits to the other frames in the group.
- const int normal_frames =
- rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending);
-
- for (i = 0; i < normal_frames; ++i) {
- FIRSTPASS_STATS frame_stats;
- if (EOF == input_stats(twopass, &frame_stats)) break;
-
- const double modified_err =
- calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
- const double err_fraction =
- (group_error > 0) ? modified_err / DOUBLE_DIVIDE_CHECK(group_error)
- : 0.0;
- const int target_frame_size =
- clamp((int)((double)total_group_bits * err_fraction), 0,
- AOMMIN(max_bits, (int)total_group_bits));
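-    // Worked example for the split above (illustrative numbers): with
-    // total_group_bits = 250000, group_error = 120.0 and modified_err =
-    // 12.0 for this frame, err_fraction = 0.1 and target_frame_size =
-    // clamp(25000, 0, AOMMIN(max_bits, 250000)).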
-
- if (gf_group->update_type[frame_index] == BRF_UPDATE) {
- // Boost up the allocated bits on BWDREF_FRAME
- gf_group->bit_allocation[frame_index] =
- target_frame_size + (target_frame_size >> 2);
- } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) {
-      // Reduce the allocated bits on LAST_BIPRED_UPDATE frames
- gf_group->bit_allocation[frame_index] =
- target_frame_size - (target_frame_size >> 1);
- } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) {
- // TODO(zoeliu): To investigate whether the allocated bits on
- // BIPRED_UPDATE frames need to be further adjusted.
- gf_group->bit_allocation[frame_index] = target_frame_size;
-#if USE_SYMM_MULTI_LAYER
- } else if (cpi->new_bwdref_update_rule &&
- gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
-      assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
-             "invalid height for a pyramid structure");
-
- const int arf_pos = gf_group->arf_pos_in_gf[frame_index];
- gf_group->bit_allocation[frame_index] = 0;
-
- gf_group->bit_allocation[arf_pos] = target_frame_size;
- // Note: Boost, if needed, is added in the next loop.
-#endif // USE_SYMM_MULTI_LAYER
- } else {
- assert(gf_group->update_type[frame_index] == LF_UPDATE ||
- gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
- gf_group->bit_allocation[frame_index] = target_frame_size;
-#if MULTI_LVL_BOOST_VBR_CQ
- if (cpi->new_bwdref_update_rule) {
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
- const int this_budget_reduction =
- (int)(target_frame_size * LEAF_REDUCTION_FACTOR);
- gf_group->bit_allocation[frame_index] -= this_budget_reduction;
- budget_reduced_from_leaf_level += this_budget_reduction;
-#else
- gf_group->bit_allocation[frame_index] -= (target_frame_size >> 1);
-#endif // NEW_MULTI_LVL_BOOST_VBR_ALLOC
- }
-#endif // MULTI_LVL_BOOST_VBR_CQ
- }
-
- ++frame_index;
-
-    // Skip all the extra-ARFs.
- if (cpi->num_extra_arfs) {
- while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
- ++frame_index;
- }
- }
-
-#if USE_SYMM_MULTI_LAYER
-#if MULTI_LVL_BOOST_VBR_CQ
- if (budget_reduced_from_leaf_level > 0) {
- // Restore.
- frame_index = tmp_frame_index;
-
- // Re-distribute this extra budget to overlay frames in the group.
- for (i = 0; i < normal_frames; ++i) {
- if (cpi->new_bwdref_update_rule &&
- gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
-        assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
-               "invalid height for a pyramid structure");
- const int arf_pos = gf_group->arf_pos_in_gf[frame_index];
- const int this_lvl = gf_group->pyramid_level[arf_pos];
- const int dist2top = gf_group->pyramid_height - 1 - this_lvl;
-#if NEW_MULTI_LVL_BOOST_VBR_ALLOC
- const double lvl_boost_factor =
- lvl_budget_factor[gf_group->pyramid_height - 2][dist2top];
- const int extra_size =
- (int)(budget_reduced_from_leaf_level * lvl_boost_factor /
- gf_group->pyramid_lvl_nodes[this_lvl]);
-#else
- const int target_frame_size = gf_group->bit_allocation[arf_pos];
- const int extra_size = target_frame_size >> dist2top;
-#endif // NEW_MULTI_LVL_BOOST_VBR_ALLOC
- gf_group->bit_allocation[arf_pos] += extra_size;
- }
- ++frame_index;
-
-      // Skip all the extra-ARFs.
- if (cpi->num_extra_arfs) {
- while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
- ++frame_index;
- }
- }
- }
-#endif // MULTI_LVL_BOOST_VBR_CQ
-#endif // USE_SYMM_MULTI_LAYER
-
-#if USE_SYMM_MULTI_LAYER
- if (cpi->new_bwdref_update_rule == 0 && rc->source_alt_ref_pending) {
-#else
- if (rc->source_alt_ref_pending) {
-#endif
- if (cpi->num_extra_arfs) {
- // NOTE: For bit allocation, move the allocated bits associated with
- // INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE.
-      // i > 0 for extra-ARFs and i == 0 for the ARF:
- // arf_pos_for_ovrly[i]: Position for INTNL_OVERLAY_UPDATE
- // arf_pos_in_gf[i]: Position for INTNL_ARF_UPDATE
- for (i = cpi->num_extra_arfs; i > 0; --i) {
- assert(gf_group->update_type[cpi->arf_pos_for_ovrly[i]] ==
- INTNL_OVERLAY_UPDATE);
-
- // Encoder's choice:
-        // Set show_existing_frame == 1 for all extra-ARFs, and hence
-        // allocate zero bits for all internal OVERLAY frames.
- gf_group->bit_allocation[cpi->arf_pos_in_gf[i]] =
- gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]];
- gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]] = 0;
- }
- }
- }
-}
-
-// Returns true if both the KF group and the GF group are almost completely
-// static.
-static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) {
- return (gf_zero_motion >= 0.995) &&
- (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
-}
-
-// Analyse and define a gf/arf group.
-static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
- AV1_COMMON *const cm = &cpi->common;
- RATE_CONTROL *const rc = &cpi->rc;
- AV1EncoderConfig *const oxcf = &cpi->oxcf;
- TWO_PASS *const twopass = &cpi->twopass;
- FIRSTPASS_STATS next_frame;
- const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
- int i;
-
- double boost_score = 0.0;
-#if !CONFIG_FIX_GF_LENGTH
- double old_boost_score = 0.0;
- double mv_ratio_accumulator_thresh;
- int active_max_gf_interval;
- int active_min_gf_interval;
-#endif
- double gf_group_err = 0.0;
-#if GROUP_ADAPTIVE_MAXQ
- double gf_group_raw_error = 0.0;
-#endif
- double gf_group_skip_pct = 0.0;
- double gf_group_inactive_zone_rows = 0.0;
- double gf_first_frame_err = 0.0;
- double mod_frame_err = 0.0;
-
- double mv_ratio_accumulator = 0.0;
- double decay_accumulator = 1.0;
- double zero_motion_accumulator = 1.0;
-
- double loop_decay_rate = 1.00;
- double last_loop_decay_rate = 1.00;
-
- double this_frame_mv_in_out = 0.0;
- double mv_in_out_accumulator = 0.0;
- double abs_mv_in_out_accumulator = 0.0;
-
- unsigned int allow_alt_ref = is_altref_enabled(cpi);
-
- int f_boost = 0;
- int b_boost = 0;
- int flash_detected;
- int64_t gf_group_bits;
- double gf_group_error_left;
- int gf_arf_bits;
- const int is_key_frame = frame_is_intra_only(cm);
- const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
-
- cpi->extra_arf_allowed = 1;
-
- // Reset the GF group data structures unless this is a key
- // frame in which case it will already have been done.
- if (is_key_frame == 0) {
- av1_zero(twopass->gf_group);
- }
-
- aom_clear_system_state();
- av1_zero(next_frame);
-
- // Load stats for the current frame.
- mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
-
- // Note the error of the frame at the start of the group. This will be
- // the GF frame error if we code a normal gf.
- gf_first_frame_err = mod_frame_err;
-
- // If this is a key frame or the overlay from a previous arf then
- // the error score / cost of this frame has already been accounted for.
- if (arf_active_or_kf) {
- gf_group_err -= gf_first_frame_err;
-#if GROUP_ADAPTIVE_MAXQ
- gf_group_raw_error -= this_frame->coded_error;
-#endif
- gf_group_skip_pct -= this_frame->intra_skip_pct;
- gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
- }
-#if !CONFIG_FIX_GF_LENGTH
- // Motion breakout threshold for loop below depends on image size.
- mv_ratio_accumulator_thresh =
- (cpi->initial_height + cpi->initial_width) / 4.0;
- // Set a maximum and minimum interval for the GF group.
- // If the image appears almost completely static we can extend beyond this.
- {
- int int_max_q = (int)(av1_convert_qindex_to_q(
- twopass->active_worst_quality, cpi->common.seq_params.bit_depth));
- int int_lbq = (int)(av1_convert_qindex_to_q(
- rc->last_boosted_qindex, cpi->common.seq_params.bit_depth));
-
- active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200);
- if (active_min_gf_interval > rc->max_gf_interval)
- active_min_gf_interval = rc->max_gf_interval;
-
- // The value chosen depends on the active Q range. At low Q we have
- // bits to spare and are better with a smaller interval and smaller boost.
- // At high Q when there are few bits to spare we are better with a longer
- // interval to spread the cost of the GF.
- active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
-
- // We have: active_min_gf_interval <= rc->max_gf_interval
- if (active_max_gf_interval < active_min_gf_interval)
- active_max_gf_interval = active_min_gf_interval;
- else if (active_max_gf_interval > rc->max_gf_interval)
- active_max_gf_interval = rc->max_gf_interval;
- }
-#endif // !CONFIG_FIX_GF_LENGTH
- double avg_sr_coded_error = 0;
- double avg_raw_err_stdev = 0;
- int non_zero_stdev_count = 0;
-
- i = 0;
- while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
- ++i;
-
- // Accumulate error score of frames in this gf group.
- mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
- gf_group_err += mod_frame_err;
-#if GROUP_ADAPTIVE_MAXQ
- gf_group_raw_error += this_frame->coded_error;
-#endif
- gf_group_skip_pct += this_frame->intra_skip_pct;
- gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
-
- if (EOF == input_stats(twopass, &next_frame)) break;
-
-    // Test for the case where there is a brief flash but prediction
-    // quality from an earlier frame is then restored.
- flash_detected = detect_flash(twopass, 0);
-
-    // Update the motion-related elements of the boost calculation.
- accumulate_frame_motion_stats(
- &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
- &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
-    // Sum up the metric values of the current gf group.
- avg_sr_coded_error += next_frame.sr_coded_error;
- if (fabs(next_frame.raw_error_stdev) > 0.000001) {
- non_zero_stdev_count++;
- avg_raw_err_stdev += next_frame.raw_error_stdev;
- }
-
- // Accumulate the effect of prediction quality decay.
- if (!flash_detected) {
- last_loop_decay_rate = loop_decay_rate;
- loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
-
- decay_accumulator = decay_accumulator * loop_decay_rate;
-
- // Monitor for static sections.
- if ((rc->frames_since_key + i - 1) > 1) {
- zero_motion_accumulator = AOMMIN(
- zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
- }
-
- // Break clause to detect very still sections after motion. For example,
- // a static image after a fade or other transition.
- if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
- last_loop_decay_rate)) {
- allow_alt_ref = 0;
- break;
- }
- }
-
- // Calculate a boost number for this frame.
- boost_score +=
- decay_accumulator *
- calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
-#if CONFIG_FIX_GF_LENGTH
- // If almost totally static, we will not use the FIXED_GF_LENGTH later, so
- // we can continue for more frames.
- if (i >= (FIXED_GF_LENGTH + 1) &&
- !is_almost_static(zero_motion_accumulator,
- twopass->kf_zeromotion_pct)) {
- break;
- }
-#else
- // Break out conditions.
- // Break at maximum of active_max_gf_interval unless almost totally static.
- //
- // Note that the addition of a test of rc->source_alt_ref_active is
- // deliberate. The effect of this is that after a normal altref group even
- // if the material is static there will be one normal length GF group
- // before allowing longer GF groups. The reason for this is that in cases
- // such as slide shows where slides are separated by a complex transition
- // such as a fade, the arf group spanning the transition may not be coded
- // at a very high quality and hence this frame (with its overlay) is a
- // poor golden frame to use for an extended group.
- if ((i >= (active_max_gf_interval + arf_active_or_kf) &&
- ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) ||
- (
- // Don't break out with a very short interval.
- (i >= active_min_gf_interval + arf_active_or_kf) &&
- (!flash_detected) &&
- ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
- (abs_mv_in_out_accumulator > 3.0) ||
- (mv_in_out_accumulator < -2.0) ||
- ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
- // If GF group interval is < 12, we force it to be 8. Otherwise,
- // if it is >= 12, we keep it as is.
- // NOTE: 'i' is 1 more than the GF group interval candidate that is being
- // checked.
- if (i == (8 + 1) || i >= (12 + 1)) {
- boost_score = old_boost_score;
- break;
- }
- }
- old_boost_score = boost_score;
-#endif // CONFIG_FIX_GF_LENGTH
- *this_frame = next_frame;
- }
- twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
-
- // Was the group length constrained by the requirement for a new KF?
- rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
-
- const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
- : cpi->common.MBs;
- assert(num_mbs > 0);
- if (i) avg_sr_coded_error /= i;
-
- if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
-
- // Disable extra altrefs and backward refs for "still" gf group:
- // zero_motion_accumulator: minimum percentage of (0,0) motion;
- // avg_sr_coded_error: average of the SSE per pixel of each frame;
- // avg_raw_err_stdev: average of the standard deviation of (0,0)
- // motion error per block of each frame.
- const int disable_bwd_extarf =
- (zero_motion_accumulator > MIN_ZERO_MOTION &&
- avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
- avg_raw_err_stdev < MAX_RAW_ERR_VAR);
-
- if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
-
- const int use_alt_ref =
- !is_almost_static(zero_motion_accumulator, twopass->kf_zeromotion_pct) &&
- allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
- (i >= rc->min_gf_interval);
-
-#define REDUCE_GF_LENGTH_THRESH 4
-#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
-#define REDUCE_GF_LENGTH_BY 1
- int alt_offset = 0;
-#if REDUCE_LAST_GF_LENGTH
-  // The length reduction strategy is tuned using AOM_Q mode, and doesn't
-  // work for VBR mode.
-  // Also, we don't do the adjustment for lossless mode.
- const int allow_gf_length_reduction =
- (cpi->oxcf.rc_mode == AOM_Q || cpi->extra_arf_allowed == 0) &&
- !is_lossless_requested(&cpi->oxcf);
-
- if (allow_gf_length_reduction && use_alt_ref) {
-    // Adjust the length of this gf group if one of the following conditions
-    // is met:
-    // 1: only one overlay frame is left and this gf group is too long
-    // 2: the next gf group would be too short to have an arf compared to
-    //    the current gf group
-
- // maximum length of next gf group
- const int next_gf_len = rc->frames_to_key - i;
- const int single_overlay_left =
- next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
-    // the next gf is probably going to have an ARF but it will be shorter
-    // than this gf
- const int unbalanced_gf =
- i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
- next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
- next_gf_len + 1 >= rc->min_gf_interval;
-
- if (single_overlay_left || unbalanced_gf) {
-      // Note: Tried roll_back = DIVIDE_AND_ROUND(i, 8), but it does not work
-      // better in the current setting
- const int roll_back = REDUCE_GF_LENGTH_BY;
- alt_offset = -roll_back;
- i -= roll_back;
- }
- }
-#endif // REDUCE_LAST_GF_LENGTH
-
-  // Should we use the alternate reference frame?
- if (use_alt_ref) {
- // Calculate the boost for alt ref.
- rc->gfu_boost =
- calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost);
- rc->source_alt_ref_pending = 1;
-
-    // Do not replace the ARF with an overlay frame; keep it as GOLDEN_REF.
- cpi->preserve_arf_as_gld = 1;
- } else {
- rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
- rc->source_alt_ref_pending = 0;
- cpi->preserve_arf_as_gld = 0;
- }
-
- // Set the interval until the next gf.
- // If forward keyframes are enabled, ensure the final gf group obeys the
- // MIN_FWD_KF_INTERVAL.
- if (cpi->oxcf.fwd_kf_enabled &&
- ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) {
- if (i == rc->frames_to_key) {
- rc->baseline_gf_interval = i;
- // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
- } else if ((rc->frames_to_key - i <
- AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
- (rc->frames_to_key != i)) {
- // if possible, merge the last two gf groups
- if (rc->frames_to_key <= MAX_PYRAMID_SIZE) {
- rc->baseline_gf_interval = rc->frames_to_key;
- // if merging the last two gf groups creates a group that is too long,
- // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
- } else {
- rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
- }
- } else {
- rc->baseline_gf_interval = i - rc->source_alt_ref_pending;
- }
- } else {
- rc->baseline_gf_interval = i - rc->source_alt_ref_pending;
- }
-
-#if REDUCE_LAST_ALT_BOOST
-#define LAST_ALR_BOOST_FACTOR 0.2f
- rc->arf_boost_factor = 1.0;
- if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
- // Reduce the boost of altref in the last gf group
- if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
- rc->frames_to_key - i == 0) {
- rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
- }
- }
-#endif
-
- if (!cpi->extra_arf_allowed) {
- cpi->num_extra_arfs = 0;
- } else {
-#if USE_SYMM_MULTI_LAYER
- if (rc->baseline_gf_interval == 4 && rc->source_alt_ref_pending)
- cpi->num_extra_arfs = 1;
- else
- cpi->num_extra_arfs = get_number_of_extra_arfs(
- rc->baseline_gf_interval, rc->source_alt_ref_pending);
-#else
- // Compute how many extra alt_refs we can have
- cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
- rc->source_alt_ref_pending);
-#endif // USE_SYMM_MULTI_LAYER
- }
-
-#if !USE_SYMM_MULTI_LAYER
- // Currently a maximum of two extra ARFs are allowed.
- assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
-#endif
-
- rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-
- rc->bipred_group_interval = BFG_INTERVAL;
- // A bi-predictive frame group requires an interval of at least 2;
- // otherwise bi-predictive groups are disabled.
- if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0;
-
- // Reset the file position.
- reset_fpf_position(twopass, start_pos);
-
- // Calculate the bits to be allocated to the gf/arf group as a whole
- gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
-
-#if GROUP_ADAPTIVE_MAXQ
- // Calculate an estimate of the maxq needed for the group.
- // We are more aggressive about correcting for sections
- // where there could be significant overshoot than for easier
- // sections where we do not wish to risk creating an overshoot
- // of the allocated bit budget.
- if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
- const int vbr_group_bits_per_frame =
- (int)(gf_group_bits / rc->baseline_gf_interval);
- const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
- const double group_av_skip_pct =
- gf_group_skip_pct / rc->baseline_gf_interval;
- const double group_av_inactive_zone =
- ((gf_group_inactive_zone_rows * 2) /
- (rc->baseline_gf_interval * (double)cm->mb_rows));
-
- int tmp_q;
- // rc factor is a weight factor that corrects for local rate control drift.
- double rc_factor = 1.0;
- if (rc->rate_error_estimate > 0) {
- rc_factor = AOMMAX(RC_FACTOR_MIN,
- (double)(100 - rc->rate_error_estimate) / 100.0);
- } else {
- rc_factor = AOMMIN(RC_FACTOR_MAX,
- (double)(100 - rc->rate_error_estimate) / 100.0);
- }
- tmp_q = get_twopass_worst_quality(
- cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
- vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor);
- twopass->active_worst_quality =
- AOMMAX(tmp_q, twopass->active_worst_quality >> 1);
- }
-#endif
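/* Worked example of the rc_factor weighting above (illustrative numbers):
 * a rate_error_estimate of +20 (persistent undershoot) gives
 * rc_factor = AOMMAX(RC_FACTOR_MIN, (100 - 20) / 100.0) = 0.8 (assuming
 * RC_FACTOR_MIN <= 0.8), while an estimate of -20 (overshoot) gives
 * AOMMIN(RC_FACTOR_MAX, 1.2) = 1.2; the factor then scales
 * kfgroup_inter_fraction in the worst-quality estimate for the group. */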
-
- // Calculate the extra bits to be used for boosted frame(s)
- gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost,
- gf_group_bits);
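/* A minimal sketch of the chunk-based split assumed to live inside
 * calculate_boost_bits() (the function body is not part of this hunk, so
 * the exact clamping is an assumption): each regular frame weighs 100
 * "chunks", the boosted frame weighs `boost` chunks, and the boosted
 * frame takes its proportional share of the group budget. */
static int boost_bits_sketch(int frame_count, int boost,
                             int64_t total_group_bits) {
  if (frame_count <= 0 || boost <= 0 || total_group_bits <= 0) return 0;
  const int allocation_chunks = frame_count * 100 + boost;
  return (int)((int64_t)boost * total_group_bits / allocation_chunks);
}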
-
- // Adjust KF group bits and error remaining.
- twopass->kf_group_error_left -= (int64_t)gf_group_err;
-
- // If this is an arf update we want to remove the score for the overlay
- // frame at the end which will usually be very cheap to code.
- // The overlay frame has already, in effect, been coded so we want to spread
- // the remaining bits among the other frames.
- // For normal GFs remove the score for the GF itself unless this is
- // also a key frame in which case it has already been accounted for.
- if (rc->source_alt_ref_pending) {
- gf_group_error_left = gf_group_err - mod_frame_err;
- } else if (is_key_frame == 0) {
- gf_group_error_left = gf_group_err - gf_first_frame_err;
- } else {
- gf_group_error_left = gf_group_err;
- }
-
- // Allocate bits to each of the frames in the GF group.
- allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits);
-
- // Reset the file position.
- reset_fpf_position(twopass, start_pos);
-
- // Calculate a section intra ratio used in setting max loop filter.
- if (cpi->common.current_frame.frame_type != KEY_FRAME) {
- twopass->section_intra_rating = calculate_section_intra_ratio(
- start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
- }
-}
-
-// Threshold for use of the lagging second reference frame. High second ref
-// usage may point to a transient event like a flash or occlusion rather than
-// a real scene cut.
-#define SECOND_REF_USEAGE_THRESH 0.1
-// Minimum % intra coding observed in first pass (1.0 = 100%)
-#define MIN_INTRA_LEVEL 0.25
-// Minimum ratio between the % of intra coding and inter coding in the first
-// pass after discounting neutral blocks (discounting neutral blocks in this
-// way helps catch scene cuts in clips with very flat areas or letter box
-// format clips with image padding).
-#define INTRA_VS_INTER_THRESH 2.0
-// Hard threshold where the first pass chooses intra for almost all blocks.
-// In such a case even if the frame is not a scene cut coding a key frame
-// may be a good option.
-#define VERY_LOW_INTER_THRESH 0.05
-// Maximum threshold for the relative ratio of intra error score vs best
-// inter error score.
-#define KF_II_ERR_THRESHOLD 2.5
-// In real scene cuts there is almost always a sharp change in the intra
-// or inter error score.
-#define ERR_CHANGE_THRESHOLD 0.4
-// For real scene cuts we expect an improvement in the intra inter error
-// ratio in the next frame.
-#define II_IMPROVEMENT_THRESHOLD 3.5
-#define KF_II_MAX 128.0
-
-static int test_candidate_kf(TWO_PASS *twopass,
- const FIRSTPASS_STATS *last_frame,
- const FIRSTPASS_STATS *this_frame,
- const FIRSTPASS_STATS *next_frame) {
- int is_viable_kf = 0;
- double pcnt_intra = 1.0 - this_frame->pcnt_inter;
- double modified_pcnt_inter =
- this_frame->pcnt_inter - this_frame->pcnt_neutral;
-
- // Does the frame satisfy the primary criteria of a key frame?
- // See above for an explanation of the test criteria.
- // If so, then examine how well it predicts subsequent frames.
- if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
- (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
- ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
- ((pcnt_intra > MIN_INTRA_LEVEL) &&
- (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
- ((this_frame->intra_error /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
- KF_II_ERR_THRESHOLD) &&
- ((fabs(last_frame->coded_error - this_frame->coded_error) /
- DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
- ERR_CHANGE_THRESHOLD) ||
- (fabs(last_frame->intra_error - this_frame->intra_error) /
- DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
- ERR_CHANGE_THRESHOLD) ||
- ((next_frame->intra_error /
- DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
- II_IMPROVEMENT_THRESHOLD))))) {
- int i;
- const FIRSTPASS_STATS *start_pos = twopass->stats_in;
- FIRSTPASS_STATS local_next_frame = *next_frame;
- double boost_score = 0.0;
- double old_boost_score = 0.0;
- double decay_accumulator = 1.0;
-
- // Examine how well the key frame predicts subsequent frames.
- for (i = 0; i < 16; ++i) {
- double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
- DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
-
- if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
-
- // Cumulative effect of decay in prediction quality.
- if (local_next_frame.pcnt_inter > 0.85)
- decay_accumulator *= local_next_frame.pcnt_inter;
- else
- decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
-
- // Keep a running total.
- boost_score += (decay_accumulator * next_iiratio);
-
- // Test various breakout clauses.
- if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
- (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
- 0.20) &&
- (next_iiratio < 3.0)) ||
- ((boost_score - old_boost_score) < 3.0) ||
- (local_next_frame.intra_error < 200)) {
- break;
- }
-
- old_boost_score = boost_score;
-
- // Get the next frame details
- if (EOF == input_stats(twopass, &local_next_frame)) break;
- }
-
- // If there is tolerable prediction for at least the next 3 frames then
- // accept this as a key frame, else discard it and move on
- if (boost_score > 30.0 && (i > 3)) {
- is_viable_kf = 1;
- } else {
- // Reset the file position
- reset_fpf_position(twopass, start_pos);
-
- is_viable_kf = 0;
- }
- }
-
- return is_viable_kf;
-}
-
-#define FRAMES_TO_CHECK_DECAY 8
-
-static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
- int i, j;
- RATE_CONTROL *const rc = &cpi->rc;
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
- const AV1EncoderConfig *const oxcf = &cpi->oxcf;
- const FIRSTPASS_STATS first_frame = *this_frame;
- const FIRSTPASS_STATS *const start_position = twopass->stats_in;
- FIRSTPASS_STATS next_frame;
- FIRSTPASS_STATS last_frame;
- int kf_bits = 0;
- int loop_decay_counter = 0;
- double decay_accumulator = 1.0;
- double av_decay_accumulator = 0.0;
- double zero_motion_accumulator = 1.0;
- double boost_score = 0.0;
- double kf_mod_err = 0.0;
- double kf_group_err = 0.0;
- double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
-
- av1_zero(next_frame);
-
- cpi->common.current_frame.frame_type = KEY_FRAME;
- rc->frames_since_key = 0;
-
- // Reset the GF group data structures.
- av1_zero(*gf_group);
-
- // Is this a key frame forced by the key frame interval?
- rc->this_key_frame_forced = rc->next_key_frame_forced;
-
- // Clear the alt ref active flag and last group multi arf flags as they
- // can never be set for a key frame.
- rc->source_alt_ref_active = 0;
-
- // KF is always a GF so clear frames till next gf counter.
- rc->frames_till_gf_update_due = 0;
-
- rc->frames_to_key = 1;
-
- twopass->kf_group_bits = 0; // Total bits available to kf group
- twopass->kf_group_error_left = 0; // Group modified error score.
-
- kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
-
- // Initialize the decay rates for the recent frames to check
- for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
-
- // Find the next keyframe.
- i = 0;
- while (twopass->stats_in < twopass->stats_in_end &&
- rc->frames_to_key < cpi->oxcf.key_freq) {
- // Accumulate kf group error.
- kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
-
- // Load the next frame's stats.
- last_frame = *this_frame;
- input_stats(twopass, this_frame);
-
- // Provided that we are not at the end of the file...
- if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
- double loop_decay_rate;
-
- // Check for a scene cut.
- if (test_candidate_kf(twopass, &last_frame, this_frame,
- twopass->stats_in))
- break;
-
- // How fast is the prediction quality decaying?
- loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
-
- // Here we are interested in decay over the recent past, rather than,
- // as elsewhere, in the decay in prediction quality since the last GF
- // or KF.
- recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
- decay_accumulator = 1.0;
- for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
- decay_accumulator *= recent_loop_decay[j];
-
- // Special check for transition or high motion followed by a
- // static scene.
- if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
- loop_decay_rate, decay_accumulator))
- break;
-
- // Step on to the next frame.
- ++rc->frames_to_key;
-
- // If we don't have a real key frame within the next two
- // key_freq intervals then break out of the loop.
- if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break;
- } else {
- ++rc->frames_to_key;
- }
- ++i;
- }
-
- // If there is a max kf interval set by the user we must obey it.
- // We already break out of the loop above at 2x max.
- // This code centers the extra kf if the actual natural interval
- // is between 1x and 2x.
- if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
- FIRSTPASS_STATS tmp_frame = first_frame;
-
- rc->frames_to_key /= 2;
-
- // Reset to the start of the group.
- reset_fpf_position(twopass, start_position);
-
- kf_group_err = 0.0;
-
- // Rescan to get the correct error data for the forced kf group.
- for (i = 0; i < rc->frames_to_key; ++i) {
- kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
- input_stats(twopass, &tmp_frame);
- }
- rc->next_key_frame_forced = 1;
- } else if (twopass->stats_in == twopass->stats_in_end ||
- rc->frames_to_key >= cpi->oxcf.key_freq) {
- rc->next_key_frame_forced = 1;
- } else {
- rc->next_key_frame_forced = 0;
- }
-
- // Special case for the last key frame of the file.
- if (twopass->stats_in >= twopass->stats_in_end) {
- // Accumulate kf group error.
- kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
- }
-
- // Calculate the number of bits that should be assigned to the kf group.
- if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
- // Maximum number of bits for a single normal frame (not key frame).
- const int max_bits = frame_max_bits(rc, &cpi->oxcf);
-
- // Maximum number of bits allocated to the key frame group.
- int64_t max_grp_bits;
-
- // Default allocation based on bits left and relative
- // complexity of the section.
- twopass->kf_group_bits = (int64_t)(
- twopass->bits_left * (kf_group_err / twopass->modified_error_left));
-
- // Clip based on maximum per frame rate defined by the user.
- max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
- if (twopass->kf_group_bits > max_grp_bits)
- twopass->kf_group_bits = max_grp_bits;
- } else {
- twopass->kf_group_bits = 0;
- }
- twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
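/* Worked example of the proportional split above (illustrative numbers):
 * with bits_left = 8,000,000, kf_group_err = 25.0 and
 * modified_error_left = 100.0, the kf group receives
 * 8,000,000 * (25.0 / 100.0) = 2,000,000 bits, which is then clipped to
 * frames_to_key * max_bits. */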
-
- // Reset the first pass file position.
- reset_fpf_position(twopass, start_position);
-
- // Scan through the kf group collating various stats used to determine
- // how many bits to spend on it.
- decay_accumulator = 1.0;
- boost_score = 0.0;
- const double kf_max_boost =
- cpi->oxcf.rc_mode == AOM_Q
- ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
- KF_MAX_FRAME_BOOST)
- : KF_MAX_FRAME_BOOST;
- for (i = 0; i < (rc->frames_to_key - 1); ++i) {
- if (EOF == input_stats(twopass, &next_frame)) break;
-
- // Monitor for static sections.
- // For the first frame in kf group, the second ref indicator is invalid.
- if (i > 0) {
- zero_motion_accumulator = AOMMIN(
- zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
- } else {
- zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion;
- }
-
- // Not all frames in the group are necessarily used in calculating boost.
- if ((i <= rc->max_gf_interval) ||
- ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
- const double frame_boost =
- calc_frame_boost(cpi, this_frame, 0, kf_max_boost);
-
- // How fast is prediction quality decaying?
- if (!detect_flash(twopass, 0)) {
- const double loop_decay_rate =
- get_prediction_decay_rate(cpi, &next_frame);
- decay_accumulator *= loop_decay_rate;
- decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR);
- av_decay_accumulator += decay_accumulator;
- ++loop_decay_counter;
- }
- boost_score += (decay_accumulator * frame_boost);
- }
- }
- if (loop_decay_counter > 0)
- av_decay_accumulator /= (double)loop_decay_counter;
-
- reset_fpf_position(twopass, start_position);
-
- // Store the zero motion percentage
- twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
-
- // Calculate a section intra ratio used in setting max loop filter.
- twopass->section_intra_rating = calculate_section_intra_ratio(
- start_position, twopass->stats_in_end, rc->frames_to_key);
-
- rc->kf_boost = (int)(av_decay_accumulator * boost_score);
-
- // Special case for static / slide show content but don't apply
- // if the kf group is very short.
- if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) &&
- (rc->frames_to_key > 8)) {
- rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST);
- } else {
- // Apply various clamps for min and max boost
- rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
- rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
- }
-
- // Work out how many bits to allocate for the key frame itself.
- kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
- twopass->kf_group_bits);
- // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
- // kf_bits, twopass->kf_zeromotion_pct);
-
- // Work out the fraction of the kf group bits reserved for the inter frames
- // within the group after discounting the bits for the kf itself.
- if (twopass->kf_group_bits) {
- twopass->kfgroup_inter_fraction =
- (double)(twopass->kf_group_bits - kf_bits) /
- (double)twopass->kf_group_bits;
- } else {
- twopass->kfgroup_inter_fraction = 1.0;
- }
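/* For example (illustrative numbers): kf_group_bits = 2,000,000 and
 * kf_bits = 400,000 give kfgroup_inter_fraction = 1,600,000 / 2,000,000
 * = 0.8, i.e. 80% of the group budget remains for the inter frames. */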
-
- twopass->kf_group_bits -= kf_bits;
-
- // Save the bits to spend on the key frame.
- gf_group->bit_allocation[0] = kf_bits;
- gf_group->update_type[0] = KF_UPDATE;
- gf_group->rf_level[0] = KF_STD;
-
- // Note the total error score of the kf group minus the key frame itself.
- twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
-
- // Adjust the count of total modified error left.
- // The count of bits left is adjusted elsewhere based on real coded frame
- // sizes.
- twopass->modified_error_left -= kf_group_err;
-}
-
-void av1_configure_buffer_updates_firstpass(AV1_COMP *cpi,
- FRAME_UPDATE_TYPE update_type) {
- RATE_CONTROL *rc = &cpi->rc;
-
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
-
- rc->is_bwd_ref_frame = 0;
-
- switch (update_type) {
- case ARF_UPDATE:
- cpi->refresh_alt_ref_frame = 1;
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
-
- rc->is_src_frame_alt_ref = 0;
- break;
- case INTNL_ARF_UPDATE:
- cpi->refresh_alt2_ref_frame = 1;
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
- rc->is_src_frame_alt_ref = 0;
- rc->is_src_frame_ext_arf = 0;
-
- break;
- case BIPRED_UPDATE:
- cpi->refresh_bwd_ref_frame = 1;
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
-
- rc->is_bwd_ref_frame = 1;
- break;
- default: break;
- }
-}
-
-static int is_skippable_frame(const AV1_COMP *cpi) {
- // If no non-zero motion vectors were detected for the current frame in
- // the first pass, and the same holds for its previous and next frames,
- // then the partition check can be skipped for this frame, and the
- // partition size is assigned according to the variance.
- const TWO_PASS *const twopass = &cpi->twopass;
-
- return (!frame_is_intra_only(&cpi->common) &&
- twopass->stats_in - 2 > twopass->stats_in_start &&
- twopass->stats_in < twopass->stats_in_end &&
- (twopass->stats_in - 1)->pcnt_inter -
- (twopass->stats_in - 1)->pcnt_motion ==
- 1 &&
- (twopass->stats_in - 2)->pcnt_inter -
- (twopass->stats_in - 2)->pcnt_motion ==
- 1 &&
- twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
-}
-
-void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
- AV1_COMMON *const cm = &cpi->common;
- CurrentFrame *const current_frame = &cm->current_frame;
- RATE_CONTROL *const rc = &cpi->rc;
- TWO_PASS *const twopass = &cpi->twopass;
- GF_GROUP *const gf_group = &twopass->gf_group;
- int frames_left;
- FIRSTPASS_STATS this_frame;
-
- int target_rate;
-
- frames_left = (int)(twopass->total_stats.count - current_frame->frame_number);
-
- if (!twopass->stats_in) return;
-
- // If this is an arf frame then we don't want to read the stats file or
- // advance the input pointer as we already have what we need.
- if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
- gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
- av1_configure_buffer_updates(cpi);
- target_rate = gf_group->bit_allocation[gf_group->index];
- target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
- rc->base_frame_target = target_rate;
-
- if (cpi->no_show_kf) {
- assert(gf_group->update_type[gf_group->index] == ARF_UPDATE);
- current_frame->frame_type = KEY_FRAME;
- } else {
- current_frame->frame_type = INTER_FRAME;
- }
-
- // Do the firstpass stats indicate that this frame is skippable for the
- // partition search?
- if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
- cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
- }
-
- return;
- }
-
- aom_clear_system_state();
-
- if (cpi->oxcf.rc_mode == AOM_Q) {
- twopass->active_worst_quality = cpi->oxcf.cq_level;
- } else if (current_frame->frame_number == 0) {
- // Special case code for first frame.
- const int section_target_bandwidth =
- (int)(twopass->bits_left / frames_left);
- const double section_length = twopass->total_left_stats.count;
- const double section_error =
- twopass->total_left_stats.coded_error / section_length;
- const double section_intra_skip =
- twopass->total_left_stats.intra_skip_pct / section_length;
- const double section_inactive_zone =
- (twopass->total_left_stats.inactive_zone_rows * 2) /
- ((double)cm->mb_rows * section_length);
- const int tmp_q = get_twopass_worst_quality(
- cpi, section_error, section_intra_skip + section_inactive_zone,
- section_target_bandwidth, DEFAULT_GRP_WEIGHT);
-
- twopass->active_worst_quality = tmp_q;
- twopass->baseline_active_worst_quality = tmp_q;
- rc->ni_av_qi = tmp_q;
- rc->last_q[INTER_FRAME] = tmp_q;
- rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
- rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
- rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
- rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
- }
-
- av1_zero(this_frame);
- if (EOF == input_stats(twopass, &this_frame)) return;
-
- // Set the frame content type flag.
- if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
- twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
- else
- twopass->fr_content_type = FC_NORMAL;
-
- // Keyframe and section processing.
- if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
- FIRSTPASS_STATS this_frame_copy;
- this_frame_copy = this_frame;
- // Define next KF group and assign bits to it.
- find_next_key_frame(cpi, &this_frame);
- this_frame = this_frame_copy;
- } else {
- current_frame->frame_type = INTER_FRAME;
- }
-
- // Define a new GF/ARF group. (Should always enter here for key frames).
- if (rc->frames_till_gf_update_due == 0) {
- define_gf_group(cpi, &this_frame);
-
- rc->frames_till_gf_update_due = rc->baseline_gf_interval;
-
-#if ARF_STATS_OUTPUT
- {
- FILE *fpfile;
- fpfile = fopen("arf.stt", "a");
- ++arf_count;
- fprintf(fpfile, "%10d %10d %10d %10d %10d\n", current_frame->frame_number,
- rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
- rc->gfu_boost);
-
- fclose(fpfile);
- }
-#endif
- }
-
- av1_configure_buffer_updates(cpi);
-
- // Do the firstpass stats indicate that this frame is skippable for the
- // partition search?
- if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
- cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
- }
-
- target_rate = gf_group->bit_allocation[gf_group->index];
-
- if (cpi->common.current_frame.frame_type == KEY_FRAME)
- target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate);
- else
- target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
-
- rc->base_frame_target = target_rate;
-
- {
- const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
- ? cpi->initial_mbs
- : cpi->common.MBs;
- // Average the MB error values over the frame and compress the result
- // with log(1 + x).
- twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0);
- twopass->frame_avg_haar_energy =
- log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0);
- }
-
- // Update the total stats remaining structure.
- subtract_stats(&twopass->total_left_stats, &this_frame);
-}
-
-#define MINQ_ADJ_LIMIT 48
-#define MINQ_ADJ_LIMIT_CQ 20
-#define HIGH_UNDERSHOOT_RATIO 2
-void av1_twopass_postencode_update(AV1_COMP *cpi) {
- TWO_PASS *const twopass = &cpi->twopass;
- RATE_CONTROL *const rc = &cpi->rc;
- const int bits_used = rc->base_frame_target;
-
- // VBR correction is done through rc->vbr_bits_off_target. Based on the
- // sign of this value, a limited % adjustment is made to the target rate
- // of subsequent frames, to try and push it back towards 0. This method
- // is designed to prevent extreme behaviour at the end of a clip
- // or group of frames.
- rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
- twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0);
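/* A hedged sketch of how the accumulated offset feeds back into later
 * frame targets. The actual correction lives in the VBR path of
 * ratectrl.c, which is not part of this hunk; the 1/16 recovery rate and
 * the 2% cap below are illustrative assumptions, not the library's
 * constants. */
static int vbr_corrected_target_sketch(int base_target,
                                       int64_t vbr_bits_off_target) {
  // A positive offset means past frames undershot, so later frames may
  // spend a little more; a negative offset pulls targets down.
  int64_t delta = vbr_bits_off_target / 16;
  const int64_t max_delta = base_target / 50;  // limited % adjustment
  if (delta > max_delta) delta = max_delta;
  if (delta < -max_delta) delta = -max_delta;
  return (int)(base_target + delta);
}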
-
- // Calculate the pct rc error.
- if (rc->total_actual_bits) {
- rc->rate_error_estimate =
- (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
- rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
- } else {
- rc->rate_error_estimate = 0;
- }
-
- if (cpi->common.current_frame.frame_type != KEY_FRAME) {
- twopass->kf_group_bits -= bits_used;
- twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
- }
- twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
-
- // If the rate control is drifting consider adjustment to min or maxq.
- if ((cpi->oxcf.rc_mode != AOM_Q) &&
- (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
- !cpi->rc.is_src_frame_alt_ref) {
- const int maxq_adj_limit =
- rc->worst_quality - twopass->active_worst_quality;
- const int minq_adj_limit =
- (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
-
- // Undershoot.
- if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
- --twopass->extend_maxq;
- if (rc->rolling_target_bits >= rc->rolling_actual_bits)
- ++twopass->extend_minq;
- // Overshoot.
- } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
- --twopass->extend_minq;
- if (rc->rolling_target_bits < rc->rolling_actual_bits)
- ++twopass->extend_maxq;
- } else {
- // Adjustment for extreme local overshoot.
- if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
- rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
- ++twopass->extend_maxq;
-
- // Unwind undershoot or overshoot adjustment.
- if (rc->rolling_target_bits < rc->rolling_actual_bits)
- --twopass->extend_minq;
- else if (rc->rolling_target_bits > rc->rolling_actual_bits)
- --twopass->extend_maxq;
- }
-
- twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
- twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
-
- // If there is a big and unexpected undershoot then feed the extra
- // bits back in quickly. One situation where this may happen is if a
- // frame is unexpectedly almost perfectly predicted by the ARF or GF
- // but not very well predicted by the previous frame.
- if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
- int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
- if (rc->projected_frame_size < fast_extra_thresh) {
- rc->vbr_bits_off_target_fast +=
- fast_extra_thresh - rc->projected_frame_size;
- rc->vbr_bits_off_target_fast =
- AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
-
- // Fast adaptation of minQ if necessary to use up the extra bits.
- if (rc->avg_frame_bandwidth) {
- twopass->extend_minq_fast =
- (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
- }
- twopass->extend_minq_fast = AOMMIN(
- twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
- } else if (rc->vbr_bits_off_target_fast) {
- twopass->extend_minq_fast = AOMMIN(
- twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
- } else {
- twopass->extend_minq_fast = 0;
- }
- }
- }
-}
diff --git a/libaom/av1/encoder/firstpass.h b/libaom/av1/encoder/firstpass.h
index 7c40615..1b8636c 100644
--- a/libaom/av1/encoder/firstpass.h
+++ b/libaom/av1/encoder/firstpass.h
@@ -21,35 +21,7 @@
extern "C" {
#endif
-#if CONFIG_FP_MB_STATS
-
-#define FPMB_DCINTRA_MASK 0x01
-
-#define FPMB_MOTION_ZERO_MASK 0x02
-#define FPMB_MOTION_LEFT_MASK 0x04
-#define FPMB_MOTION_RIGHT_MASK 0x08
-#define FPMB_MOTION_UP_MASK 0x10
-#define FPMB_MOTION_DOWN_MASK 0x20
-
-#define FPMB_ERROR_SMALL_MASK 0x40
-#define FPMB_ERROR_LARGE_MASK 0x80
-#define FPMB_ERROR_SMALL_TH 2000
-#define FPMB_ERROR_LARGE_TH 48000
-
-typedef struct {
- uint8_t *mb_stats_start;
- uint8_t *mb_stats_end;
-} FIRSTPASS_MB_STATS;
-#endif
-
-// Length of the bi-predictive frame group (BFG)
-// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
-// number of bi-predictive frames.
-#define BFG_INTERVAL 2
-// The maximum number of extra ALTREF's except ALTREF_FRAME
-#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
-
-#define MIN_EXT_ARF_INTERVAL 4
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
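// DOUBLE_DIVIDE_CHECK nudges a possibly-zero divisor away from 0 while
// preserving its sign, so ratios such as
// intra_error / DOUBLE_DIVIDE_CHECK(coded_error) stay finite.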
#define MIN_ZERO_MOTION 0.95
#define MAX_SR_CODED_ERROR 40
@@ -59,73 +31,99 @@ typedef struct {
#define VLOW_MOTION_THRESHOLD 950
typedef struct {
+ // Frame number in display order, if stats are for a single frame.
+ // No real meaning for a collection of frames.
double frame;
+ // Weight assigned to this frame (or total weight for the collection of
+ // frames), currently based on intra factor and brightness factor. This is
+ // used to distribute bits between easier and harder frames.
double weight;
+ // Intra prediction error.
double intra_error;
+ // Average wavelet energy computed using Discrete Wavelet Transform (DWT).
double frame_avg_wavelet_energy;
+ // Best of intra pred error and inter pred error using last frame as ref.
double coded_error;
+ // Best of intra pred error and inter pred error using golden frame as ref.
double sr_coded_error;
+ // Percentage of blocks with inter pred error < intra pred error.
double pcnt_inter;
+ // Percentage of blocks using (inter prediction and) non-zero motion vectors.
double pcnt_motion;
+ // Percentage of blocks where golden frame was the best reference. That is:
+ // inter pred error using golden frame < inter pred error using last frame and
+ // inter pred error using golden frame < intra pred error
double pcnt_second_ref;
+ // Percentage of blocks where intra and inter prediction errors were very
+ // close. Note that this is a 'weighted count'; that is, the blocks may
+ // be weighted by how close the two errors were.
double pcnt_neutral;
+ // Percentage of blocks that have almost no intra error residual
+ // (i.e. are in effect completely flat and untextured in the intra
+ // domain). In natural videos this is uncommon, but it is much more
+ // common in animations, graphics and screen content, so may be used
+ // as a signal to detect these types of content.
double intra_skip_pct;
- double inactive_zone_rows; // Image mask rows top and bottom.
- double inactive_zone_cols; // Image mask columns at left and right edges.
+ // Image mask rows top and bottom.
+ double inactive_zone_rows;
+ // Image mask columns at left and right edges.
+ double inactive_zone_cols;
+ // Mean of row motion vectors.
double MVr;
+ // Mean of absolute value of row motion vectors.
double mvr_abs;
+ // Mean of column motion vectors.
double MVc;
+ // Mean of absolute value of column motion vectors.
double mvc_abs;
+ // Variance of row motion vectors.
double MVrv;
+ // Variance of column motion vectors.
double MVcv;
+ // Value in range [-1,1] indicating fraction of row and column motion vectors
+ // that point inwards (negative MV value) or outwards (positive MV value).
+ // For example, a value of 1 indicates all row/column MVs are inwards.
double mv_in_out_count;
+ // Count of unique non-zero motion vectors.
double new_mv_count;
+ // Duration of the frame / collection of frames.
double duration;
+ // 1.0 if stats are for a single frame; otherwise, the number of frames
+ // whose stats are accumulated in this collection.
double count;
// standard deviation for (0, 0) motion prediction error
double raw_error_stdev;
} FIRSTPASS_STATS;
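/* A minimal sketch of how a single frame's stats fold into a collection
 * (firstpass.c has sum/subtract helpers in this spirit; only two
 * representative fields are shown here): */
static void accumulate_stats_sketch(FIRSTPASS_STATS *section,
                                    const FIRSTPASS_STATS *frame) {
  section->coded_error += frame->coded_error;
  section->count += frame->count;  // count is 1.0 for a single frame
}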
-typedef enum {
- KF_UPDATE = 0,
- LF_UPDATE = 1,
- GF_UPDATE = 2,
- ARF_UPDATE = 3,
- OVERLAY_UPDATE = 4,
- BRF_UPDATE = 5, // Backward Reference Frame
- LAST_BIPRED_UPDATE = 6, // Last Bi-predictive Frame
- BIPRED_UPDATE = 7, // Bi-predictive Frame, but not the last one
- INTNL_OVERLAY_UPDATE = 8, // Internal Overlay Frame
- INTNL_ARF_UPDATE = 9, // Internal Altref Frame (candidate for ALTREF2)
- FRAME_UPDATE_TYPES = 10
-} FRAME_UPDATE_TYPE;
+enum {
+ KF_UPDATE,
+ LF_UPDATE,
+ GF_UPDATE,
+ ARF_UPDATE,
+ OVERLAY_UPDATE,
+ INTNL_OVERLAY_UPDATE, // Internal Overlay Frame
+ INTNL_ARF_UPDATE, // Internal Altref Frame
+ FRAME_UPDATE_TYPES
+} UENUM1BYTE(FRAME_UPDATE_TYPE);
#define FC_ANIMATION_THRESH 0.15
-typedef enum {
+enum {
FC_NORMAL = 0,
FC_GRAPHICS_ANIMATION = 1,
FRAME_CONTENT_TYPES = 2
-} FRAME_CONTENT_TYPE;
+} UENUM1BYTE(FRAME_CONTENT_TYPE);
typedef struct {
unsigned char index;
- RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 1];
FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1];
unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1];
unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
- unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
-#if USE_SYMM_MULTI_LAYER
unsigned char arf_pos_in_gf[MAX_STATIC_GF_GROUP_LENGTH + 1];
unsigned char pyramid_level[MAX_STATIC_GF_GROUP_LENGTH + 1];
unsigned char pyramid_height;
unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL];
-#endif // USE_SYMM_MULTI_LAYER
- unsigned char brf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1];
- unsigned char bidir_pred_enabled[MAX_STATIC_GF_GROUP_LENGTH + 1];
- unsigned char ref_fb_idx_map[MAX_STATIC_GF_GROUP_LENGTH + 1][REF_FRAMES];
- unsigned char refresh_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
- unsigned char refresh_flag[MAX_STATIC_GF_GROUP_LENGTH + 1];
int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1];
+ int size;
} GF_GROUP;
typedef struct {
@@ -144,11 +142,6 @@ typedef struct {
double mb_av_energy;
double frame_avg_haar_energy;
-#if CONFIG_FP_MB_STATS
- uint8_t *frame_mb_stats_buf;
- uint8_t *this_frame_mb_stats;
- FIRSTPASS_MB_STATS firstpass_mb_stats;
-#endif
// An indication of the content type of the current frame
FRAME_CONTENT_TYPE fr_content_type;
@@ -165,7 +158,6 @@ typedef struct {
int kf_zeromotion_pct;
int last_kfgroup_zeromotion_pct;
- int gf_zeromotion_pct;
int active_worst_quality;
int baseline_active_worst_quality;
int extend_minq;
@@ -176,30 +168,15 @@ typedef struct {
} TWO_PASS;
struct AV1_COMP;
+struct EncodeFrameParams;
+struct AV1EncoderConfig;
void av1_init_first_pass(struct AV1_COMP *cpi);
void av1_rc_get_first_pass_params(struct AV1_COMP *cpi);
-void av1_first_pass(struct AV1_COMP *cpi, const struct lookahead_entry *source);
+void av1_first_pass(struct AV1_COMP *cpi, const int64_t ts_duration);
void av1_end_first_pass(struct AV1_COMP *cpi);
-void av1_init_second_pass(struct AV1_COMP *cpi);
-void av1_rc_get_second_pass_params(struct AV1_COMP *cpi);
-void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi,
- FRAME_UPDATE_TYPE update_type);
-
-// Post encode update of the rate control parameters for 2-pass
-void av1_twopass_postencode_update(struct AV1_COMP *cpi);
-
-static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
- if (arf_pending && MAX_EXT_ARFS > 0)
- return interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)
- ? MAX_EXT_ARFS
- : interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS
- ? MAX_EXT_ARFS - 1
- : 0;
- else
- return 0;
-}
+void av1_twopass_zero_stats(FIRSTPASS_STATS *section);
#ifdef __cplusplus
} // extern "C"
diff --git a/libaom/av1/encoder/global_motion.c b/libaom/av1/encoder/global_motion.c
index e35a208..b8b13c3 100644
--- a/libaom/av1/encoder/global_motion.c
+++ b/libaom/av1/encoder/global_motion.c
@@ -32,17 +32,24 @@
#define MIN_INLIER_PROB 0.1
#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR)
-#define USE_GM_FEATURE_BASED 1
// Border over which to compute the global motion
#define ERRORADV_BORDER 0
// Number of pyramid levels in disflow computation
-#define N_LEVELS 5
+#define N_LEVELS 2
// Size of square patches in the disflow dense grid
-#define PATCH_SIZE 5
+#define PATCH_SIZE 8
+// Center point of square patch
+#define PATCH_CENTER ((PATCH_SIZE + 1) >> 1)
+// Step size between patches, lower value means greater patch overlap
+#define PATCH_STEP 1
// Minimum size of border padding for disflow
#define MIN_PAD 7
+// Warp error convergence threshold for disflow
+#define DISFLOW_ERROR_TR 0.01
+// Max number of iterations if warp convergence is not found
+#define DISFLOW_MAX_ITR 10
// Struct for an image pyramid
typedef struct {
@@ -104,7 +111,7 @@ static void convert_to_params(const double *params, int32_t *model) {
void av1_convert_model_to_params(const double *params,
WarpedMotionParams *model) {
convert_to_params(params, model->wmmat);
- model->wmtype = get_gmtype(model);
+ model->wmtype = get_wmtype(model);
model->invalid = 0;
}
@@ -237,7 +244,7 @@ int64_t av1_refine_integerized_param(WarpedMotionParams *wm,
}
}
force_wmtype(wm, wmtype);
- wm->wmtype = get_gmtype(wm);
+ wm->wmtype = get_wmtype(wm);
return best_error;
}
@@ -268,7 +275,6 @@ static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm,
return buf_8bit;
}
-#if USE_GM_FEATURE_BASED
static int compute_global_motion_feature_based(
TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
int bit_depth, int *num_inliers_by_motion, double *params_by_motion,
@@ -323,7 +329,7 @@ static int compute_global_motion_feature_based(
}
return 0;
}
-#else
+
static INLINE RansacFuncDouble
get_ransac_double_prec_type(TransformationType type) {
switch (type) {
@@ -334,6 +340,35 @@ get_ransac_double_prec_type(TransformationType type) {
}
}
+// Don't use points around the frame border since they are less reliable
+static INLINE int valid_point(int x, int y, int width, int height) {
+ return (x > (PATCH_SIZE + PATCH_CENTER)) &&
+ (x < (width - PATCH_SIZE - PATCH_CENTER)) &&
+ (y > (PATCH_SIZE + PATCH_CENTER)) &&
+ (y < (height - PATCH_SIZE - PATCH_CENTER));
+}
+
+static int determine_disflow_correspondence(int *frm_corners,
+ int num_frm_corners, double *flow_u,
+ double *flow_v, int width,
+ int height, int stride,
+ double *correspondences) {
+ int num_correspondences = 0;
+ int x, y;
+ for (int i = 0; i < num_frm_corners; ++i) {
+ x = frm_corners[2 * i];
+ y = frm_corners[2 * i + 1];
+ if (valid_point(x, y, width, height)) {
+ correspondences[4 * num_correspondences] = x;
+ correspondences[4 * num_correspondences + 1] = y;
+ correspondences[4 * num_correspondences + 2] = x + flow_u[y * stride + x];
+ correspondences[4 * num_correspondences + 3] = y + flow_v[y * stride + x];
+ num_correspondences++;
+ }
+ }
+ return num_correspondences;
+}
+
double getCubicValue(double p[4], double x) {
return p[1] + 0.5 * x *
(p[2] - p[0] +
@@ -436,21 +471,24 @@ unsigned char interpolate(unsigned char *ref, double x, double y, int width,
// Warps a block using flow vector [u, v] and computes the mse
double compute_warp_and_error(unsigned char *ref, unsigned char *frm, int width,
- int height, int stride, double u, double v) {
+ int height, int stride, int x, int y, double u,
+ double v, int16_t *dt) {
int i, j;
- double warped, x, y;
+ unsigned char warped;
+ double x_w, y_w;
double mse = 0;
- double err = 0;
- for (i = 0; i < height; ++i)
- for (j = 0; j < width; ++j) {
- x = (double)j - u;
- y = (double)i - v;
- warped = interpolate(ref, x, y, width, height, stride);
+ int16_t err = 0;
+ for (i = y; i < y + PATCH_SIZE; ++i)
+ for (j = x; j < x + PATCH_SIZE; ++j) {
+ x_w = (double)j + u;
+ y_w = (double)i + v;
+ warped = interpolate(ref, x_w, y_w, width, height, stride);
err = warped - frm[j + i * stride];
mse += err * err;
+ dt[(i - y) * PATCH_SIZE + (j - x)] = err;
}
- mse /= (width * height);
+ mse /= (PATCH_SIZE * PATCH_SIZE);
return mse;
}
@@ -465,19 +503,21 @@ double compute_warp_and_error(unsigned char *ref, unsigned char *frm, int width,
// 2.) b = |sum(dx * dt)|
// |sum(dy * dt)|
// Where the sums are computed over a square window of PATCH_SIZE.
-static INLINE void compute_flow_system(const double *dx, const double *dy,
- const double *dt, int stride, double *M,
- double *b) {
+static INLINE void compute_flow_system(const double *dx, int dx_stride,
+ const double *dy, int dy_stride,
+ const int16_t *dt, int dt_stride,
+ double *M, double *b) {
for (int i = 0; i < PATCH_SIZE; i++) {
for (int j = 0; j < PATCH_SIZE; j++) {
- M[0] += dx[i * stride + j] * dx[i * stride + j];
- M[1] += dx[i * stride + j] * dy[i * stride + j];
- M[3] += dy[i * stride + j] * dy[i * stride + j];
+ M[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j];
+ M[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j];
+ M[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j];
- b[0] += dx[i * stride + j] * dt[i * stride + j];
- b[1] += dy[i * stride + j] * dt[i * stride + j];
+ b[0] += dx[i * dx_stride + j] * dt[i * dt_stride + j];
+ b[1] += dy[i * dy_stride + j] * dt[i * dt_stride + j];
}
}
+
M[2] = M[1];
}
@@ -501,6 +541,7 @@ static INLINE void solve_2x2_system(const double *M, const double *b,
output_vec[1] = -M[2] * mult_b0 + M_0 * mult_b1;
}
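/* The solve above is the classic Lucas-Kanade step: with
 *   M = | sum dx*dx  sum dx*dy |      b = | sum dx*dt |
 *       | sum dx*dy  sum dy*dy |          | sum dy*dt |
 * the flow increment is (u, v) = M^-1 * b. A direct Cramer's-rule sketch
 * (the determinant guard here is an added assumption):
 */
static int lk_solve_sketch(const double M[4], const double b[2],
                           double *u, double *v) {
  const double det = M[0] * M[3] - M[1] * M[2];
  if (fabs(det) < 1e-12) return 0;  // ill-conditioned patch: no update
  *u = (M[3] * b[0] - M[1] * b[1]) / det;
  *v = (M[0] * b[1] - M[2] * b[0]) / det;
  return 1;
}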
+/*
static INLINE void image_difference(const uint8_t *src, int src_stride,
const uint8_t *ref, int ref_stride,
int16_t *dst, int dst_stride, int height,
@@ -515,6 +556,7 @@ static INLINE void image_difference(const uint8_t *src, int src_stride,
}
}
}
+*/
// Compute an image gradient using a sobel filter.
// If dir == 1, compute the x gradient. If dir == 0, compute y. This function
@@ -523,7 +565,7 @@ static INLINE void image_difference(const uint8_t *src, int src_stride,
static INLINE void sobel_xy_image_gradient(const uint8_t *src, int src_stride,
double *dst, int dst_stride,
int height, int width, int dir) {
- double norm = 1.0 / 8;
+ double norm = 1.0;
// TODO(sarahparker) experiment with doing this over larger block sizes
const int block_unit = 8;
// Filter in 8x8 blocks to eventually make use of optimized convolve function
@@ -606,6 +648,24 @@ static void compute_flow_pyramids(unsigned char *frm, const int frm_width,
frm_pyr->heights[0], frm_pyr->widths[0],
frm_pyr->strides[0]);
+ if (compute_grad) {
+ cur_width = frm_pyr->widths[0];
+ cur_height = frm_pyr->heights[0];
+ cur_stride = frm_pyr->strides[0];
+ cur_loc = frm_pyr->level_loc[0];
+ assert(frm_pyr->has_gradient && frm_pyr->level_dx_buffer != NULL &&
+ frm_pyr->level_dy_buffer != NULL);
+ // Compute the x gradient
+ sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+ frm_pyr->level_dx_buffer + cur_loc, cur_stride,
+ cur_height, cur_width, 1);
+
+ // Compute the y gradient
+ sobel_xy_image_gradient(frm_pyr->level_buffer + cur_loc, cur_stride,
+ frm_pyr->level_dy_buffer + cur_loc, cur_stride,
+ cur_height, cur_width, 0);
+ }
+
// Start at the finest level and resize down to the coarsest level
for (int level = 1; level < n_levels; ++level) {
update_level_dims(frm_pyr, level);
@@ -636,6 +696,86 @@ static void compute_flow_pyramids(unsigned char *frm, const int frm_width,
}
}
+static INLINE void compute_flow_at_point(unsigned char *frm, unsigned char *ref,
+ double *dx, double *dy, int x, int y,
+ int width, int height, int stride,
+ double *u, double *v) {
+ double M[4] = { 0 };
+ double b[2] = { 0 };
+ double tmp_output_vec[2] = { 0 };
+ double error = 0;
+ int16_t dt[PATCH_SIZE * PATCH_SIZE];
+ double o_u = *u;
+ double o_v = *v;
+
+ for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+ error = compute_warp_and_error(ref, frm, width, height, stride, x, y, *u,
+ *v, dt);
+ if (error <= DISFLOW_ERROR_TR) break;
+ compute_flow_system(dx, stride, dy, stride, dt, PATCH_SIZE, M, b);
+ solve_2x2_system(M, b, tmp_output_vec);
+ *u += tmp_output_vec[0];
+ *v += tmp_output_vec[1];
+ }
+ if (fabs(*u - o_u) > PATCH_SIZE || fabs(*v - o_v) > PATCH_SIZE) {
+ *u = o_u;
+ *v = o_v;
+ }
+}
+
+// Note: flow_u and flow_v must start zeroed (the caller memsets them to 0).
+static void compute_flow_field(ImagePyramid *frm_pyr, ImagePyramid *ref_pyr,
+ double *flow_u, double *flow_v) {
+ int cur_width, cur_height, cur_stride, cur_loc, patch_loc, patch_center;
+ double *u_upscale =
+ aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+ double *v_upscale =
+ aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+
+ assert(frm_pyr->n_levels == ref_pyr->n_levels);
+
+ // Compute flow field from coarsest to finest level of the pyramid
+ for (int level = frm_pyr->n_levels - 1; level >= 0; --level) {
+ cur_width = frm_pyr->widths[level];
+ cur_height = frm_pyr->heights[level];
+ cur_stride = frm_pyr->strides[level];
+ cur_loc = frm_pyr->level_loc[level];
+
+ for (int i = PATCH_SIZE; i < cur_height - PATCH_SIZE; i += PATCH_STEP) {
+ for (int j = PATCH_SIZE; j < cur_width - PATCH_SIZE; j += PATCH_STEP) {
+ patch_loc = i * cur_stride + j;
+ patch_center = patch_loc + PATCH_CENTER * cur_stride + PATCH_CENTER;
+ compute_flow_at_point(frm_pyr->level_buffer + cur_loc,
+ ref_pyr->level_buffer + cur_loc,
+ frm_pyr->level_dx_buffer + cur_loc + patch_loc,
+ frm_pyr->level_dy_buffer + cur_loc + patch_loc, j,
+ i, cur_width, cur_height, cur_stride,
+ flow_u + patch_center, flow_v + patch_center);
+ }
+ }
+ // TODO(sarahparker) Replace this with upscale function in resize.c
+ if (level > 0) {
+ int h_upscale = frm_pyr->heights[level - 1];
+ int w_upscale = frm_pyr->widths[level - 1];
+ int s_upscale = frm_pyr->strides[level - 1];
+ for (int i = 0; i < h_upscale; ++i) {
+ for (int j = 0; j < w_upscale; ++j) {
+ u_upscale[j + i * s_upscale] =
+ flow_u[(int)(j >> 1) + (int)(i >> 1) * cur_stride];
+ v_upscale[j + i * s_upscale] =
+ flow_v[(int)(j >> 1) + (int)(i >> 1) * cur_stride];
+ }
+ }
+ memcpy(flow_u, u_upscale,
+ frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+ memcpy(flow_v, v_upscale,
+ frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+ }
+ }
+ aom_free(u_upscale);
+ aom_free(v_upscale);
+}
+
static int compute_global_motion_disflow_based(
TransformationType type, YV12_BUFFER_CONFIG *frm, YV12_BUFFER_CONFIG *ref,
int bit_depth, int *num_inliers_by_motion, double *params_by_motion,
@@ -647,6 +787,11 @@ static int compute_global_motion_disflow_based(
const int ref_width = ref->y_width;
const int ref_height = ref->y_height;
const int pad_size = AOMMAX(PATCH_SIZE, MIN_PAD);
+ int num_frm_corners;
+ int num_correspondences;
+ double *correspondences;
+ int frm_corners[2 * MAX_CORNERS];
+ RansacFuncDouble ransac = get_ransac_double_prec_type(type);
assert(frm_width == ref_width);
assert(frm_height == ref_height);
@@ -683,29 +828,63 @@ static int compute_global_motion_disflow_based(
compute_flow_pyramids(ref_buffer, ref_width, ref_height, ref->y_stride,
n_levels, pad_size, compute_gradient, ref_pyr);
- // TODO(sarahparker) Implement the rest of DISFlow, currently only the image
- // pyramid is implemented.
- (void)num_inliers_by_motion;
- (void)params_by_motion;
- (void)num_motions;
- (void)type;
+ double *flow_u =
+ aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+ double *flow_v =
+ aom_malloc(frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+
+ memset(flow_u, 0,
+ frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_u));
+ memset(flow_v, 0,
+ frm_pyr->strides[0] * frm_pyr->heights[0] * sizeof(*flow_v));
+
+ compute_flow_field(frm_pyr, ref_pyr, flow_u, flow_v);
+
+ // compute interest points in images using FAST features
+ num_frm_corners = fast_corner_detect(frm_buffer, frm_width, frm_height,
+ frm->y_stride, frm_corners, MAX_CORNERS);
+ // find correspondences between the two images using the flow field
+ correspondences = aom_malloc(num_frm_corners * 4 * sizeof(*correspondences));
+ num_correspondences = determine_disflow_correspondence(
+ frm_corners, num_frm_corners, flow_u, flow_v, frm_width, frm_height,
+ frm_pyr->strides[0], correspondences);
+ ransac(correspondences, num_correspondences, num_inliers_by_motion,
+ params_by_motion, num_motions);
+
free_pyramid(frm_pyr);
free_pyramid(ref_pyr);
+ aom_free(correspondences);
+ aom_free(flow_u);
+ aom_free(flow_v);
+ // Set num_inliers = 0 for motions with too few inliers so they are ignored.
+ for (int i = 0; i < num_motions; ++i) {
+ if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) {
+ num_inliers_by_motion[i] = 0;
+ }
+ }
+
+ // Return true if any one of the motions has inliers.
+ for (int i = 0; i < num_motions; ++i) {
+ if (num_inliers_by_motion[i] > 0) return 1;
+ }
return 0;
}
-#endif
int av1_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *frm,
YV12_BUFFER_CONFIG *ref, int bit_depth,
+ GlobalMotionEstimationType gm_estimation_type,
int *num_inliers_by_motion,
double *params_by_motion, int num_motions) {
-#if USE_GM_FEATURE_BASED
- return compute_global_motion_feature_based(type, frm, ref, bit_depth,
- num_inliers_by_motion,
- params_by_motion, num_motions);
-#else
- return compute_global_motion_disflow_based(type, frm, ref, bit_depth,
- num_inliers_by_motion,
- params_by_motion, num_motions);
-#endif
+ switch (gm_estimation_type) {
+ case GLOBAL_MOTION_FEATURE_BASED:
+ return compute_global_motion_feature_based(type, frm, ref, bit_depth,
+ num_inliers_by_motion,
+ params_by_motion, num_motions);
+ case GLOBAL_MOTION_DISFLOW_BASED:
+ return compute_global_motion_disflow_based(type, frm, ref, bit_depth,
+ num_inliers_by_motion,
+ params_by_motion, num_motions);
+ default: assert(0 && "Unknown global motion estimation type");
+ }
+ return 0;
}
diff --git a/libaom/av1/encoder/global_motion.h b/libaom/av1/encoder/global_motion.h
index 42cf221..2cfddad 100644
--- a/libaom/av1/encoder/global_motion.h
+++ b/libaom/av1/encoder/global_motion.h
@@ -22,6 +22,11 @@ extern "C" {
#define RANSAC_NUM_MOTIONS 1
+typedef enum {
+ GLOBAL_MOTION_FEATURE_BASED,
+ GLOBAL_MOTION_DISFLOW_BASED,
+} GlobalMotionEstimationType;
+
void av1_convert_model_to_params(const double *params,
WarpedMotionParams *model);
@@ -56,6 +61,7 @@ int64_t av1_refine_integerized_param(WarpedMotionParams *wm,
*/
int av1_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *frm,
YV12_BUFFER_CONFIG *ref, int bit_depth,
+ GlobalMotionEstimationType gm_estimation_type,
int *num_inliers_by_motion,
double *params_by_motion, int num_motions);
#ifdef __cplusplus
diff --git a/libaom/av1/encoder/gop_structure.c b/libaom/av1/encoder/gop_structure.c
new file mode 100644
index 0000000..73cb0ed
--- /dev/null
+++ b/libaom/av1/encoder/gop_structure.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/onyxc_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+
+// Set parameters for frames between 'start' and 'end' (excluding both).
+static void set_multi_layer_params(GF_GROUP *const gf_group, int start, int end,
+ int *frame_ind, int arf_ind, int level) {
+ assert(level >= MIN_PYRAMID_LVL);
+ const int num_frames_to_process = end - start - 1;
+ assert(num_frames_to_process >= 0);
+ if (num_frames_to_process == 0) return;
+
+ // Either we are at the last level of the pyramid, or we don't have enough
+ // frames between 'start' and 'end' to create one more level.
+ if (level == MIN_PYRAMID_LVL || num_frames_to_process < 3) {
+ // Leaf nodes.
+ while (++start < end) {
+ gf_group->update_type[*frame_ind] = LF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->arf_pos_in_gf[*frame_ind] = 0;
+ gf_group->arf_update_idx[*frame_ind] = arf_ind;
+ gf_group->pyramid_level[*frame_ind] = MIN_PYRAMID_LVL;
+ ++gf_group->pyramid_lvl_nodes[MIN_PYRAMID_LVL];
+ ++(*frame_ind);
+ }
+ } else {
+ const int m = (start + end) / 2;
+ const int arf_pos_in_gf = *frame_ind;
+
+ // Internal ARF.
+ gf_group->update_type[*frame_ind] = INTNL_ARF_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = m - start - 1;
+ gf_group->arf_pos_in_gf[*frame_ind] = 0;
+ gf_group->arf_update_idx[*frame_ind] = 1; // mark all internal ARFs with 1
+ gf_group->pyramid_level[*frame_ind] = level;
+ ++gf_group->pyramid_lvl_nodes[level];
+ ++(*frame_ind);
+
+ // Frames displayed before this internal ARF.
+ set_multi_layer_params(gf_group, start, m, frame_ind, 1, level - 1);
+
+ // Overlay for internal ARF.
+ gf_group->update_type[*frame_ind] = INTNL_OVERLAY_UPDATE;
+ gf_group->arf_src_offset[*frame_ind] = 0;
+ gf_group->arf_pos_in_gf[*frame_ind] = arf_pos_in_gf; // For bit allocation.
+ gf_group->arf_update_idx[*frame_ind] = 1;
+ gf_group->pyramid_level[*frame_ind] = MIN_PYRAMID_LVL;
+ ++(*frame_ind);
+
+ // Frames displayed after this internal ARF.
+ set_multi_layer_params(gf_group, m, end, frame_ind, arf_ind, level - 1);
+ }
+}
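/* Worked trace of the recursion above (assuming MIN_PYRAMID_LVL == 0):
 * for gf_interval = 4 with an ALTREF and pyramid height 2,
 * construct_multi_layer_gf_structure() below produces the coding order
 *   idx 0: KF/GF/OVERLAY        (first_frame_update_type)
 *   idx 1: ARF_UPDATE           (arf_src_offset = 3)
 *   idx 2: INTNL_ARF_UPDATE     (arf_src_offset = 1, level 1)
 *   idx 3: LF_UPDATE            (display frame 1)
 *   idx 4: INTNL_OVERLAY_UPDATE (display frame 2)
 *   idx 5: LF_UPDATE            (display frame 3)
 * with the start frame of the next group configured at index size = 6. */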
+
+static int construct_multi_layer_gf_structure(
+ GF_GROUP *const gf_group, int gf_interval, int pyr_height,
+ FRAME_UPDATE_TYPE first_frame_update_type) {
+ gf_group->pyramid_height = pyr_height;
+ av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL);
+ int frame_index = 0;
+
+ // Keyframe / Overlay frame / Golden frame.
+ assert(gf_interval >= 1);
+ assert(first_frame_update_type == KF_UPDATE ||
+ first_frame_update_type == OVERLAY_UPDATE ||
+ first_frame_update_type == GF_UPDATE);
+ gf_group->update_type[frame_index] = first_frame_update_type;
+ gf_group->arf_src_offset[frame_index] = 0;
+ gf_group->arf_pos_in_gf[frame_index] = 0;
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->pyramid_level[frame_index] = MIN_PYRAMID_LVL;
+ ++frame_index;
+
+ // ALTREF.
+ const int use_altref = (gf_group->pyramid_height > 0);
+ if (use_altref) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->arf_src_offset[frame_index] = gf_interval - 1;
+ gf_group->arf_pos_in_gf[frame_index] = 0;
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->pyramid_level[frame_index] = gf_group->pyramid_height;
+ ++frame_index;
+ }
+
+ // Rest of the frames.
+ const int next_height =
+ use_altref ? gf_group->pyramid_height - 1 : gf_group->pyramid_height;
+ assert(next_height >= MIN_PYRAMID_LVL);
+ set_multi_layer_params(gf_group, 0, gf_interval, &frame_index, 0,
+ next_height);
+ return frame_index;
+}
+
+#define CHECK_GF_PARAMETER 0
+#if CHECK_GF_PARAMETER
+void check_frame_params(GF_GROUP *const gf_group, int gf_interval) {
+ static const char *update_type_strings[FRAME_UPDATE_TYPES] = {
+ "KF_UPDATE", "LF_UPDATE", "GF_UPDATE",
+ "ARF_UPDATE", "OVERLAY_UPDATE", "INTNL_OVERLAY_UPDATE",
+ "INTNL_ARF_UPDATE"
+ };
+ FILE *fid = fopen("GF_PARAMS.txt", "a");
+
+ fprintf(fid, "\ngf_interval = {%d}\n", gf_interval);
+ for (int i = 0; i <= gf_group->size; ++i) {
+ fprintf(fid, "#%2d : %s %d %d %d %d\n", i,
+ update_type_strings[gf_group->update_type[i]],
+ gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
+ gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
+ }
+
+ fprintf(fid, "number of nodes in each level: \n");
+ for (int i = 0; i < gf_group->pyramid_height; ++i) {
+ fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]);
+ }
+ fprintf(fid, "\n");
+ fclose(fid);
+}
+#endif // CHECK_GF_PARAMETER
+
+static INLINE int max_pyramid_height_from_width(int pyramid_width) {
+ if (pyramid_width > 12) return 4;
+ if (pyramid_width > 6) return 3;
+ if (pyramid_width > 3) return 2;
+ if (pyramid_width > 1) return 1;
+ return 0;
+}
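/* i.e. GF intervals of 2-3 frames get 1 pyramid level, 4-6 get 2,
 * 7-12 get 3, and longer intervals are capped at 4 levels. */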
+
+static int get_pyramid_height(const AV1_COMP *const cpi) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ assert(IMPLIES(cpi->oxcf.gf_max_pyr_height == MIN_PYRAMID_LVL,
+ !rc->source_alt_ref_pending)); // define_gf_group() enforced.
+ if (!rc->source_alt_ref_pending) {
+ return MIN_PYRAMID_LVL;
+ }
+ assert(cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+ if (!cpi->internal_altref_allowed) {
+ assert(MIN_PYRAMID_LVL + 1 <= cpi->oxcf.gf_max_pyr_height);
+ return MIN_PYRAMID_LVL + 1;
+ }
+ return AOMMIN(max_pyramid_height_from_width(rc->baseline_gf_interval),
+ cpi->oxcf.gf_max_pyr_height);
+}
+
+void av1_gop_setup_structure(AV1_COMP *cpi,
+ const EncodeFrameParams *const frame_params) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const int key_frame = (frame_params->frame_type == KEY_FRAME);
+ const FRAME_UPDATE_TYPE first_frame_update_type =
+ key_frame ? KF_UPDATE
+ : rc->source_alt_ref_active ? OVERLAY_UPDATE : GF_UPDATE;
+ gf_group->size = construct_multi_layer_gf_structure(
+ gf_group, rc->baseline_gf_interval, get_pyramid_height(cpi),
+ first_frame_update_type);
+
+ // We need to configure the frame one past the end of this group, which
+ // will be the start frame of the next group. Otherwise, prior to the
+ // call to av1_get_second_pass_params(), the data will be undefined.
+ gf_group->update_type[gf_group->size] =
+ (rc->source_alt_ref_pending) ? OVERLAY_UPDATE : GF_UPDATE;
+ gf_group->arf_update_idx[gf_group->size] = 0;
+ gf_group->arf_pos_in_gf[gf_group->size] = 0;
+
+#if CHECK_GF_PARAMETER
+ check_frame_params(gf_group, rc->baseline_gf_interval);
+#endif
+}
diff --git a/libaom/av1/encoder/gop_structure.h b/libaom/av1/encoder/gop_structure.h
new file mode 100644
index 0000000..d9d5ae7
--- /dev/null
+++ b/libaom/av1/encoder/gop_structure.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+#define AOM_AV1_ENCODER_GOP_STRUCTURE_H_
+
+#include "av1/common/onyxc_int.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+// Set up the Group-Of-Pictures structure for this GF_GROUP. This involves
+// deciding where to place the various FRAME_UPDATE_TYPEs in the group. It does
+// this primarily by setting the contents of
+// cpi->twopass.gf_group.update_type[].
+void av1_gop_setup_structure(
+ struct AV1_COMP *cpi, const struct EncodeFrameParams *const frame_params);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_GOP_STRUCTURE_H_
diff --git a/libaom/av1/encoder/hash_motion.c b/libaom/av1/encoder/hash_motion.c
index e85a516..00915e5 100644
--- a/libaom/av1/encoder/hash_motion.c
+++ b/libaom/av1/encoder/hash_motion.c
@@ -147,7 +147,8 @@ static void hash_table_add_to_table(hash_table *p_hash_table,
}
}
-int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value) {
+int32_t av1_hash_table_count(const hash_table *p_hash_table,
+ uint32_t hash_value) {
if (p_hash_table->p_lookup_table[hash_value] == NULL) {
return 0;
} else {
@@ -392,8 +393,9 @@ void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
uint32_t *hash_value1, uint32_t *hash_value2,
int use_highbitdepth, MACROBLOCK *x) {
uint32_t to_hash[4];
- const int add_value = hash_block_size_to_index(block_size) << crc_bits;
+ int add_value = hash_block_size_to_index(block_size);
assert(add_value >= 0);
+ add_value <<= crc_bits;
const int crc_mask = (1 << crc_bits) - 1;
// 2x2 subblock hash values in current CU
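The reordering in av1_get_block_hash_value() above is not cosmetic: the old code shifted before asserting, and left-shifting a negative int (the -1 an index lookup can return for unsupported block sizes) is undefined behavior in C, so the check came too late. A minimal sketch of the fixed pattern, with a hypothetical index function standing in for the encoder's hash_block_size_to_index():

#include <assert.h>

/* Hypothetical stand-in for hash_block_size_to_index(): -1 means the
 * block size is not hashed. */
static int block_size_to_index(int block_size) {
  switch (block_size) {
    case 4: return 0;
    case 8: return 1;
    case 16: return 2;
    default: return -1;
  }
}

static unsigned int hash_prefix(int block_size, int crc_bits) {
  int add_value = block_size_to_index(block_size);
  assert(add_value >= 0); /* Validate before shifting: (-1 << n) is UB. */
  add_value <<= crc_bits;
  return (unsigned int)add_value;
}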
diff --git a/libaom/av1/encoder/hash_motion.h b/libaom/av1/encoder/hash_motion.h
index df3ec32..ed9bb6e 100644
--- a/libaom/av1/encoder/hash_motion.h
+++ b/libaom/av1/encoder/hash_motion.h
@@ -37,7 +37,8 @@ typedef struct _hash_table {
void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x);
void av1_hash_table_destroy(hash_table *p_hash_table);
void av1_hash_table_create(hash_table *p_hash_table);
-int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value);
+int32_t av1_hash_table_count(const hash_table *p_hash_table,
+ uint32_t hash_value);
Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
uint32_t hash_value);
int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
diff --git a/libaom/av1/encoder/level.c b/libaom/av1/encoder/level.c
new file mode 100644
index 0000000..1668bdf
--- /dev/null
+++ b/libaom/av1/encoder/level.c
@@ -0,0 +1,647 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_ports/system_state.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/level.h"
+
+#define UNDEFINED_LEVEL \
+ { \
+ .level = SEQ_LEVEL_MAX, .max_picture_size = 0, .max_h_size = 0, \
+ .max_v_size = 0, .max_display_rate = 0, .max_decode_rate = 0, \
+ .max_header_rate = 0, .main_mbps = 0, .high_mbps = 0, .main_cr = 0, \
+ .high_cr = 0, .max_tiles = 0, .max_tile_cols = 0 \
+ }
+
+static const AV1LevelSpec av1_level_defs[SEQ_LEVELS] = {
+ { .level = SEQ_LEVEL_2_0,
+ .max_picture_size = 147456,
+ .max_h_size = 2048,
+ .max_v_size = 1152,
+ .max_display_rate = 4423680L,
+ .max_decode_rate = 5529600L,
+ .max_header_rate = 150,
+ .main_mbps = 1.5,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 8,
+ .max_tile_cols = 4 },
+ { .level = SEQ_LEVEL_2_1,
+ .max_picture_size = 278784,
+ .max_h_size = 2816,
+ .max_v_size = 1584,
+ .max_display_rate = 8363520L,
+ .max_decode_rate = 10454400L,
+ .max_header_rate = 150,
+ .main_mbps = 3.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 8,
+ .max_tile_cols = 4 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_3_0,
+ .max_picture_size = 665856,
+ .max_h_size = 4352,
+ .max_v_size = 2448,
+ .max_display_rate = 19975680L,
+ .max_decode_rate = 24969600L,
+ .max_header_rate = 150,
+ .main_mbps = 6.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 16,
+ .max_tile_cols = 6 },
+ { .level = SEQ_LEVEL_3_1,
+ .max_picture_size = 1065024,
+ .max_h_size = 5504,
+ .max_v_size = 3096,
+ .max_display_rate = 31950720L,
+ .max_decode_rate = 39938400L,
+ .max_header_rate = 150,
+ .main_mbps = 10.0,
+ .high_mbps = 0,
+ .main_cr = 2.0,
+ .high_cr = 0,
+ .max_tiles = 16,
+ .max_tile_cols = 6 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_4_0,
+ .max_picture_size = 2359296,
+ .max_h_size = 6144,
+ .max_v_size = 3456,
+ .max_display_rate = 70778880L,
+ .max_decode_rate = 77856768L,
+ .max_header_rate = 300,
+ .main_mbps = 12.0,
+ .high_mbps = 30.0,
+ .main_cr = 4.0,
+ .high_cr = 4.0,
+ .max_tiles = 32,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_4_1,
+ .max_picture_size = 2359296,
+ .max_h_size = 6144,
+ .max_v_size = 3456,
+ .max_display_rate = 141557760L,
+ .max_decode_rate = 155713536L,
+ .max_header_rate = 300,
+ .main_mbps = 20.0,
+ .high_mbps = 50.0,
+ .main_cr = 4.0,
+ .high_cr = 4.0,
+ .max_tiles = 32,
+ .max_tile_cols = 8 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ { .level = SEQ_LEVEL_5_0,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 267386880L,
+ .max_decode_rate = 273715200L,
+ .max_header_rate = 300,
+ .main_mbps = 30.0,
+ .high_mbps = 100.0,
+ .main_cr = 6.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_1,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 534773760L,
+ .max_decode_rate = 547430400L,
+ .max_header_rate = 300,
+ .main_mbps = 40.0,
+ .high_mbps = 160.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_2,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1094860800L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_5_3,
+ .max_picture_size = 8912896,
+ .max_h_size = 8192,
+ .max_v_size = 4352,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1176502272L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 64,
+ .max_tile_cols = 8 },
+ { .level = SEQ_LEVEL_6_0,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 1069547520L,
+ .max_decode_rate = 1176502272L,
+ .max_header_rate = 300,
+ .main_mbps = 60.0,
+ .high_mbps = 240.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_1,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 2139095040L,
+ .max_decode_rate = 2189721600L,
+ .max_header_rate = 300,
+ .main_mbps = 100.0,
+ .high_mbps = 480.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_2,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4379443200L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ { .level = SEQ_LEVEL_6_3,
+ .max_picture_size = 35651584,
+ .max_h_size = 16384,
+ .max_v_size = 8704,
+ .max_display_rate = 4278190080L,
+ .max_decode_rate = 4706009088L,
+ .max_header_rate = 300,
+ .main_mbps = 160.0,
+ .high_mbps = 800.0,
+ .main_cr = 8.0,
+ .high_cr = 4.0,
+ .max_tiles = 128,
+ .max_tile_cols = 16 },
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+ UNDEFINED_LEVEL,
+};
+
+typedef enum {
+ LUMA_PIC_SIZE_TOO_LARGE,
+ LUMA_PIC_H_SIZE_TOO_LARGE,
+ LUMA_PIC_V_SIZE_TOO_LARGE,
+ LUMA_PIC_H_SIZE_TOO_SMALL,
+ LUMA_PIC_V_SIZE_TOO_SMALL,
+ TOO_MANY_TILE_COLUMNS,
+ TOO_MANY_TILES,
+ TILE_RATE_TOO_HIGH,
+ TILE_TOO_LARGE,
+ SUPERRES_TILE_WIDTH_TOO_LARGE,
+ CROPPED_TILE_WIDTH_TOO_SMALL,
+ CROPPED_TILE_HEIGHT_TOO_SMALL,
+ TILE_WIDTH_INVALID,
+ FRAME_HEADER_RATE_TOO_HIGH,
+ DISPLAY_RATE_TOO_HIGH,
+ DECODE_RATE_TOO_HIGH,
+ CR_TOO_SMALL,
+
+ TARGET_LEVEL_FAIL_IDS,
+ TARGET_LEVEL_OK,
+} TARGET_LEVEL_FAIL_ID;
+
+static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = {
+ "The picture size is too large.",
+ "The picture width is too large.",
+ "The picture height is too large.",
+ "The picture width is too small.",
+ "The picture height is too small.",
+ "Too many tile columns are used.",
+ "Too many tiles are used.",
+ "The tile rate is too high.",
+ "The tile size is too large.",
+ "The superres tile width is too large.",
+ "The cropped tile width is less than 8.",
+ "The cropped tile height is less than 8.",
+ "The tile width is invalid.",
+ "The frame header rate is too high.",
+ "The display luma sample rate is too high.",
+ "The decoded luma sample rate is too high.",
+ "The compression ratio is too small.",
+};
+
+static double get_min_cr(const AV1LevelSpec *const level_spec, int tier,
+ int is_still_picture, int64_t decoded_sample_rate) {
+ if (is_still_picture) return 0.8;
+ const double min_cr_basis = tier ? level_spec->high_cr : level_spec->main_cr;
+ const double speed_adj =
+ (double)decoded_sample_rate / level_spec->max_display_rate;
+ return AOMMAX(min_cr_basis * speed_adj, 0.8);
+}
+
+static TARGET_LEVEL_FAIL_ID check_level_constraints(
+ const AV1LevelSpec *const target_level_spec,
+ const AV1LevelSpec *const level_spec,
+ const AV1LevelStats *const level_stats, int tier, int is_still_picture) {
+ const double min_cr = get_min_cr(target_level_spec, tier, is_still_picture,
+ level_spec->max_decode_rate);
+ TARGET_LEVEL_FAIL_ID fail_id = TARGET_LEVEL_OK;
+
+ do {
+ if (level_spec->max_picture_size > target_level_spec->max_picture_size) {
+ fail_id = LUMA_PIC_SIZE_TOO_LARGE;
+ break;
+ }
+
+ if (level_spec->max_h_size > target_level_spec->max_h_size) {
+ fail_id = LUMA_PIC_H_SIZE_TOO_LARGE;
+ break;
+ }
+
+ if (level_spec->max_v_size > target_level_spec->max_v_size) {
+ fail_id = LUMA_PIC_V_SIZE_TOO_LARGE;
+ break;
+ }
+
+ if (level_spec->max_tile_cols > target_level_spec->max_tile_cols) {
+ fail_id = TOO_MANY_TILE_COLUMNS;
+ break;
+ }
+
+ if (level_spec->max_tiles > target_level_spec->max_tiles) {
+ fail_id = TOO_MANY_TILES;
+ break;
+ }
+
+ if (level_spec->max_header_rate > target_level_spec->max_header_rate) {
+ fail_id = FRAME_HEADER_RATE_TOO_HIGH;
+ break;
+ }
+
+ if (level_spec->max_display_rate > target_level_spec->max_display_rate) {
+ fail_id = DISPLAY_RATE_TOO_HIGH;
+ break;
+ }
+
+ if (level_spec->max_decode_rate > target_level_spec->max_decode_rate) {
+ fail_id = DECODE_RATE_TOO_HIGH;
+ break;
+ }
+
+ if (level_spec->max_tile_rate > target_level_spec->max_tiles * 120) {
+ fail_id = TILE_RATE_TOO_HIGH;
+ break;
+ }
+
+ if (level_stats->max_tile_size > 4096 * 2304) {
+ fail_id = TILE_TOO_LARGE;
+ break;
+ }
+
+ if (level_stats->max_superres_tile_width > MAX_TILE_WIDTH) {
+ fail_id = SUPERRES_TILE_WIDTH_TOO_LARGE;
+ break;
+ }
+
+ if (level_stats->min_cropped_tile_width < 8) {
+ fail_id = CROPPED_TILE_WIDTH_TOO_SMALL;
+ break;
+ }
+
+ if (level_stats->min_cropped_tile_height < 8) {
+ fail_id = CROPPED_TILE_HEIGHT_TOO_SMALL;
+ break;
+ }
+
+ if (level_stats->min_frame_width < 16) {
+ fail_id = LUMA_PIC_H_SIZE_TOO_SMALL;
+ break;
+ }
+
+ if (level_stats->min_frame_height < 16) {
+ fail_id = LUMA_PIC_V_SIZE_TOO_SMALL;
+ break;
+ }
+
+ if (!level_stats->tile_width_is_valid) {
+ fail_id = TILE_WIDTH_INVALID;
+ break;
+ }
+
+ if (level_stats->min_cr < min_cr) {
+ fail_id = CR_TOO_SMALL;
+ break;
+ }
+ } while (0);
+
+ return fail_id;
+}
+
+static INLINE int is_in_operating_point(int operating_point,
+ int temporal_layer_id,
+ int spatial_layer_id) {
+ if (!operating_point) return 1;
+
+ return ((operating_point >> temporal_layer_id) & 1) &&
+ ((operating_point >> (spatial_layer_id + 8)) & 1);
+}
+
+static void get_tile_stats(const AV1_COMP *const cpi, int *max_tile_size,
+ int *max_superres_tile_width,
+ int *min_cropped_tile_width,
+ int *min_cropped_tile_height,
+ int *tile_width_valid) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int superres_scale_denominator = cm->superres_scale_denominator;
+
+ *max_tile_size = 0;
+ *max_superres_tile_width = 0;
+ *min_cropped_tile_width = INT_MAX;
+ *min_cropped_tile_height = INT_MAX;
+ *tile_width_valid = 1;
+
+ for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ const TileInfo *const tile_info =
+ &cpi->tile_data[tile_row * cm->tile_cols + tile_col].tile_info;
+ const int tile_width =
+ (tile_info->mi_col_end - tile_info->mi_col_start) * MI_SIZE;
+ const int tile_height =
+ (tile_info->mi_row_end - tile_info->mi_row_start) * MI_SIZE;
+ const int tile_size = tile_width * tile_height;
+ *max_tile_size = AOMMAX(*max_tile_size, tile_size);
+
+      const int superres_tile_width =
+          tile_width * superres_scale_denominator / SCALE_NUMERATOR;
+      *max_superres_tile_width =
+          AOMMAX(*max_superres_tile_width, superres_tile_width);
+
+ const int cropped_tile_width =
+ cm->width - tile_info->mi_col_start * MI_SIZE;
+ const int cropped_tile_height =
+ cm->height - tile_info->mi_row_start * MI_SIZE;
+ *min_cropped_tile_width =
+ AOMMIN(*min_cropped_tile_width, cropped_tile_width);
+ *min_cropped_tile_height =
+ AOMMIN(*min_cropped_tile_height, cropped_tile_height);
+
+ const int is_right_most_tile = tile_info->mi_col_end == cm->mi_cols;
+ if (!is_right_most_tile) {
+ if (av1_superres_scaled(cm))
+ *tile_width_valid &= tile_width >= 128;
+ else
+ *tile_width_valid &= tile_width >= 64;
+ }
+ }
+ }
+}
+
+static int store_frame_record(int64_t ts_start, int64_t ts_end, int pic_size,
+ int frame_header_count, int tiles, int show_frame,
+ int show_existing_frame,
+ FrameWindowBuffer *const buffer) {
+ if (buffer->num < FRAME_WINDOW_SIZE) {
+ ++buffer->num;
+ } else {
+ buffer->start = (buffer->start + 1) % FRAME_WINDOW_SIZE;
+ }
+ const int new_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+ FrameRecord *const record = &buffer->buf[new_idx];
+ record->ts_start = ts_start;
+ record->ts_end = ts_end;
+ record->pic_size = pic_size;
+ record->frame_header_count = frame_header_count;
+ record->tiles = tiles;
+ record->show_frame = show_frame;
+ record->show_existing_frame = show_existing_frame;
+
+ return new_idx;
+}
+
+// Count the number of frames encoded in the last "duration" ticks, in display
+// time.
+static int count_frames(const FrameWindowBuffer *const buffer,
+ int64_t duration) {
+ const int current_idx = (buffer->start + buffer->num - 1) % FRAME_WINDOW_SIZE;
+  // Assume the current frame is a shown frame.
+ assert(buffer->buf[current_idx].show_frame);
+
+ const int64_t current_time = buffer->buf[current_idx].ts_end;
+ const int64_t time_limit = AOMMAX(current_time - duration, 0);
+ int num_frames = 1;
+ int index = current_idx - 1;
+ for (int i = buffer->num - 2; i >= 0; --i, --index, ++num_frames) {
+ if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+ const FrameRecord *const record = &buffer->buf[index];
+ if (!record->show_frame) continue;
+ const int64_t ts_start = record->ts_start;
+ if (ts_start < time_limit) break;
+ }
+
+ return num_frames;
+}
+
+// Scan previously encoded frames and update level metrics accordingly.
+static void scan_past_frames(const FrameWindowBuffer *const buffer,
+ int num_frames_to_scan,
+ AV1LevelSpec *const level_spec) {
+ const int num_frames_in_buffer = buffer->num;
+ int index = (buffer->start + num_frames_in_buffer - 1) % FRAME_WINDOW_SIZE;
+ int frame_headers = 0;
+ int tiles = 0;
+ int64_t display_samples = 0;
+ int64_t decoded_samples = 0;
+ for (int i = 0; i < AOMMIN(num_frames_in_buffer, num_frames_to_scan); ++i) {
+ const FrameRecord *const record = &buffer->buf[index];
+ if (!record->show_existing_frame) {
+ frame_headers += record->frame_header_count;
+ decoded_samples += record->pic_size;
+ }
+ if (record->show_frame) {
+ display_samples += record->pic_size;
+ }
+ tiles += record->tiles;
+ --index;
+ if (index < 0) index = FRAME_WINDOW_SIZE - 1;
+ }
+ level_spec->max_header_rate =
+ AOMMAX(level_spec->max_header_rate, frame_headers);
+ level_spec->max_display_rate =
+ AOMMAX(level_spec->max_display_rate, display_samples);
+ level_spec->max_decode_rate =
+ AOMMAX(level_spec->max_decode_rate, decoded_samples);
+ level_spec->max_tile_rate = AOMMAX(level_spec->max_tile_rate, tiles);
+}
+
+void av1_update_level_info(AV1_COMP *cpi, size_t size, int64_t ts_start,
+ int64_t ts_end) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int upscaled_width = cm->superres_upscaled_width;
+ const int width = cm->width;
+ const int height = cm->height;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int tiles = tile_cols * tile_rows;
+ const int luma_pic_size = upscaled_width * height;
+ const int frame_header_count = cpi->frame_header_count;
+ const int show_frame = cm->show_frame;
+ const int show_existing_frame = cm->show_existing_frame;
+
+  // Store the current frame's info into the FrameWindowBuffer.
+ FrameWindowBuffer *const buffer = &cpi->frame_window_buffer;
+ store_frame_record(ts_start, ts_end, luma_pic_size, frame_header_count, tiles,
+ show_frame, show_existing_frame, buffer);
+ // Count the number of frames encoded in the past 1 second.
+ const int encoded_frames_in_last_second =
+ show_frame ? count_frames(buffer, TICKS_PER_SEC) : 0;
+
+ int max_tile_size;
+ int min_cropped_tile_width;
+ int min_cropped_tile_height;
+ int max_superres_tile_width;
+ int tile_width_is_valid;
+ get_tile_stats(cpi, &max_tile_size, &max_superres_tile_width,
+ &min_cropped_tile_width, &min_cropped_tile_height,
+ &tile_width_is_valid);
+
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const BITSTREAM_PROFILE profile = seq_params->profile;
+ const int pic_size_profile_factor =
+ profile == PROFILE_0 ? 15 : (profile == PROFILE_1 ? 30 : 36);
+ const size_t frame_compressed_size = (size > 129 ? size - 128 : 1);
+ const size_t frame_uncompressed_size =
+ (luma_pic_size * pic_size_profile_factor) >> 3;
+
+ aom_clear_system_state();
+ const double compression_ratio =
+ frame_uncompressed_size / (double)frame_compressed_size;
+ const double total_time_encoded =
+ (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+ (double)TICKS_PER_SEC;
+
+ const int temporal_layer_id = cm->temporal_layer_id;
+ const int spatial_layer_id = cm->spatial_layer_id;
+ const int is_still_picture = seq_params->still_picture;
+ // update level_stats
+  // TODO(kyslov@): Fix the implementation according to the buffer model.
+ for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; ++i) {
+ if (!is_in_operating_point(seq_params->operating_point_idc[i],
+ temporal_layer_id, spatial_layer_id)) {
+ continue;
+ }
+
+ AV1LevelInfo *const level_info = &cpi->level_info[i];
+ AV1LevelStats *const level_stats = &level_info->level_stats;
+
+ level_stats->max_tile_size =
+ AOMMAX(level_stats->max_tile_size, max_tile_size);
+ level_stats->max_superres_tile_width =
+ AOMMAX(level_stats->max_superres_tile_width, max_superres_tile_width);
+ level_stats->min_cropped_tile_width =
+ AOMMIN(level_stats->min_cropped_tile_width, min_cropped_tile_width);
+ level_stats->min_cropped_tile_height =
+ AOMMIN(level_stats->min_cropped_tile_height, min_cropped_tile_height);
+ level_stats->tile_width_is_valid &= tile_width_is_valid;
+ level_stats->min_frame_width = AOMMIN(level_stats->min_frame_width, width);
+ level_stats->min_frame_height =
+ AOMMIN(level_stats->min_frame_height, height);
+ level_stats->total_compressed_size += frame_compressed_size;
+ if (show_frame) level_stats->total_time_encoded = total_time_encoded;
+ level_stats->min_cr = AOMMIN(level_stats->min_cr, compression_ratio);
+
+ // update level_spec
+    // TODO(kyslov@): Update all of the spec fields.
+ AV1LevelSpec *const level_spec = &level_info->level_spec;
+ level_spec->max_picture_size =
+ AOMMAX(level_spec->max_picture_size, luma_pic_size);
+ level_spec->max_h_size =
+ AOMMAX(level_spec->max_h_size, cm->superres_upscaled_width);
+ level_spec->max_v_size = AOMMAX(level_spec->max_v_size, height);
+ level_spec->max_tile_cols = AOMMAX(level_spec->max_tile_cols, tile_cols);
+ level_spec->max_tiles = AOMMAX(level_spec->max_tiles, tiles);
+
+ if (show_frame) {
+ scan_past_frames(buffer, encoded_frames_in_last_second, level_spec);
+ }
+
+ // Check whether target level is met.
+ const AV1_LEVEL target_seq_level_idx = cpi->target_seq_level_idx[i];
+ if (target_seq_level_idx < SEQ_LEVELS) {
+ const AV1LevelSpec *const target_level_spec =
+ av1_level_defs + target_seq_level_idx;
+ const int tier = seq_params->tier[i];
+ const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+ target_level_spec, level_spec, level_stats, tier, is_still_picture);
+ if (fail_id != TARGET_LEVEL_OK) {
+ const int target_level_major = 2 + (target_seq_level_idx >> 2);
+ const int target_level_minor = target_seq_level_idx & 3;
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Failed to encode to the target level %d_%d. %s",
+ target_level_major, target_level_minor,
+ level_fail_messages[fail_id]);
+ }
+ }
+ }
+}
+
+aom_codec_err_t av1_get_seq_level_idx(const AV1_COMP *cpi, int *seq_level_idx) {
+ const SequenceHeader *const seq_params = &cpi->common.seq_params;
+ if (!cpi->keep_level_stats) {
+ for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+ seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+ }
+ return AOM_CODEC_OK;
+ }
+
+ const int is_still_picture = seq_params->still_picture;
+ for (int op = 0; op < seq_params->operating_points_cnt_minus_1 + 1; ++op) {
+ seq_level_idx[op] = (int)SEQ_LEVEL_MAX;
+ const int tier = seq_params->tier[op];
+ const AV1LevelInfo *const level_info = &cpi->level_info[op];
+ const AV1LevelStats *const level_stats = &level_info->level_stats;
+ const AV1LevelSpec *const level_spec = &level_info->level_spec;
+ for (int level = 0; level < SEQ_LEVELS; ++level) {
+ const AV1LevelSpec *const target_level_spec = av1_level_defs + level;
+ const TARGET_LEVEL_FAIL_ID fail_id = check_level_constraints(
+ target_level_spec, level_spec, level_stats, tier, is_still_picture);
+ if (fail_id == TARGET_LEVEL_OK) {
+ seq_level_idx[op] = level;
+ break;
+ }
+ }
+ }
+
+ return AOM_CODEC_OK;
+}
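Of the constraints checked above, the compression-ratio floor is one of the few that is not a straight field-to-field comparison: get_min_cr() scales the tier's base ratio by how close the stream runs to the level's maximum display rate, never dropping below 0.8. A self-contained sketch of that computation (the sample-rate constants below are illustrative inputs, not spec values):

#include <stdio.h>

#define AOMMAX(a, b) ((a) > (b) ? (a) : (b))

/* Sketch of get_min_cr() for non-still pictures: tier_cr stands in for
 * main_cr or high_cr from one AV1LevelSpec entry. */
static double min_cr(double tier_cr, long long max_display_rate,
                     long long decoded_sample_rate) {
  const double speed_adj = (double)decoded_sample_rate / max_display_rate;
  return AOMMAX(tier_cr * speed_adj, 0.8);
}

int main(void) {
  /* Decoding at half the level's display rate halves the required
   * compression ratio, subject to the 0.8 floor. */
  printf("min_cr = %.2f\n", min_cr(8.0, 1069547520LL, 534773760LL));
  return 0;
}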
diff --git a/libaom/av1/encoder/level.h b/libaom/av1/encoder/level.h
new file mode 100644
index 0000000..9f1664d
--- /dev/null
+++ b/libaom/av1/encoder/level.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_LEVEL_H_
+#define AOM_AV1_ENCODER_LEVEL_H_
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+
+// AV1 Level Specifications
+typedef struct {
+ AV1_LEVEL level;
+ int max_picture_size;
+ int max_h_size;
+ int max_v_size;
+ int max_header_rate;
+ int max_tile_rate;
+ int max_tiles;
+ int max_tile_cols;
+ int64_t max_display_rate;
+ int64_t max_decode_rate;
+ double main_mbps;
+ double high_mbps;
+ double main_cr;
+ double high_cr;
+} AV1LevelSpec;
+
+typedef struct {
+ int64_t ts_start;
+ int64_t ts_end;
+ int pic_size;
+ int frame_header_count;
+ int tiles;
+ int show_frame;
+ int show_existing_frame;
+} FrameRecord;
+
+// Records frame info in a rolling window.
+#define FRAME_WINDOW_SIZE 256
+typedef struct {
+ FrameRecord buf[FRAME_WINDOW_SIZE];
+ int num; // Number of FrameRecord stored in the buffer.
+ int start; // Buffer index of the first FrameRecord.
+} FrameWindowBuffer;
+
+// Used to keep track of AV1 level stats. Only partially implemented; see the
+// TODOs in av1_update_level_info() in level.c.
+typedef struct {
+ uint64_t total_compressed_size;
+ int max_tile_size;
+ int max_superres_tile_width;
+ int min_cropped_tile_width;
+ int min_cropped_tile_height;
+ int tile_width_is_valid;
+ int min_frame_width;
+ int min_frame_height;
+ double total_time_encoded;
+ double min_cr;
+} AV1LevelStats;
+
+typedef struct {
+ AV1LevelStats level_stats;
+ AV1LevelSpec level_spec;
+} AV1LevelInfo;
+
+void av1_update_level_info(struct AV1_COMP *cpi, size_t size, int64_t ts_start,
+ int64_t ts_end);
+
+// Return sequence level indices in seq_level_idx[MAX_NUM_OPERATING_POINTS].
+aom_codec_err_t av1_get_seq_level_idx(const struct AV1_COMP *cpi,
+ int *seq_level_idx);
+
+#endif // AOM_AV1_ENCODER_LEVEL_H_
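FrameWindowBuffer is a plain ring buffer: start indexes the oldest FrameRecord, num counts what is stored, and the newest slot is always (start + num - 1) % FRAME_WINDOW_SIZE. A minimal sketch of the push logic used by store_frame_record() in level.c, with an int standing in for the record payload:

#define WINDOW_SIZE 256

typedef struct {
  int buf[WINDOW_SIZE]; /* Payload elided; stands in for FrameRecord. */
  int num;              /* Number of records stored. */
  int start;            /* Index of the oldest record. */
} Ring;

/* Returns the slot the new value landed in. Once the window is full,
 * the oldest record is overwritten by advancing start. */
static int ring_push(Ring *rb, int value) {
  if (rb->num < WINDOW_SIZE) {
    ++rb->num;
  } else {
    rb->start = (rb->start + 1) % WINDOW_SIZE;
  }
  const int new_idx = (rb->start + rb->num - 1) % WINDOW_SIZE;
  rb->buf[new_idx] = value;
  return new_idx;
}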
diff --git a/libaom/av1/encoder/lookahead.c b/libaom/av1/encoder/lookahead.c
index 1bf8ecb..f5298f7 100644
--- a/libaom/av1/encoder/lookahead.c
+++ b/libaom/av1/encoder/lookahead.c
@@ -43,7 +43,8 @@ void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
struct lookahead_ctx *av1_lookahead_init(
unsigned int width, unsigned int height, unsigned int subsampling_x,
- unsigned int subsampling_y, int use_highbitdepth, unsigned int depth) {
+ unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+ const int border_in_pixels, int is_scale) {
struct lookahead_ctx *ctx = NULL;
// Clamp the lookahead queue depth
@@ -61,10 +62,19 @@ struct lookahead_ctx *av1_lookahead_init(
ctx->buf = calloc(depth, sizeof(*ctx->buf));
if (!ctx->buf) goto bail;
for (i = 0; i < depth; i++)
- if (aom_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x,
- subsampling_y, use_highbitdepth,
- AOM_BORDER_IN_PIXELS, legacy_byte_alignment))
- goto bail;
+ if (is_scale) {
+ if (aom_alloc_frame_buffer(
+ &ctx->buf[i].img, width, height, subsampling_x, subsampling_y,
+ use_highbitdepth, border_in_pixels, legacy_byte_alignment))
+ goto bail;
+ } else {
+ aom_free_frame_buffer(&ctx->buf[i].img);
+ if (aom_realloc_lookahead_buffer(
+ &ctx->buf[i].img, width, height, subsampling_x, subsampling_y,
+ use_highbitdepth, AOM_ENC_LOOKAHEAD_BORDER,
+ legacy_byte_alignment, NULL, NULL, NULL))
+ goto bail;
+ }
}
return ctx;
bail:
diff --git a/libaom/av1/encoder/lookahead.h b/libaom/av1/encoder/lookahead.h
index e55224c..3b2d94b 100644
--- a/libaom/av1/encoder/lookahead.h
+++ b/libaom/av1/encoder/lookahead.h
@@ -46,7 +46,8 @@ struct lookahead_ctx {
*/
struct lookahead_ctx *av1_lookahead_init(
unsigned int width, unsigned int height, unsigned int subsampling_x,
- unsigned int subsampling_y, int use_highbitdepth, unsigned int depth);
+ unsigned int subsampling_y, int use_highbitdepth, unsigned int depth,
+ const int border_in_pixels, int is_scale);
/**\brief Destroys the lookahead stage
*/
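For callers, the widened signature means two new decisions at init time: how much border padding each lookahead frame carries, and whether frames get the full scaled-buffer allocation (is_scale != 0) or the slimmer lookahead reallocation path. A hedged call-site sketch; the resolution, depth, and border values here are illustrative, not defaults taken from this patch:

/* Hypothetical init for a 1920x1080 4:2:0 8-bit stream with a 35-frame
 * lookahead. With is_scale == 1 the aom_alloc_frame_buffer() path in
 * lookahead.c above is used. */
struct lookahead_ctx *ctx = av1_lookahead_init(
    /*width=*/1920, /*height=*/1080,
    /*subsampling_x=*/1, /*subsampling_y=*/1,
    /*use_highbitdepth=*/0, /*depth=*/35,
    /*border_in_pixels=*/288, /*is_scale=*/1);
if (ctx == NULL) {
  /* Allocation failed; the encoder would abort initialization here. */
}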
diff --git a/libaom/av1/encoder/mbgraph.c b/libaom/av1/encoder/mbgraph.c
index cc50458..0cb6286 100644
--- a/libaom/av1/encoder/mbgraph.c
+++ b/libaom/av1/encoder/mbgraph.c
@@ -71,8 +71,8 @@ static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
xd->mi[0]->mv[0] = x->best_mv;
xd->mi[0]->ref_frame[1] = NONE_FRAME;
- av1_build_inter_predictors_sby(&cpi->common, xd, mb_row, mb_col, NULL,
- BLOCK_16X16);
+ av1_enc_build_inter_predictor(&cpi->common, xd, mb_row, mb_col, NULL,
+ BLOCK_16X16, AOM_PLANE_Y, AOM_PLANE_Y);
/* restore UMV window */
x->mv_limits = tmp_mv_limits;
@@ -364,7 +364,7 @@ static void separate_arf_mbs(AV1_COMP *cpi) {
void av1_update_mbgraph_stats(AV1_COMP *cpi) {
AV1_COMMON *const cm = &cpi->common;
int i, n_frames = av1_lookahead_depth(cpi->lookahead);
- YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+ YV12_BUFFER_CONFIG *golden_ref = &get_ref_frame_buf(cm, GOLDEN_FRAME)->buf;
assert(golden_ref != NULL);
diff --git a/libaom/av1/encoder/mcomp.c b/libaom/av1/encoder/mcomp.c
index 63b4947..f077a4e 100644
--- a/libaom/av1/encoder/mcomp.c
+++ b/libaom/av1/encoder/mcomp.c
@@ -19,6 +19,7 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
#include "av1/common/common.h"
#include "av1/common/mvref_common.h"
@@ -28,6 +29,7 @@
#include "av1/encoder/encoder.h"
#include "av1/encoder/encodemv.h"
#include "av1/encoder/mcomp.h"
+#include "av1/encoder/partition_strategy.h"
#include "av1/encoder/rdopt.h"
#include "av1/encoder/reconinter_enc.h"
@@ -336,7 +338,7 @@ static unsigned int setup_center_error(
int *mvcost[2], unsigned int *sse1, int *distortion) {
unsigned int besterr;
if (second_pred != NULL) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
if (mask) {
@@ -641,7 +643,7 @@ static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm,
int mask_stride, int invert_mask, int w, int h,
unsigned int *sse, int subpel_search) {
unsigned int besterr;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
if (second_pred != NULL) {
@@ -899,7 +901,8 @@ unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x,
unsigned int mse;
unsigned int sse;
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
mse = vfp->vf(dst, dst_stride, src, src_stride, &sse);
mse += mv_err_cost(this_mv, &ref_mv.as_mv, x->nmv_vec_cost, x->mv_cost_stack,
x->errorperbit);
@@ -1797,11 +1800,11 @@ static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x,
MV *mvp_full, int step_param, int sadpb,
int further_steps, int do_refine, int *cost_list,
const aom_variance_fn_ptr_t *fn_ptr,
- const MV *ref_mv) {
+ const MV *ref_mv, const search_site_config *cfg) {
MV temp_mv;
int thissme, n, num00 = 0;
- int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
- step_param, sadpb, &n, fn_ptr, ref_mv);
+ int bestsme = cpi->diamond_search_sad(x, cfg, mvp_full, &temp_mv, step_param,
+ sadpb, &n, fn_ptr, ref_mv);
if (bestsme < INT_MAX)
bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
x->best_mv.as_mv = temp_mv;
@@ -1816,9 +1819,9 @@ static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x,
if (num00) {
num00--;
} else {
- thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
- step_param + n, sadpb, &num00, fn_ptr,
- ref_mv);
+ thissme =
+ cpi->diamond_search_sad(x, cfg, mvp_full, &temp_mv, step_param + n,
+ sadpb, &num00, fn_ptr, ref_mv);
if (thissme < INT_MAX)
thissme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
@@ -2094,11 +2097,222 @@ static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
return is_allowed;
}
+static int vector_match(int16_t *ref, int16_t *src, int bwl) {
+ int best_sad = INT_MAX;
+ int this_sad;
+ int d;
+ int center, offset = 0;
+  int bw = 4 << bwl;  // Redundant with bwl; to be revisited in experiments.
+ for (d = 0; d <= bw; d += 16) {
+ this_sad = aom_vector_var(&ref[d], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ offset = d;
+ }
+ }
+ center = offset;
+
+ for (d = -8; d <= 8; d += 16) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -4; d <= 4; d += 8) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -2; d <= 2; d += 4) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+ offset = center;
+
+ for (d = -1; d <= 1; d += 2) {
+ int this_pos = offset + d;
+ // check limit
+ if (this_pos < 0 || this_pos > bw) continue;
+ this_sad = aom_vector_var(&ref[this_pos], src, bwl);
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ center = this_pos;
+ }
+ }
+
+ return (center - (bw >> 1));
+}
+
+static const MV search_pos[4] = {
+ { -1, 0 },
+ { 0, -1 },
+ { 0, 1 },
+ { 1, 0 },
+};
+
+unsigned int av1_int_pro_motion_estimation(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col, const MV *ref_mv) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mi = xd->mi[0];
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+ DECLARE_ALIGNED(16, int16_t, hbuf[256]);
+ DECLARE_ALIGNED(16, int16_t, vbuf[256]);
+ DECLARE_ALIGNED(16, int16_t, src_hbuf[128]);
+ DECLARE_ALIGNED(16, int16_t, src_vbuf[128]);
+ int idx;
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+ const int search_width = bw << 1;
+ const int search_height = bh << 1;
+ const int src_stride = x->plane[0].src.stride;
+ const int ref_stride = xd->plane[0].pre[0].stride;
+ uint8_t const *ref_buf, *src_buf;
+ MV *tmp_mv = &xd->mi[0]->mv[0].as_mv;
+ unsigned int best_sad, tmp_sad, this_sad[4];
+ MV this_mv;
+ const int norm_factor = 3 + (bw >> 5);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
+ MvLimits subpel_mv_limits;
+
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0];
+ av1_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL,
+ MAX_MB_PLANE);
+ }
+
+ if (xd->bd != 8) {
+ unsigned int sad;
+ tmp_mv->row = 0;
+ tmp_mv->col = 0;
+ sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+ xd->plane[0].pre[0].buf, ref_stride);
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+ return sad;
+ }
+
+ // Set up prediction 1-D reference set
+ ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
+ for (idx = 0; idx < search_width; idx += 16) {
+ aom_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+ ref_buf += 16;
+ }
+
+ ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
+ for (idx = 0; idx < search_height; ++idx) {
+ vbuf[idx] = aom_int_pro_col(ref_buf, bw) >> norm_factor;
+ ref_buf += ref_stride;
+ }
+
+ // Set up src 1-D reference set
+ for (idx = 0; idx < bw; idx += 16) {
+ src_buf = x->plane[0].src.buf + idx;
+ aom_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+ }
+
+ src_buf = x->plane[0].src.buf;
+ for (idx = 0; idx < bh; ++idx) {
+ src_vbuf[idx] = aom_int_pro_col(src_buf, bw) >> norm_factor;
+ src_buf += src_stride;
+ }
+
+ // Find the best match per 1-D search
+ tmp_mv->col = vector_match(hbuf, src_hbuf, mi_size_wide_log2[bsize]);
+ tmp_mv->row = vector_match(vbuf, src_vbuf, mi_size_high_log2[bsize]);
+
+ this_mv = *tmp_mv;
+ src_buf = x->plane[0].src.buf;
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+ best_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+
+ {
+ const uint8_t *const pos[4] = {
+ ref_buf - ref_stride,
+ ref_buf - 1,
+ ref_buf + 1,
+ ref_buf + ref_stride,
+ };
+
+ cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
+ }
+
+ for (idx = 0; idx < 4; ++idx) {
+ if (this_sad[idx] < best_sad) {
+ best_sad = this_sad[idx];
+ tmp_mv->row = search_pos[idx].row + this_mv.row;
+ tmp_mv->col = search_pos[idx].col + this_mv.col;
+ }
+ }
+
+ if (this_sad[0] < this_sad[3])
+ this_mv.row -= 1;
+ else
+ this_mv.row += 1;
+
+ if (this_sad[1] < this_sad[2])
+ this_mv.col -= 1;
+ else
+ this_mv.col += 1;
+
+ ref_buf = xd->plane[0].pre[0].buf + this_mv.row * ref_stride + this_mv.col;
+
+ tmp_sad = cpi->fn_ptr[bsize].sdf(src_buf, src_stride, ref_buf, ref_stride);
+ if (best_sad > tmp_sad) {
+ *tmp_mv = this_mv;
+ best_sad = tmp_sad;
+ }
+
+ tmp_mv->row *= 8;
+ tmp_mv->col *= 8;
+
+ set_subpel_mv_search_range(
+ &x->mv_limits, &subpel_mv_limits.col_min, &subpel_mv_limits.col_max,
+ &subpel_mv_limits.row_min, &subpel_mv_limits.row_max, ref_mv);
+ clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max,
+ subpel_mv_limits.row_min, subpel_mv_limits.row_max);
+
+ if (scaled_ref_frame) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
+ }
+
+ return best_sad;
+}
+
int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
MV *mvp_full, int step_param, int method,
int run_mesh_search, int error_per_bit,
int *cost_list, const MV *ref_mv, int var_max, int rd,
- int x_pos, int y_pos, int intra) {
+ int x_pos, int y_pos, int intra,
+ const search_site_config *cfg) {
const SPEED_FEATURES *const sf = &cpi->sf;
const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
int var = 0;
@@ -2138,7 +2352,7 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
case NSTEP:
var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
MAX_MVSEARCH_STEPS - 1 - step_param, 1,
- cost_list, fn_ptr, ref_mv);
+ cost_list, fn_ptr, ref_mv, cfg);
// Should we allow a follow on exhaustive search?
if (is_exhaustive_allowed(cpi, x)) {
@@ -2209,13 +2423,12 @@ int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
// for the hashMap
hash_table *ref_frame_hash =
- intra
- ? &cpi->common.cur_frame->hash_table
- : av1_get_ref_frame_hash_map(cpi, x->e_mbd.mi[0]->ref_frame[0]);
+ intra ? &cpi->common.cur_frame->hash_table
+ : av1_get_ref_frame_hash_map(&cpi->common,
+ x->e_mbd.mi[0]->ref_frame[0]);
- av1_get_block_hash_value(
- what, what_stride, block_width, &hash_value1, &hash_value2,
- x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, x);
+ av1_get_block_hash_value(what, what_stride, block_width, &hash_value1,
+ &hash_value2, is_cur_buf_hbd(&x->e_mbd), x);
const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
    // For intra, at least one match can always be found: the block itself.
@@ -2334,7 +2547,7 @@ static int upsampled_obmc_pref_error(
unsigned int besterr;
DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred);
aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
@@ -2676,14 +2889,15 @@ static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
MV *mvp_full, int step_param, int sadpb,
int further_steps, int do_refine,
const aom_variance_fn_ptr_t *fn_ptr,
- const MV *ref_mv, MV *dst_mv,
- int is_second) {
+ const MV *ref_mv, MV *dst_mv, int is_second,
+ const search_site_config *cfg) {
+ (void)cpi; // to silence compiler warning
const int32_t *wsrc = x->wsrc_buf;
const int32_t *mask = x->mask_buf;
MV temp_mv;
int thissme, n, num00 = 0;
int bestsme =
- obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full, &temp_mv,
+ obmc_diamond_search_sad(x, cfg, wsrc, mask, mvp_full, &temp_mv,
step_param, sadpb, &n, fn_ptr, ref_mv, is_second);
if (bestsme < INT_MAX)
bestsme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr, 1,
@@ -2700,9 +2914,9 @@ static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
if (num00) {
num00--;
} else {
- thissme = obmc_diamond_search_sad(x, &cpi->ss_cfg, wsrc, mask, mvp_full,
- &temp_mv, step_param + n, sadpb, &num00,
- fn_ptr, ref_mv, is_second);
+ thissme = obmc_diamond_search_sad(x, cfg, wsrc, mask, mvp_full, &temp_mv,
+ step_param + n, sadpb, &num00, fn_ptr,
+ ref_mv, is_second);
if (thissme < INT_MAX)
thissme = get_obmc_mvpred_var(x, wsrc, mask, &temp_mv, ref_mv, fn_ptr,
1, is_second);
@@ -2738,11 +2952,12 @@ int av1_obmc_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
int step_param, int sadpb, int further_steps,
int do_refine,
const aom_variance_fn_ptr_t *fn_ptr,
- const MV *ref_mv, MV *dst_mv, int is_second) {
+ const MV *ref_mv, MV *dst_mv, int is_second,
+ const search_site_config *cfg) {
if (cpi->sf.obmc_full_pixel_search_level == 0) {
return obmc_full_pixel_diamond(cpi, x, mvp_full, step_param, sadpb,
further_steps, do_refine, fn_ptr, ref_mv,
- dst_mv, is_second);
+ dst_mv, is_second, cfg);
} else {
const int32_t *wsrc = x->wsrc_buf;
const int32_t *mask = x->mask_buf;
@@ -2851,3 +3066,119 @@ int av1_return_min_sub_pixel_mv(
lower_mv_precision(bestmv, allow_hp, 0);
return besterr;
}
+
+void av1_simple_motion_search(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int ref,
+ MV ref_mv_full, int num_planes,
+ int use_subpixel) {
+ assert(num_planes == 1 &&
+ "Currently simple_motion_search only supports luma plane");
+ assert(!frame_is_intra_only(&cpi->common) &&
+ "Simple motion search only enabled for non-key frames");
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ mbmi->sb_type = bsize;
+ mbmi->ref_frame[0] = ref;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+ struct buf_2d backup_yv12;
+ // ref_mv is used to code the motion vector. ref_mv_full is the initial point.
+ // ref_mv is in units of 1/8 pel whereas ref_mv_full is in units of pel.
+ MV ref_mv = { 0, 0 };
+ const int step_param = cpi->mv_step_param;
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ const SEARCH_METHODS search_methods = NSTEP;
+ const int do_mesh_search = 0;
+ const int sadpb = x->sadperbit16;
+ int cost_list[5];
+ const int ref_idx = 0;
+ int var;
+
+ av1_setup_pre_planes(xd, ref_idx, yv12, mi_row, mi_col,
+ get_ref_scale_factors(cm, ref), num_planes);
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ if (scaled_ref_frame) {
+ backup_yv12 = xd->plane[AOM_PLANE_Y].pre[ref_idx];
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ // This overwrites the mv_limits so we will need to restore it later.
+ av1_set_mv_search_range(&x->mv_limits, &ref_mv);
+ var = av1_full_pixel_search(
+ cpi, x, bsize, &ref_mv_full, step_param, search_methods, do_mesh_search,
+ sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
+ mi_col * MI_SIZE, mi_row * MI_SIZE, 0, &cpi->ss_cfg[SS_CFG_SRC]);
+ // Restore
+ x->mv_limits = tmp_mv_limits;
+
+ const int use_subpel_search =
+ var < INT_MAX && !cpi->common.cur_frame_force_integer_mv && use_subpixel;
+ if (scaled_ref_frame) {
+ xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+ }
+ if (use_subpel_search) {
+ int not_used = 0;
+ if (cpi->sf.use_accurate_subpel_search) {
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ cpi->find_fractional_mv_step(
+ x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+ x->nmv_vec_cost, x->mv_cost_stack, &not_used, &x->pred_sse[ref], NULL,
+ NULL, 0, 0, pw, ph, cpi->sf.use_accurate_subpel_search, 1);
+ } else {
+ cpi->find_fractional_mv_step(
+ x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+ cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+ x->nmv_vec_cost, x->mv_cost_stack, &not_used, &x->pred_sse[ref], NULL,
+ NULL, 0, 0, 0, 0, 0, 1);
+ }
+ } else {
+ // Manually convert from units of pixel to 1/8-pixels if we are not doing
+ // subpel search
+ x->best_mv.as_mv.row *= 8;
+ x->best_mv.as_mv.col *= 8;
+ }
+
+ mbmi->mv[0].as_mv = x->best_mv.as_mv;
+
+ // Get a copy of the prediction output
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+
+ aom_clear_system_state();
+
+ if (scaled_ref_frame) {
+ xd->plane[AOM_PLANE_Y].pre[ref_idx] = backup_yv12;
+ }
+}
+
+void av1_simple_motion_sse_var(AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ const MV ref_mv_full, int use_subpixel,
+ unsigned int *sse, unsigned int *var) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const MV_REFERENCE_FRAME ref =
+ cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME;
+
+ av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref, ref_mv_full, 1,
+ use_subpixel);
+
+ const uint8_t *src = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *dst = xd->plane[0].dst.buf;
+ const int dst_stride = xd->plane[0].dst.stride;
+
+ *var = cpi->fn_ptr[bsize].vf(src, src_stride, dst, dst_stride, sse);
+}
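The five refinement stages in vector_match() above follow one pattern: after a coarse stride-16 sweep, probe offset +/- radius with the radius halving from 8 down to 1, so the repeated blocks collapse into a loop. A compact sketch with a hypothetical 1-D cost function standing in for aom_vector_var():

#include <limits.h>
#include <stdlib.h>

/* Hypothetical 1-D cost: distance to a known target position. */
static int cost_1d(int pos, int target) { return abs(pos - target); }

/* Coarse stride-16 sweep, then +/-8, +/-4, +/-2, +/-1 refinement around
 * the running best, mirroring the structure of vector_match(). */
static int coarse_to_fine_1d(int bw, int target) {
  int best_cost = INT_MAX;
  int center = 0;
  for (int d = 0; d <= bw; d += 16) {
    const int c = cost_1d(d, target);
    if (c < best_cost) { best_cost = c; center = d; }
  }
  for (int radius = 8; radius >= 1; radius >>= 1) {
    const int offset = center;
    for (int d = -radius; d <= radius; d += 2 * radius) {
      const int pos = offset + d;
      if (pos < 0 || pos > bw) continue; /* Stay inside the search span. */
      const int c = cost_1d(pos, target);
      if (c < best_cost) { best_cost = c; center = pos; }
    }
  }
  return center - (bw >> 1); /* Displacement relative to the block center. */
}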
diff --git a/libaom/av1/encoder/mcomp.h b/libaom/av1/encoder/mcomp.h
index 3f8b3b1..71547da 100644
--- a/libaom/av1/encoder/mcomp.h
+++ b/libaom/av1/encoder/mcomp.h
@@ -13,6 +13,7 @@
#define AOM_AV1_ENCODER_MCOMP_H_
#include "av1/encoder/block.h"
+
#include "aom_dsp/variance.h"
#ifdef __cplusplus
@@ -83,6 +84,11 @@ int av1_refining_search_sad(struct macroblock *x, MV *ref_mv, int sad_per_bit,
int distance, const aom_variance_fn_ptr_t *fn_ptr,
const MV *center_mv);
+unsigned int av1_int_pro_motion_estimation(const struct AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col,
+ const MV *ref_mv);
+
// Runs sequence of diamond searches in smaller steps for RD.
int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
MV *mvp_full, int step_param, int sadpb,
@@ -132,13 +138,15 @@ int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, MV *mvp_full, int step_param,
int method, int run_mesh_search, int error_per_bit,
int *cost_list, const MV *ref_mv, int var_max, int rd,
- int x_pos, int y_pos, int intra);
+ int x_pos, int y_pos, int intra,
+ const search_site_config *cfg);
int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
MV *mvp_full, int step_param, int sadpb,
int further_steps, int do_refine,
const aom_variance_fn_ptr_t *fn_ptr,
- const MV *ref_mv, MV *dst_mv, int is_second);
+ const MV *ref_mv, MV *dst_mv, int is_second,
+ const search_site_config *cfg);
int av1_find_best_obmc_sub_pixel_tree_up(
MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
@@ -154,6 +162,19 @@ unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
int mi_row, int mi_col, int *pts0,
int *pts_inref0, int total_samples);
+// Performs a motion search in SIMPLE_TRANSLATION mode using reference frame
+// ref. Note that this sets the offset of mbmi, so we will need to reset it
+// after calling this function.
+void av1_simple_motion_search(struct AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col, BLOCK_SIZE bsize, int ref,
+ MV ref_mv_full, int num_planes, int use_subpixel);
+
+// Performs a simple motion search to calculate the SSE and variance of the
+// residual.
+void av1_simple_motion_sse_var(struct AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ const MV ref_mv_full, int use_subpixel,
+ unsigned int *sse, unsigned int *var);
+
static INLINE void av1_set_fractional_mv(int_mv *fractional_best_mv) {
for (int z = 0; z < 3; z++) {
fractional_best_mv[z].as_int = INVALID_MV;
diff --git a/libaom/av1/encoder/mips/msa/temporal_filter_msa.c b/libaom/av1/encoder/mips/msa/temporal_filter_msa.c
index 531ae09..effa75b 100644
--- a/libaom/av1/encoder/mips/msa/temporal_filter_msa.c
+++ b/libaom/av1/encoder/mips/msa/temporal_filter_msa.c
@@ -267,6 +267,7 @@ static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
}
}
+// TODO(yunqing): The following optimization is unused since the C code
+// changed.
void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
uint8_t *frame2_ptr, uint32_t blk_w,
uint32_t blk_h, int32_t strength,
diff --git a/libaom/av1/encoder/ml.c b/libaom/av1/encoder/ml.c
index ad664ac..579900a 100644
--- a/libaom/av1/encoder/ml.c
+++ b/libaom/av1/encoder/ml.c
@@ -65,7 +65,9 @@ void av1_nn_softmax(const float *input, float *output, int n) {
for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]);
float sum_out = 0.0f;
for (int i = 0; i < n; i++) {
- output[i] = (float)exp(input[i] - max_inp);
+ // Clamp to range [-10.0, 0.0] to prevent FE_UNDERFLOW errors.
+ const float normalized_input = AOMMAX(input[i] - max_inp, -10.0f);
+ output[i] = (float)exp(normalized_input);
sum_out += output[i];
}
for (int i = 0; i < n; i++) output[i] /= sum_out;
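The softmax fix combines the standard max-subtraction trick (the largest exponent becomes 0, so exp() cannot overflow) with a clamp at -10.0 so tiny logits cannot trip FE_UNDERFLOW either; exp(-10) is roughly 4.5e-5, safely inside normal float range. A standalone sketch of the same hardened pattern:

#include <math.h>

#define AOMMAX(a, b) ((a) > (b) ? (a) : (b))

/* Numerically hardened softmax mirroring av1_nn_softmax() above. */
static void softmax(const float *input, float *output, int n) {
  float max_inp = input[0];
  for (int i = 1; i < n; i++) max_inp = AOMMAX(max_inp, input[i]);
  float sum_out = 0.0f;
  for (int i = 0; i < n; i++) {
    /* Shift so the largest exponent is 0; clamp so the smallest stays
     * well clear of float underflow. */
    const float normalized = AOMMAX(input[i] - max_inp, -10.0f);
    output[i] = (float)exp(normalized);
    sum_out += output[i];
  }
  for (int i = 0; i < n; i++) output[i] /= sum_out;
}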
diff --git a/libaom/av1/encoder/partition_model_weights.h b/libaom/av1/encoder/partition_model_weights.h
index 271764a..b754c88 100644
--- a/libaom/av1/encoder/partition_model_weights.h
+++ b/libaom/av1/encoder/partition_model_weights.h
@@ -2441,145 +2441,20 @@ static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
#undef NUM_NODES
#undef LABEL_SIZE
-#if CONFIG_ONE_PASS_SVM
-#define FEATURE_SIZE 24
-static const float av1_op_svm_early_term_weights_128[FEATURE_SIZE + 1] = {
- -4.5893036051f, 6.9065208136f, -9.1579514692f, 0.1353151366f,
- -1.0271889653f, -0.0020988254f, -0.0094355949f, 0.0040209656f,
- 0.0073014747f, 0.7939705382f, 0.0254545714f, 0.0557559708f,
- -0.0339662064f, -0.0496818300f, 0.3053600283f, 0.3699486845f,
- 0.0848271391f, 0.4091075988f, 0.1196729398f, -0.0038137193f,
- -0.0773495909f, -0.0651630642f, -0.0123704995f, -0.0036697401f,
- -4.1930227095f,
-};
-
-static const float av1_op_svm_early_term_weights_64[FEATURE_SIZE + 1] = {
- -2.7600454480f, 5.6822046712f, -6.7576830133f, 0.1326457117f,
- -1.0541818372f, 0.0107782654f, 0.0050469147f, -0.0021362631f,
- -0.0135151040f, -0.1020115005f, -0.0283409957f, -0.0176311233f,
- 0.0250648204f, 0.0196228570f, 0.5441528594f, 0.2767320141f,
- 0.1261231351f, 0.2998476408f, 0.1336215695f, -0.1107823946f,
- -0.0697279598f, -0.0577520545f, -0.0558441075f, -0.0699750617f,
- -2.6995991503f,
-};
-
-static const float av1_op_svm_early_term_weights_32[FEATURE_SIZE + 1] = {
- -0.8950734172f, 1.3559565008f, -2.6733642653f, 0.2661361319f,
- -0.0314731140f, 0.0044943456f, 0.0006438044f, -0.0029066686f,
- -0.0021903213f, 0.5845049496f, -0.0003629350f, 0.0006982840f,
- 0.0014157386f, -0.0017427528f, 0.7078456733f, 0.1600998068f,
- 0.0933852747f, 0.2822125876f, 0.1923826165f, -0.0905903459f,
- -0.0564717590f, -0.0591007486f, -0.0692268554f, -0.0677411981f,
- -0.7101853206f,
-};
-
-static const float av1_op_svm_early_term_weights_16[FEATURE_SIZE + 1] = {
- -0.1719124013f, -0.3192305362f, -1.1714597182f, 0.4437770294f,
- -0.0042344643f, 0.0000027764f, 0.0018827450f, -0.0015555613f,
- -0.0003250050f, 0.9413693294f, 0.0076188418f, -0.0067870352f,
- 0.0006329246f, -0.0013059613f, 0.8596697254f, 0.0635558018f,
- 0.0447224598f, 0.0915706321f, 0.0741662273f, -0.0269096547f,
- -0.0244610614f, -0.0281113318f, -0.0326108845f, -0.0350908892f,
- -0.0307521675f,
-};
-
-static const float av1_op_svm_early_term_mean_128[FEATURE_SIZE] = {
- 940540.3259649610f, 3988285.5905584921f, 575475302.3545289040f,
- 0.5775348803f, 866.9828469502f, 0.2503762393f,
- 0.2501466215f, 0.2513213770f, 0.2481557622f,
- 521994448.3219169378f, 0.2666920631f, 0.2535864361f,
- 0.2481589186f, 0.2315625823f, 100519.1049708007f,
- 12.1299754840f, 0.8279971004f, 12.6664603305f,
- 0.7313258998f, 935.8233056680f, 0.7436563032f,
- 0.7710055018f, 0.7376516970f, 0.6859818720f,
-};
-
-static const float av1_op_svm_early_term_mean_64[FEATURE_SIZE] = {
- 420419.7529613562f, 839754.4414347620f, 129360420.5256031156f,
- 0.6525652037f, 548.8972009954f, 0.2506918565f,
- 0.2488349076f, 0.2501724146f, 0.2503008213f,
- 113132974.7944754064f, 0.2479344278f, 0.2471446791f,
- 0.2524478512f, 0.2524730419f, 91147.9854189453f,
- 10.9642508460f, 0.8936554428f, 11.3877865621f,
- 0.8307555282f, 752.7787491956f, 0.7243363939f,
- 0.7198362119f, 0.7329432336f, 0.7245090283f,
-};
-
-static const float av1_op_svm_early_term_mean_32[FEATURE_SIZE] = {
- 105111.0236438536f, 184296.0939716828f, 29117017.6751756854f,
- 0.6402298612f, 140.2223339218f, 0.2495860872f,
- 0.2496407600f, 0.2506238629f, 0.2501492900f,
- 24480304.9390618578f, 0.2494442027f, 0.2496080963f,
- 0.2504881563f, 0.2504595447f, 60297.6762059058f,
- 9.4279752138f, 0.9287901132f, 9.6516813792f,
- 0.9009173677f, 591.5406335030f, 0.6944486917f,
- 0.6983941982f, 0.6927236901f, 0.6921613649f,
-};
-
-static const float av1_op_svm_early_term_mean_16[FEATURE_SIZE] = {
- 34080.7994802934f, 44108.1176228864f, 7494288.4946180154f, 0.6240636218f,
- 36.4539515827f, 0.2490867417f, 0.2499231014f, 0.2505361492f,
- 0.2504540077f, 5913397.2957480755f, 0.2487482536f, 0.2495500728f,
- 0.2503693302f, 0.2513323434f, 36574.9686737814f, 7.4345592768f,
- 0.9592429205f, 7.6001764585f, 0.9459867777f, 490.4635033056f,
- 0.6626215237f, 0.6580791886f, 0.6655481064f, 0.6589010119f,
-};
-
-static const float av1_op_svm_early_term_std_128[FEATURE_SIZE] = {
- 2054266.2732957317f, 7550554.6241466375f, 1078688147.1656334400f,
- 0.4939517611f, 1414.3139592985f, 0.1504634077f,
- 0.1515907199f, 0.1590329744f, 0.1515653324f,
- 1006422867.8989596367f, 0.1168668155f, 0.1195725959f,
- 0.1195825693f, 0.1123065533f, 195261.0940245980f,
- 4.5876675121f, 0.3773829648f, 4.8017339769f,
- 0.4432700397f, 973.7532938848f, 0.4790027843f,
- 0.5056275222f, 0.5262278749f, 0.4685586148f,
-};
-
-static const float av1_op_svm_early_term_std_64[FEATURE_SIZE] = {
- 1093636.0522712648f, 1749863.5221569177f, 255168612.8025657237f,
- 0.4761552884f, 1084.7927994662f, 0.1099344646f,
- 0.1100619440f, 0.1090853225f, 0.1115303745f,
- 232084513.1365262568f, 0.0759732385f, 0.0762942913f,
- 0.0785624106f, 0.0779284747f, 185687.9441778057f,
- 4.4371901245f, 0.3082781088f, 4.6670562831f,
- 0.3749677061f, 854.3212307408f, 0.4920531348f,
- 0.5073919158f, 0.5054698298f, 0.4904895620f,
-};
-
-static const float av1_op_svm_early_term_std_32[FEATURE_SIZE] = {
- 238229.7484988807f, 400136.8703966461f, 60267828.4581554681f,
- 0.4799328974f, 268.9377064297f, 0.1122938575f,
- 0.1126479260f, 0.1137018559f, 0.1126389337f,
- 52174139.1477040648f, 0.0715628767f, 0.0720997035f,
- 0.0728961434f, 0.0732065300f, 147785.0049793872f,
- 4.2092341484f, 0.2571751131f, 4.3893075417f,
- 0.2987729310f, 769.0253148602f, 0.5027558039f,
- 0.4982811444f, 0.5092312751f, 0.4991214994f,
-};
-
-static const float av1_op_svm_early_term_std_16[FEATURE_SIZE] = {
- 64177.9527087587f, 103729.9987511119f, 16632490.8146969266f,
- 0.4843637247f, 65.8114470725f, 0.0884226846f,
- 0.0912638659f, 0.0914771167f, 0.0916078800f,
- 13364581.3877149168f, 0.0677468925f, 0.0689631274f,
- 0.0689915367f, 0.0702648469f, 111397.2620676765f,
- 3.7858187888f, 0.1977269328f, 3.9420183951f,
- 0.2260437881f, 717.5336868275f, 0.5017939514f,
- 0.5066633533f, 0.5086806985f, 0.5085585987f,
-};
-
-#undef FEATURE_SIZE
-#endif // CONFIG_ONE_PASS_SVM
+// Below are the models used for simple_motion_search_based_split
+static const float av1_simple_motion_search_based_split_thresh_128 = 2.0f;
+static const float av1_simple_motion_search_based_split_thresh_64 = 2.0f;
+static const float av1_simple_motion_search_based_split_thresh_32 = 2.0f;
+static const float av1_simple_motion_search_based_split_thresh_16 = 2.0f;
+static const float av1_simple_motion_search_based_split_thresh_8 = 2.0f;
-// Below are the models used for full_pixel_motion_search_based_split
// BLOCK_128X128
#define NUM_HIDDEN_LAYERS_128 1
#define NUM_FEATURES_128 6
#define NUM_LAYER_0_UNITS_128 16
#define NUM_LOGITS_128 1
-static const float full_pixel_motion_search_based_split_layer_0_kernel_128[] = {
+static const float av1_simple_motion_search_based_split_layer_0_kernel_128[] = {
-0.807346f, 0.242298f, 12.9862f, -1.19161f, 5.21734f, -1.1363f,
-2.39127f, 0.930915f, -2.44285f, -2.42966f, 5.73476f, 0.0506879f,
-0.234878f, -0.317875f, 0.361322f, 0.431648f, -0.39105f, -0.110225f,
@@ -2598,23 +2473,23 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_128[] = {
0.702545f, -0.612227f, -7.68881f, 9.52225f, -1.18581f, -2.56762f
};
-static const float full_pixel_motion_search_based_split_logits_kernel_128[] = {
+static const float av1_simple_motion_search_based_split_logits_kernel_128[] = {
0.364895f, 0.577553f, 0.115758f, -0.999496f, 0.124885f, 3.23193f,
-0.00386642f, 0.970794f, 0.136637f, -4.28052f, -1.49234f, 0.370436f,
0.576981f, -0.469656f, -0.124071f, 1.07669f
};
-static const float full_pixel_motion_search_based_split_layer_0_bias_128[] = {
+static const float av1_simple_motion_search_based_split_layer_0_bias_128[] = {
1.32916f, 0.817212f, 0.0f, -0.921066f, 0.0f, 3.57649f,
-0.0204517f, 2.97286f, 0.0f, 5.49957f, -8.14518f, 0.0f,
1.30826f, -0.349536f, -0.638933f, 5.4496f
};
-static const float full_pixel_motion_search_based_split_logits_bias_128[] = {
+static const float av1_simple_motion_search_based_split_logits_bias_128[] = {
0.683442f
};
-static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_128 = {
+static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_128 = {
NUM_FEATURES_128,
NUM_LOGITS_128,
NUM_HIDDEN_LAYERS_128,
@@ -2622,17 +2497,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_128 = {
NUM_LAYER_0_UNITS_128,
},
{
- full_pixel_motion_search_based_split_layer_0_kernel_128,
- full_pixel_motion_search_based_split_logits_kernel_128,
+ av1_simple_motion_search_based_split_layer_0_kernel_128,
+ av1_simple_motion_search_based_split_logits_kernel_128,
},
{
- full_pixel_motion_search_based_split_layer_0_bias_128,
- full_pixel_motion_search_based_split_logits_bias_128,
+ av1_simple_motion_search_based_split_layer_0_bias_128,
+ av1_simple_motion_search_based_split_logits_bias_128,
},
};
-static const float full_pixel_motion_search_based_split_thresh_128 = 2.0f;
-
#undef NUM_HIDDEN_LAYERS_128
#undef NUM_FEATURES_128
#undef NUM_LAYER_0_UNITS_128
@@ -2644,7 +2517,7 @@ static const float full_pixel_motion_search_based_split_thresh_128 = 2.0f;
#define NUM_LAYER_0_UNITS_64 16
#define NUM_LOGITS_64 1
-static const float full_pixel_motion_search_based_split_layer_0_kernel_64[] = {
+static const float av1_simple_motion_search_based_split_layer_0_kernel_64[] = {
0.0345945f, -0.394064f, 0.0919978f, 0.270358f, -0.384502f, -0.504608f,
-0.25759f, 0.155981f, 2.62567f, -10.7204f, -0.709802f, 8.15948f,
0.589866f, -0.445645f, -1.68232f, 10.0061f, -3.17671f, 4.87259f,
@@ -2663,23 +2536,23 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_64[] = {
-0.217072f, -0.0984913f, -0.265515f, 0.360021f, 0.0779512f, 0.361516f
};
-static const float full_pixel_motion_search_based_split_logits_kernel_64[] = {
+static const float av1_simple_motion_search_based_split_logits_kernel_64[] = {
0.470821f, 0.474747f, -0.571292f, 0.403221f, 0.628966f, -0.617029f,
0.501105f, 0.499962f, -1.5451f, -0.473518f, -0.730568f, -5.55817f,
0.776761f, 0.42569f, 0.311925f, 0.469968f
};
-static const float full_pixel_motion_search_based_split_layer_0_bias_64[] = {
+static const float av1_simple_motion_search_based_split_layer_0_bias_64[] = {
-0.134085f, 0.0758715f, 1.10419f, 0.0f, -5.75737f, 1.65494f,
0.0f, 3.44047f, 0.394852f, 3.43858f, 3.65871f, -4.84987f,
1.21207f, -1.7705f, -5.46469f, -0.0889634f
};
-static const float full_pixel_motion_search_based_split_logits_bias_64[] = {
+static const float av1_simple_motion_search_based_split_logits_bias_64[] = {
-0.479491f
};
-static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_64 = {
+static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_64 = {
NUM_FEATURES_64,
NUM_LOGITS_64,
NUM_HIDDEN_LAYERS_64,
@@ -2687,17 +2560,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_64 = {
NUM_LAYER_0_UNITS_64,
},
{
- full_pixel_motion_search_based_split_layer_0_kernel_64,
- full_pixel_motion_search_based_split_logits_kernel_64,
+ av1_simple_motion_search_based_split_layer_0_kernel_64,
+ av1_simple_motion_search_based_split_logits_kernel_64,
},
{
- full_pixel_motion_search_based_split_layer_0_bias_64,
- full_pixel_motion_search_based_split_logits_bias_64,
+ av1_simple_motion_search_based_split_layer_0_bias_64,
+ av1_simple_motion_search_based_split_logits_bias_64,
},
};
-static const float full_pixel_motion_search_based_split_thresh_64 = 2.0f;
-
#undef NUM_HIDDEN_LAYERS_64
#undef NUM_FEATURES_64
#undef NUM_LAYER_0_UNITS_64
@@ -2709,7 +2580,7 @@ static const float full_pixel_motion_search_based_split_thresh_64 = 2.0f;
#define NUM_LAYER_0_UNITS_32 16
#define NUM_LOGITS_32 1
-static const float full_pixel_motion_search_based_split_layer_0_kernel_32[] = {
+static const float av1_simple_motion_search_based_split_layer_0_kernel_32[] = {
-1.61796f, 0.0585128f, 1.57904f, 1.52703f, 0.367779f, 0.220434f,
1.66652f, -1.77782f, 6.41118f, 4.16976f, 4.97299f, 4.84111f,
-0.0956536f, -0.163284f, -0.143662f, 0.129329f, 0.449659f, -0.528844f,
@@ -2728,23 +2599,23 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_32[] = {
-1.91327f, -0.0356497f, 1.47611f, 1.27499f, -1.76108f, -0.578954f
};
-static const float full_pixel_motion_search_based_split_logits_kernel_32[] = {
+static const float av1_simple_motion_search_based_split_logits_kernel_32[] = {
-0.220382f, -0.693902f, 0.424827f, 0.379952f, -0.413791f, -0.326785f,
-0.455086f, 0.242402f, 0.307986f, 0.175746f, 0.498901f, -0.628053f,
0.285447f, 0.230052f, 0.415151f, -0.842946f
};
-static const float full_pixel_motion_search_based_split_layer_0_bias_32[] = {
+static const float av1_simple_motion_search_based_split_layer_0_bias_32[] = {
-1.80751f, 6.40356f, -0.0512058f, -4.59163f, -0.369933f, -0.195755f,
-0.16648f, -0.599755f, -5.35975f, -1.21349f, 2.48414f, 1.07096f,
-3.66684f, -6.17761f, 4.2159f, -1.05286f
};
-static const float full_pixel_motion_search_based_split_logits_bias_32[] = {
+static const float av1_simple_motion_search_based_split_logits_bias_32[] = {
-2.58676f
};
-static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_32 = {
+static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_32 = {
NUM_FEATURES_32,
NUM_LOGITS_32,
NUM_HIDDEN_LAYERS_32,
@@ -2752,17 +2623,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_32 = {
NUM_LAYER_0_UNITS_32,
},
{
- full_pixel_motion_search_based_split_layer_0_kernel_32,
- full_pixel_motion_search_based_split_logits_kernel_32,
+ av1_simple_motion_search_based_split_layer_0_kernel_32,
+ av1_simple_motion_search_based_split_logits_kernel_32,
},
{
- full_pixel_motion_search_based_split_layer_0_bias_32,
- full_pixel_motion_search_based_split_logits_bias_32,
+ av1_simple_motion_search_based_split_layer_0_bias_32,
+ av1_simple_motion_search_based_split_logits_bias_32,
},
};
-static const float full_pixel_motion_search_based_split_thresh_32 = 2.0f;
-
#undef NUM_HIDDEN_LAYERS_32
#undef NUM_FEATURES_32
#undef NUM_LAYER_0_UNITS_32
@@ -2774,7 +2643,7 @@ static const float full_pixel_motion_search_based_split_thresh_32 = 2.0f;
#define NUM_LAYER_0_UNITS_16 16
#define NUM_LOGITS_16 1
-static const float full_pixel_motion_search_based_split_layer_0_kernel_16[] = {
+static const float av1_simple_motion_search_based_split_layer_0_kernel_16[] = {
-0.611497f, -0.0422086f, -0.555957f, -0.632451f, -0.144179f, -0.152722f,
-0.330265f, -0.419866f, 0.287343f, 0.385295f, -0.424486f, 0.424281f,
2.27442f, -2.47933f, 5.24731f, 4.33827f, 4.73215f, 3.41909f,
@@ -2793,23 +2662,23 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_16[] = {
0.0333619f, -0.377782f, 0.160767f, -0.128169f, -0.484818f, -0.311973f
};
-static const float full_pixel_motion_search_based_split_logits_kernel_16[] = {
+static const float av1_simple_motion_search_based_split_logits_kernel_16[] = {
-0.132207f, 0.15176f, -0.680086f, 0.605921f, -0.43294f, 0.485811f,
-0.306286f, 0.551368f, 0.413904f, 0.548748f, -0.437391f, 0.560778f,
-0.00685266f, -0.558657f, 0.122127f, 0.260165f
};
-static const float full_pixel_motion_search_based_split_layer_0_bias_16[] = {
+static const float av1_simple_motion_search_based_split_layer_0_bias_16[] = {
-0.200928f, -0.074132f, 8.69963f, -9.00807f, 9.08983f, -6.83586f,
-3.89329f, 10.4881f, -0.0670618f, 0.0f, 9.21614f, 8.41773f,
-0.145851f, 0.0f, -1.43038f, -0.0460311f
};
-static const float full_pixel_motion_search_based_split_logits_bias_16[] = {
+static const float av1_simple_motion_search_based_split_logits_bias_16[] = {
-4.19885f
};
-static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_16 = {
+static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_16 = {
NUM_FEATURES_16,
NUM_LOGITS_16,
NUM_HIDDEN_LAYERS_16,
@@ -2817,17 +2686,15 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_16 = {
NUM_LAYER_0_UNITS_16,
},
{
- full_pixel_motion_search_based_split_layer_0_kernel_16,
- full_pixel_motion_search_based_split_logits_kernel_16,
+ av1_simple_motion_search_based_split_layer_0_kernel_16,
+ av1_simple_motion_search_based_split_logits_kernel_16,
},
{
- full_pixel_motion_search_based_split_layer_0_bias_16,
- full_pixel_motion_search_based_split_logits_bias_16,
+ av1_simple_motion_search_based_split_layer_0_bias_16,
+ av1_simple_motion_search_based_split_logits_bias_16,
},
};
-static const float full_pixel_motion_search_based_split_thresh_16 = 2.0f;
-
#undef NUM_HIDDEN_LAYERS_16
#undef NUM_FEATURES_16
#undef NUM_LAYER_0_UNITS_16
@@ -2840,7 +2707,7 @@ static const float full_pixel_motion_search_based_split_thresh_16 = 2.0f;
#define NUM_LAYER_0_UNITS_8 16
#define NUM_LOGITS_8 1
-static const float full_pixel_motion_search_based_split_layer_0_kernel_8[] = {
+static const float av1_simple_motion_search_based_split_layer_0_kernel_8[] = {
0.0370236f, -0.580211f, 2.0134f, 1.69637f, 2.43181f, -0.521648f,
-0.00375187f, 0.122712f, -4.74411f, 7.36187f, 5.42574f, -5.53557f,
0.0993344f, -0.358843f, 0.0765453f, -0.615987f, -0.754633f, -0.175846f,
@@ -2859,23 +2726,1240 @@ static const float full_pixel_motion_search_based_split_layer_0_kernel_8[] = {
0.616966f, -0.451472f, -0.319365f, 0.00807278f, -0.303261f, -0.351679f
};
-static const float full_pixel_motion_search_based_split_logits_kernel_8[] = {
+static const float av1_simple_motion_search_based_split_logits_kernel_8[] = {
-0.625847f, 0.381323f, 0.342475f, 0.526161f, -0.665965f, -0.515317f,
-0.406218f, 0.568007f, 0.479397f, -0.426116f, 0.615638f, 0.338572f,
0.185583f, 0.308031f, 0.260748f, 0.531619f
};
-static const float full_pixel_motion_search_based_split_layer_0_bias_8[] = {
+static const float av1_simple_motion_search_based_split_layer_0_bias_8[] = {
4.73775f, -1.12658f, -0.258038f, -6.06696f, 1.79131f, 2.49609f,
4.28388f, 0.0f, -4.63598f, 3.06034f, 5.31994f, -0.152142f,
0.514738f, -1.30098f, 3.00296f, -3.83481f
};
-static const float full_pixel_motion_search_based_split_logits_bias_8[] = {
+static const float av1_simple_motion_search_based_split_logits_bias_8[] = {
-3.44508f
};
-static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_8 = {
+static const NN_CONFIG av1_simple_motion_search_based_split_nn_config_8 = {
+ NUM_FEATURES_8,
+ NUM_LOGITS_8,
+ NUM_HIDDEN_LAYERS_8,
+ {
+ NUM_LAYER_0_UNITS_8,
+ },
+ {
+ av1_simple_motion_search_based_split_layer_0_kernel_8,
+ av1_simple_motion_search_based_split_logits_kernel_8,
+ },
+ {
+ av1_simple_motion_search_based_split_layer_0_bias_8,
+ av1_simple_motion_search_based_split_logits_bias_8,
+ },
+};
+
+#endif
+
+// Models based on simple_motion_search
+
+// Thresholds above which only a single partition type is searched
+// TODO(chiyotsai@google.com): Set the thresholds for PARTITION_SPLIT.
+static const float av1_simple_motion_search_prune_part_only_thresh_128[10] = {
+ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+};
+
+static const float av1_simple_motion_search_prune_part_only_thresh_64[10] = {
+ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+};
+
+static const float av1_simple_motion_search_prune_part_only_thresh_32[10] = {
+ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+};
+
+static const float av1_simple_motion_search_prune_part_only_thresh_16[10] = {
+ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+};
+
+static const float av1_simple_motion_search_prune_part_only_thresh_8[10] = {
+ 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+};
+
+// Thresholds for pruning a partition type
+static const float av1_simple_motion_search_prune_part_prune_thresh_128[10] = {
+ 0.0f, 0.0288721601835f, 0.0288721601835f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f
+};
+
+static const float av1_simple_motion_search_prune_part_prune_thresh_64[10] = {
+ 0.0f, 0.0281573780991f, 0.0281573780991f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f
+};
+
+static const float av1_simple_motion_search_prune_part_prune_thresh_32[10] = {
+ 0.0f, 0.0225501403434f, 0.0225501403434f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f
+};
+
+static const float av1_simple_motion_search_prune_part_prune_thresh_16[10] = {
+  0.0f, 0.000961189195907f, 0.000961189195907f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+  0.0f, 0.0f
+};
+
+static const float av1_simple_motion_search_prune_part_prune_thresh_8[10] = {
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f
+};
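+
+// Editor's note: a hedged sketch of how the two threshold families above
+// could act on per-partition scores (the helper name and the score
+// semantics are assumptions, not part of this change). If the scores are
+// softmax probabilities they never exceed 1.0f, so the all-1.0f "only"
+// thresholds are effectively disabled placeholders, consistent with the
+// TODO above.
+static INLINE void hypothetical_apply_part_thresholds(
+    const float scores[10], const float only_thresh[10],
+    const float prune_thresh[10], int prune_part[10], int *only_part) {
+  *only_part = -1;  // -1: no single partition type is forced
+  for (int part = 0; part < 10; ++part) {
+    // Prune any partition type whose score falls below its prune threshold.
+    prune_part[part] = scores[part] < prune_thresh[part];
+    // Search only this partition type when its score clears the threshold.
+    if (scores[part] > only_thresh[part]) *only_part = part;
+  }
+}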
+
+// Mean and std of the 25 input features, per block size
+static const float av1_simple_motion_search_prune_part_mean_128[25] = {
+ 13.292176f, 13.231236f, 11.098058f, 11.049944f, 10.481336f,
+ 10.431587f, 10.789337f, 10.732787f, 10.233817f, 10.173738f,
+ 12.214045f, 12.157505f, 11.863353f, 11.802220f, 12.204053f,
+ 12.152315f, 11.517566f, 11.465651f, 5.383040f, 0.757934f,
+ 4.012611f, 4.052191f, 0.853365f, 3.954503f, 3.944135f,
+};
+
+static const float av1_simple_motion_search_prune_part_std_128[25] = {
+ 2.589217f, 2.559396f, 2.268402f, 2.282274f, 3.341234f, 3.341994f, 3.033007f,
+ 3.041550f, 3.786247f, 3.784053f, 2.523459f, 2.511275f, 3.349364f, 3.340481f,
+ 2.390149f, 2.384226f, 3.599467f, 3.587460f, 2.319911f, 0.428335f, 1.241087f,
+ 1.208679f, 0.353742f, 1.228122f, 1.211777f,
+};
+
+static const float av1_simple_motion_search_prune_part_mean_64[25] = {
+ 11.439831f, 11.382639f, 9.647134f, 9.578121f, 9.146770f,
+ 9.084122f, 8.559063f, 8.499496f, 8.095865f, 8.041795f,
+ 10.547537f, 10.486240f, 9.362147f, 9.308391f, 10.548071f,
+ 10.484358f, 10.002225f, 9.944480f, 4.964504f, 0.897164f,
+ 3.306144f, 3.351039f, 0.928582f, 3.319739f, 3.287726f,
+};
+
+static const float av1_simple_motion_search_prune_part_std_64[25] = {
+ 2.033404f, 2.050657f, 2.064671f, 2.081519f, 2.916312f, 2.914649f, 3.628949f,
+ 3.618760f, 4.011421f, 3.996068f, 2.087127f, 2.103106f, 3.885277f, 3.876166f,
+ 2.035599f, 2.052976f, 3.052501f, 3.050985f, 2.232998f, 0.303745f, 1.111161f,
+ 1.081292f, 0.257521f, 1.112510f, 1.089404f,
+};
+
+static const float av1_simple_motion_search_prune_part_mean_32[25] = {
+ 9.862349f, 9.793658f, 8.043962f, 7.954083f, 8.058867f, 7.966165f, 8.046844f,
+ 7.956817f, 8.061414f, 7.967906f, 8.966450f, 8.890165f, 8.968315f, 8.891513f,
+ 8.953573f, 8.877070f, 8.974275f, 8.895363f, 4.387239f, 0.954143f, 2.701000f,
+ 2.751266f, 0.963302f, 2.716584f, 2.709725f,
+};
+
+static const float av1_simple_motion_search_prune_part_std_32[25] = {
+ 1.971555f, 1.985517f, 1.935986f, 1.944743f, 1.924122f, 1.932169f, 1.943151f,
+ 1.950612f, 1.931156f, 1.938242f, 1.987803f, 1.997670f, 2.000859f, 2.009913f,
+ 1.938270f, 1.949277f, 1.922999f, 1.933145f, 1.991504f, 0.209175f, 0.973824f,
+ 0.952221f, 0.188018f, 0.985295f, 0.946228f,
+};
+
+static const float av1_simple_motion_search_prune_part_mean_16[25] = {
+ 8.391692f, 8.303431f, 6.590342f, 6.459725f, 6.460719f, 6.333274f, 6.592615f,
+ 6.461661f, 6.464787f, 6.337191f, 7.499753f, 7.395166f, 7.503220f, 7.398344f,
+ 7.498312f, 7.395039f, 7.353743f, 7.253139f, 3.874267f, 0.979701f, 2.087404f,
+ 2.131698f, 0.981005f, 2.110868f, 2.106539f,
+};
+
+static const float av1_simple_motion_search_prune_part_std_16[25] = {
+ 1.865867f, 1.870012f, 1.773885f, 1.770447f, 1.972922f, 1.961361f, 1.777224f,
+ 1.772864f, 1.974519f, 1.962281f, 1.831632f, 1.831837f, 1.837595f, 1.837008f,
+ 1.822791f, 1.822053f, 2.074991f, 2.067200f, 1.676261f, 0.141022f, 0.840297f,
+ 0.829935f, 0.136507f, 0.828972f, 0.808563f,
+};
+
+static const float av1_simple_motion_search_prune_part_mean_8[25] = {
+ 6.997798f, 6.867032f, 5.134819f, 4.883330f, 5.134804f, 4.879707f, 5.140518f,
+ 4.886751f, 5.142186f, 4.885262f, 6.069946f, 5.896944f, 6.080442f, 5.906130f,
+ 6.077539f, 5.905929f, 6.083087f, 5.909298f, 3.552709f, 0.990654f, 1.497349f,
+ 1.531762f, 0.989606f, 1.496581f, 1.484139f,
+};
+
+static const float av1_simple_motion_search_prune_part_std_8[25] = {
+ 1.727562f, 1.725050f, 1.633396f, 1.618773f, 1.633586f, 1.620657f, 1.620798f,
+ 1.604892f, 1.621570f, 1.607439f, 1.691024f, 1.684225f, 1.676065f, 1.668442f,
+ 1.680016f, 1.672452f, 1.677775f, 1.671586f, 1.451902f, 0.096223f, 0.751190f,
+ 0.754040f, 0.101419f, 0.738239f, 0.729455f,
+};
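+
+// Editor's note: the mean/std tables above carry one entry per input
+// feature. A minimal sketch of the z-score normalization they imply
+// (the helper name is hypothetical):
+static INLINE void hypothetical_normalize_features(int n, const float *mean,
+                                                   const float *std,
+                                                   float *features) {
+  for (int i = 0; i < n; ++i) features[i] = (features[i] - mean[i]) / std[i];
+}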
+
+#define NUM_HIDDEN_LAYERS_128 1
+#define NUM_FEATURES_128 25
+#define NUM_LAYER_0_UNITS_128 8
+#define NUM_LOGITS_128 4
+
+static const float av1_simple_motion_search_prune_part_logits_kernel_128[] = {
+ -0.129103f, 0.457758f, -0.489986f, 0.65462f, -0.184312f, 3.81202f,
+ -0.444407f, -0.64198f, -0.575008f, 0.0311711f, 0.525243f, -20.892f,
+ 1.08811f, -65.0976f, -12.3973f, -1.38278f, -0.264233f, 0.241636f,
+ -10.6925f, -0.725414f, -18.8987f, -40.2284f, -16.08f, 0.995331f,
+ 1.47614f, -0.964864f, 0.405506f, 0.140449f, 0.459534f, -1.9093f,
+ 0.398452f, 0.696949f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_bias_128[] = {
+ 1.22789f, -1.34527f, 0.759048f, 0.315086f,
+ 1.0834f, -1.58019f, -0.465158f, 1.20716f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_kernel_128[] = {
+ -0.668677f, 0.58694f, -0.417094f, 0.754735f, -0.7859f,
+ 0.377479f, -0.0415929f, -0.0140585f, -0.730001f, 0.747528f,
+ -0.135247f, 0.406505f, -0.234184f, 0.956362f, -0.637555f,
+ 0.791884f, 0.0303722f, 1.04424f, -0.727859f, -0.274321f,
+ -0.122986f, 0.066312f, -0.00559175f, -0.239643f, -0.0188767f,
+ -0.102787f, -0.262967f, 0.071882f, -0.283398f, 0.111607f,
+ -0.425826f, 0.02699f, 0.108873f, -0.180558f, -0.0794057f,
+ 0.29665f, -0.0252969f, -0.0266213f, -0.277462f, -0.361973f,
+ 0.512552f, 0.395011f, -0.225876f, 0.301924f, 0.136954f,
+ 0.507259f, 1.23425f, 0.0137135f, 0.662572f, 0.591583f,
+ 0.101564f, 0.416805f, -0.645081f, -0.179086f, -0.36747f,
+ -0.332213f, 0.095177f, 0.220739f, -0.153256f, 0.706155f,
+ 0.161701f, 0.696815f, -1.21531f, -0.115059f, 0.486764f,
+ -0.396093f, 0.784883f, 0.535357f, -0.278021f, 0.143496f,
+ -0.44931f, -0.144543f, 0.319326f, 0.0190167f, -0.206295f,
+ 0.373995f, -0.247897f, -0.608095f, -0.41796f, -0.137129f,
+ -0.709562f, 0.678273f, 0.537607f, 0.557474f, 0.453308f,
+ 0.21405f, -0.0466495f, 0.519139f, -0.168832f, 0.902911f,
+ 0.681131f, -0.139876f, -0.2052f, -0.393271f, 0.262222f,
+ -0.246246f, -0.213993f, 0.646619f, 0.0496181f, -0.00354157f,
+ 0.822927f, 0.0939522f, 0.180738f, 0.118355f, 0.120456f,
+ -0.0472214f, -0.144958f, 0.173405f, -0.886644f, -0.0949769f,
+ -0.813518f, -0.3947f, -0.128021f, 0.356196f, 0.469169f,
+ -0.413702f, 1.04242f, 0.428853f, -0.387293f, 0.0850877f,
+ 0.279409f, -0.142276f, 0.0579376f, 0.211112f, 0.0703013f,
+ -1.9274f, -0.729147f, 0.534193f, 0.773586f, 0.922864f,
+ 0.642881f, 1.15127f, 0.621032f, 0.933942f, 1.01837f,
+ -0.660282f, -0.40059f, -1.11279f, -0.77088f, -0.43349f,
+ 0.202361f, -0.0840912f, 0.0935707f, 0.056333f, -0.0779369f,
+ 0.0173447f, -0.0104756f, 0.0115005f, -0.0195593f, 0.03592f,
+ -0.343454f, -0.618048f, 0.258172f, -0.412322f, -0.0463746f,
+ -0.0413654f, -0.0400194f, 0.615981f, -0.452094f, 0.644555f,
+ 0.0822476f, -0.359791f, -0.0904274f, 0.209427f, 0.0116338f,
+ -0.190978f, 0.890233f, 0.737769f, -1.66663f, -0.392605f,
+ 0.0785728f, -0.224553f, -0.128258f, -0.227227f, -0.0777773f,
+ 0.685976f, 0.347042f, -0.555325f, -0.249221f, 0.0919837f,
+ -0.0660016f, -0.272316f, 0.0390632f, -0.619624f, -0.0565801f,
+ 0.585026f, 0.597375f, 0.54114f, 0.593389f, 0.604391f,
+ 0.0820294f, -0.85339f, -1.40741f, -0.391675f, 0.0579205f,
+ -0.197626f, 0.130044f, -0.234488f, -0.0373991f, -0.0717973f
+};
+
+static const float av1_simple_motion_search_prune_part_logits_bias_128[] = {
+ 1.58571f, -4.6314f, -2.00273f, 0.543699f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_128 = {
+ NUM_FEATURES_128,
+ NUM_LOGITS_128,
+ NUM_HIDDEN_LAYERS_128,
+ {
+ NUM_LAYER_0_UNITS_128,
+ },
+ {
+ av1_simple_motion_search_prune_part_layer_0_kernel_128,
+ av1_simple_motion_search_prune_part_logits_kernel_128,
+ },
+ {
+ av1_simple_motion_search_prune_part_layer_0_bias_128,
+ av1_simple_motion_search_prune_part_logits_bias_128,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_128
+#undef NUM_FEATURES_128
+#undef NUM_LAYER_0_UNITS_128
+#undef NUM_LOGITS_128
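+
+// Editor's note: a minimal end-to-end sketch (not part of this change)
+// tying the tables above together for the 128x128 prune-part model:
+// normalize the 25 raw features, run the net, then softmax the 4 logits.
+// It assumes expf() from <math.h> and av1_nn_predict() from
+// av1/encoder/ml.h; the function name is hypothetical.
+static INLINE void hypothetical_prune_part_probs_128(const float *raw_features,
+                                                     float probs[4]) {
+  float features[25];
+  for (int i = 0; i < 25; ++i) {
+    features[i] = (raw_features[i] -
+                   av1_simple_motion_search_prune_part_mean_128[i]) /
+                  av1_simple_motion_search_prune_part_std_128[i];
+  }
+  float logits[4];
+  av1_nn_predict(features, &av1_simple_motion_search_prune_part_nn_config_128,
+                 logits);
+  // Numerically stable softmax over the logits.
+  float max_logit = logits[0];
+  for (int i = 1; i < 4; ++i) {
+    if (logits[i] > max_logit) max_logit = logits[i];
+  }
+  float sum = 0.0f;
+  for (int i = 0; i < 4; ++i) {
+    probs[i] = expf(logits[i] - max_logit);
+    sum += probs[i];
+  }
+  for (int i = 0; i < 4; ++i) probs[i] /= sum;
+}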
+
+#define NUM_HIDDEN_LAYERS_64 1
+#define NUM_FEATURES_64 25
+#define NUM_LAYER_0_UNITS_64 32
+#define NUM_LOGITS_64 10
+
+static const float av1_simple_motion_search_prune_part_logits_kernel_64[] = {
+ 0.10424f, -0.346025f, 0.534547f, -0.385925f, 2.58341f, -0.256414f,
+ -0.232498f, 0.329823f, -0.0777376f, -0.590939f, 0.062657f, -0.628252f,
+ 0.0934588f, 2.04029f, -0.224448f, 0.371168f, -0.385348f, -0.589883f,
+ -3.73627f, -0.943144f, 0.346409f, -0.211215f, -0.351008f, 0.418807f,
+ 0.943663f, 0.173267f, 1.16585f, -0.0840888f, 0.227464f, 0.374412f,
+ 0.0422597f, -0.338868f, 0.222576f, 0.431713f, 1.12366f, 0.00753411f,
+ 0.248412f, -0.0902425f, 0.542455f, -0.665629f, -0.311245f, -0.205639f,
+ -0.447149f, -0.0502733f, -0.290186f, -0.794384f, 0.0940881f, -0.0686117f,
+ -0.0199961f, -0.587965f, 0.777096f, -0.083381f, -1.21282f, 0.652959f,
+ -1.18238f, 0.539991f, 0.352497f, -0.540076f, -0.26222f, -0.568556f,
+ 0.409102f, -0.131146f, -0.407161f, -0.188287f, -0.478657f, 0.000401932f,
+ -0.689324f, 0.351064f, -1.43704f, -0.315185f, -0.868726f, 0.376341f,
+ -0.0566277f, 0.364831f, 0.611298f, -0.495253f, -0.0193132f, 0.617978f,
+ 0.189586f, -0.236758f, -0.608246f, -0.149017f, -1.78303f, 0.143023f,
+ 0.698386f, -0.994086f, -0.673327f, 0.233868f, 0.360425f, 0.0294123f,
+ -0.248683f, -0.148392f, 0.0861829f, -0.190843f, -0.414906f, 0.607378f,
+ -0.756715f, -0.511713f, -0.321556f, 1.0078f, -1.18141f, 0.519751f,
+ 0.834629f, -0.359343f, 0.612262f, -0.0730553f, 0.262935f, 0.488276f,
+ 0.387071f, -1.44123f, 1.08269f, 0.554402f, -0.069f, 0.14113f,
+ 0.323817f, 0.824314f, -0.431417f, -0.349448f, 0.950728f, -0.587836f,
+ -0.83914f, -0.10844f, 0.26602f, 0.831933f, -0.271315f, 0.231563f,
+ 0.417049f, 0.190627f, -0.0940667f, 0.255363f, -0.0741022f, -0.0987662f,
+ -0.847522f, 0.00287554f, 0.0615741f, -0.0832218f, 0.0847148f, -0.392843f,
+ -0.938068f, -0.10621f, -0.260859f, -0.825175f, -0.401039f, 0.315213f,
+ -0.108269f, 0.288036f, -8.66166f, -0.970752f, -0.66678f, -0.593405f,
+ -0.518294f, -0.138722f, -0.454698f, -0.22969f, -0.553006f, -0.440111f,
+ 0.462661f, -0.536854f, 0.0108295f, -0.522888f, 0.00111157f, 0.229999f,
+ 0.0267768f, 0.176266f, -1.57043f, 0.0318106f, 0.257534f, -0.198583f,
+ 0.175564f, -0.251465f, -0.262441f, -1.65283f, -0.319603f, -0.875282f,
+ -0.301303f, 0.0170948f, -0.227075f, 0.0299545f, -4.98346f, 0.470046f,
+ -1.28051f, -0.213809f, -0.486585f, -0.906463f, -0.169984f, -0.333153f,
+ -0.376733f, 0.108016f, 0.486744f, -0.186936f, -0.429259f, 0.056501f,
+ -0.266545f, 0.265447f, -0.137718f, -0.490687f, -0.935668f, -0.16229f,
+ -0.696932f, 0.173157f, 0.434959f, -0.140595f, 0.345845f, -1.08013f,
+ -0.0205929f, -0.815874f, -0.179812f, 0.02767f, -0.141727f, 0.471936f,
+ -7.29453f, -1.04362f, -0.745482f, -0.28725f, -0.214997f, -0.0850651f,
+ -0.748471f, 0.161325f, -1.04387f, -0.705305f, 0.489427f, -0.765373f,
+ -0.301576f, 0.0742467f, -0.331282f, 0.0372328f, -0.90298f, -0.0608646f,
+ -2.18756f, 0.170384f, -0.258357f, 0.106287f, -0.161684f, -0.103799f,
+ -0.127774f, -0.156313f, 0.0705286f, -0.977908f, -0.281191f, -0.056757f,
+ -0.309474f, 0.050476f, -9.78198f, -2.42795f, -0.289626f, -1.07579f,
+ -0.439256f, -1.09948f, -0.564671f, 0.0913182f, -0.417216f, -1.19909f,
+ 0.287063f, 0.402315f, -0.17646f, 0.540488f, 0.00840239f, 0.397492f,
+ 0.702393f, -0.10566f, 0.655296f, -0.0443876f, 0.154918f, -0.760479f,
+ -0.0523153f, -0.366199f, -1.08212f, -0.398556f, -0.415203f, -1.10488f,
+ 0.208349f, 0.27079f, 0.101546f, -0.205752f, -13.7923f, -0.218637f,
+ -1.10077f, 0.355735f, -0.306196f, 0.627434f, -0.473101f, -0.308027f,
+ -1.12724f, 0.301597f, 0.660785f, 0.0576217f, -0.155925f, -0.56107f,
+ -0.223537f, 0.114299f, -0.53803f, -0.252674f, -2.66103f, -0.185245f,
+ -0.314673f, 0.403337f, 0.679821f, -0.69231f, 0.506264f, -0.999705f,
+ -0.549097f, 0.353745f, 0.188249f, 0.414484f, -0.615853f, 0.525681f,
+ -5.23065f, -3.05174f, 1.02074f, -0.965499f, -0.158947f, 0.0436088f,
+ -0.485824f, 0.0375094f, -1.39985f, -0.481392f, 0.485785f, -0.24874f,
+ -0.359633f, 0.668108f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_bias_64[] = {
+ 0.0735592f, -0.045064f, -0.0114103f, 1.39246f, -0.683467f, 0.155765f,
+ -0.667652f, -0.202425f, -0.585433f, -0.146752f, -0.0812931f, 0.580642f,
+ 0.578542f, -0.831916f, 0.610063f, 0.0101856f, -0.235863f, 0.538141f,
+ -2.91334f, -1.71887f, 0.126616f, 0.582497f, -0.438879f, 0.221833f,
+ 0.850773f, -0.280886f, 0.443233f, -0.0964873f, -0.216161f, 0.34413f,
+ 0.656818f, 0.0169274f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_kernel_64[] = {
+ -0.310947f, -0.232675f, 0.0171092f, 0.0834474f, 0.373977f,
+ 0.300429f, 0.215072f, -0.454074f, 0.187565f, 0.282742f,
+ 0.562562f, -0.0419322f, 0.000978486f, -0.298267f, 0.216934f,
+ -0.388722f, -0.146866f, -0.275946f, 0.202361f, 0.225847f,
+ 1.42868f, 0.473127f, -0.145747f, -0.104986f, 0.153459f,
+ 0.69382f, 0.162266f, 0.0207715f, -0.45095f, -0.412071f,
+ -0.235109f, -0.130199f, 0.231741f, 0.460193f, 0.0378202f,
+ 0.429516f, 0.387691f, -0.272479f, 0.0723884f, -0.453914f,
+ -0.150618f, -0.10745f, -0.258615f, 0.0838312f, -0.00554958f,
+ 0.105377f, -0.0415479f, 0.13228f, 1.09044f, -0.73053f,
+ -0.422553f, -0.435842f, 0.211416f, 0.420332f, 0.0181353f,
+ -0.030891f, 0.522788f, 0.613526f, 0.374032f, 0.287986f,
+ -0.403118f, -0.287362f, -1.11523f, -0.577713f, -0.020228f,
+ 0.86465f, -0.0590579f, 0.341274f, -0.0115644f, -0.260236f,
+ 0.192123f, -0.0849825f, 0.0501709f, 0.444382f, 0.0762727f,
+ 0.0926596f, -0.101157f, -0.142787f, 0.40861f, 0.555805f,
+ -0.00614654f, -0.122846f, 0.203163f, 0.234266f, 0.409795f,
+ -0.0206245f, -0.224679f, 0.025081f, 0.518044f, -0.287186f,
+ 0.016494f, -0.0886331f, 0.236438f, -1.01032f, 0.118332f,
+ 0.364217f, 0.061438f, 0.0381303f, 0.128418f, 0.0257077f,
+ -0.975751f, -0.694894f, 0.00351914f, 0.278179f, 0.29363f,
+ 0.525576f, 0.0604849f, 0.531734f, 0.406643f, 0.812497f,
+ -0.403196f, -0.16664f, -0.620887f, -0.428194f, 0.275401f,
+ 0.432063f, -0.00378342f, 0.295758f, 0.105615f, -0.00683626f,
+ 0.00396146f, 0.00598654f, -0.0131701f, -0.0115787f, 0.00386643f,
+ -0.69686f, -0.139623f, -0.440817f, 0.0542873f, 0.217962f,
+ 0.527035f, -0.0201046f, 0.0471354f, 0.0271858f, -0.0775197f,
+ -0.309797f, 0.184879f, -0.232854f, -0.407081f, 0.706227f,
+ -0.0877534f, 0.306843f, 0.455075f, -0.333961f, 0.0759148f,
+ 0.0444791f, -0.0693626f, -0.0850289f, -0.513063f, -0.643971f,
+ -0.630279f, -0.153889f, 0.123315f, 0.00548238f, 0.170707f,
+ 0.734339f, -0.176988f, 0.322519f, 0.178365f, 0.183519f,
+ -0.698683f, -0.12043f, -0.349914f, -0.0696762f, -0.53986f,
+ -0.104738f, 1.05264f, 0.983568f, -0.109035f, 0.0113748f,
+ 0.0815189f, -0.0628812f, 0.0769389f, 0.010261f, 0.146573f,
+ -0.433194f, -0.211572f, -0.000397392f, 0.445325f, 0.145091f,
+ -0.0625902f, 0.29394f, 0.302315f, 0.0892226f, -0.209504f,
+ -0.0150374f, 0.242608f, 0.216223f, 0.366857f, 0.209829f,
+ -0.540035f, 0.117599f, -0.329315f, 0.0471133f, -0.0115449f,
+ -0.0638235f, 0.0527461f, 0.348149f, 0.360802f, 1.06624f,
+ -0.615991f, -0.341396f, 0.18972f, 0.0709888f, -0.0414466f,
+ -0.0193809f, 0.0938933f, 0.209058f, 0.575042f, 0.483608f,
+ -0.285875f, -0.115905f, -0.363637f, 0.375425f, 0.336217f,
+ 0.0336358f, -0.00265618f, -0.406854f, -0.792959f, -0.219354f,
+ 0.0331615f, 0.0298859f, -0.211446f, -0.00280773f, -0.194011f,
+ 0.262109f, 0.548076f, 0.120183f, -0.661603f, 0.241855f,
+ -0.501428f, 0.00102718f, -0.347331f, -0.58306f, 0.0977254f,
+ 0.117491f, 0.0840667f, 0.00693675f, 0.000600294f, 0.649569f,
+ -0.0553811f, -0.197198f, 0.397236f, -0.523737f, -0.564192f,
+ -0.374679f, -0.249344f, 0.00861428f, 0.00393439f, -0.0834608f,
+ 0.124389f, -0.0393049f, 0.0425391f, -0.153383f, -0.182346f,
+ 0.420953f, 0.464221f, 0.288984f, 0.570921f, -0.239965f,
+ 0.247239f, -0.083434f, 0.714418f, 0.986323f, -0.460244f,
+ -0.260993f, -0.947743f, -1.0789f, -0.0391231f, 0.612407f,
+ -0.0306767f, 0.281419f, 0.0072426f, -0.37623f, 0.188744f,
+ 0.221666f, -0.424914f, 0.29703f, 0.261715f, 0.277809f,
+ -0.0617616f, -0.000611999f, -0.0547053f, -0.0901018f, -0.347669f,
+ 0.856072f, 0.596675f, -0.467639f, -1.09324f, -0.184224f,
+ -0.56051f, -0.0144704f, 0.102894f, -0.122982f, -0.0020749f,
+ -0.0423487f, 0.0328702f, -0.0154263f, 0.0349021f, -0.00315595f,
+ 0.0254802f, -0.729191f, 0.207296f, -0.0212349f, -0.207078f,
+ 0.20636f, -0.156883f, 0.429765f, -0.42672f, 0.138775f,
+ -0.0267343f, 0.631528f, 0.300646f, -0.4793f, -0.273833f,
+ -0.0135367f, -0.530819f, -0.534881f, 0.830896f, 0.0266992f,
+ 0.473744f, 0.210334f, 0.0234739f, 0.255394f, 0.123531f,
+ -0.489341f, -0.796627f, 0.372617f, 0.190136f, 0.275342f,
+ 0.739505f, 0.402354f, 0.782806f, 0.437374f, 1.04948f,
+ -0.55963f, 0.382704f, -0.698321f, 0.0817868f, -0.440108f,
+ -0.0635004f, -0.277851f, -0.524194f, 0.286157f, -0.01097f,
+ -0.0293145f, -0.0405071f, -0.035662f, -0.012871f, -0.0516409f,
+ -0.406671f, 0.709259f, -0.525177f, 0.521123f, -0.44813f,
+ 0.48412f, -0.0546513f, 0.305253f, -0.468328f, 0.316453f,
+ -0.36307f, 0.497515f, -0.0606276f, 0.315764f, -0.422066f,
+ 0.554025f, -0.679183f, 0.616914f, 0.00283324f, -0.000643824f,
+ 0.0639999f, 0.0488285f, -0.141031f, 0.068003f, -0.0792678f,
+ -0.425307f, -0.152235f, 0.269917f, -0.352327f, 0.44792f,
+ -0.116514f, -0.465868f, 0.154287f, 0.0161028f, -0.16848f,
+ -0.255487f, 0.189832f, 0.254883f, 0.0240822f, 0.432638f,
+ -0.136564f, 0.137036f, 0.0375734f, 0.989246f, -0.126287f,
+ 0.111416f, -0.0271002f, 0.718755f, -0.0412969f, 0.00645681f,
+ 0.253811f, -0.0186998f, 0.691971f, -0.282042f, -0.0783915f,
+ 0.274592f, -0.358449f, 0.34155f, -0.186374f, -0.136907f,
+ -0.192334f, -0.251168f, -0.100874f, -0.166578f, -0.336507f,
+ 0.402373f, 0.173695f, 0.108788f, 0.00885581f, -0.310063f,
+ 1.05545f, 0.0295867f, 0.180785f, -0.173469f, -0.469924f,
+ -0.224155f, 0.665862f, -0.126546f, 0.240691f, -0.0415301f,
+ -0.598534f, 0.0012723f, -0.122297f, -0.558947f, 0.268844f,
+ 0.241193f, 0.0524422f, -0.1683f, 0.575588f, -0.139012f,
+ 0.0636691f, -0.446709f, -0.094532f, 0.883809f, -0.112981f,
+ -0.224047f, 0.0811193f, -0.140571f, -0.09683f, -0.0796143f,
+ -0.102246f, -0.863392f, -0.0755124f, 0.23125f, -0.0301361f,
+ -0.153029f, -0.172238f, -0.0286382f, -0.338495f, -0.317216f,
+ -0.146629f, -0.242264f, -0.702306f, -0.285052f, 0.0623479f,
+ 0.265735f, 0.00674475f, 0.666196f, 0.883586f, 0.278416f,
+ -0.341692f, -0.509931f, -0.156263f, 0.635885f, -0.544143f,
+ -0.572632f, -0.213285f, 0.443396f, -0.268329f, 0.0638439f,
+ -0.185397f, 0.071126f, 0.386503f, -0.402212f, -0.140784f,
+ -0.411661f, 0.049398f, -0.0672907f, -0.267034f, -0.0560875f,
+ 0.0607937f, 0.0445484f, -0.547651f, 0.574718f, 0.417189f,
+ -0.0610166f, 0.0632293f, 0.391619f, -0.00671215f, -0.136883f,
+ -0.339346f, 0.0356183f, 0.511993f, 0.178676f, 0.286998f,
+ 0.136511f, -0.00796929f, 0.203985f, 0.0423532f, -0.175196f,
+ 0.378534f, 0.770417f, 0.593778f, 0.0256067f, -0.82394f,
+ -0.500691f, -0.425725f, -0.623708f, -0.0406241f, -0.00226464f,
+ 0.0207836f, 0.30732f, -0.00784268f, 0.0065445f, -0.0991039f,
+ -0.20871f, -0.206835f, 0.281219f, 0.119361f, 0.259346f,
+ -0.102713f, 0.186488f, -0.034455f, -0.00198392f, -0.279107f,
+ -0.638993f, -0.374404f, -0.48601f, -0.262345f, 0.624532f,
+ 0.620632f, -0.227014f, 0.433579f, -0.0455096f, 1.22123f,
+ -0.429156f, 0.12396f, 0.0815152f, -0.0837355f, 0.0282623f,
+ -0.407475f, 0.787321f, -0.434974f, 0.312904f, -0.230805f,
+ 0.213042f, -0.250929f, 0.302997f, -0.354709f, 0.0504905f,
+ -0.561706f, 0.595558f, 0.374951f, 0.802969f, -0.674902f,
+ 0.33136f, 0.156606f, 0.0218968f, -0.694188f, -0.0221949f,
+ -0.00639123f, 0.0146536f, 0.0104145f, 0.021635f, -0.0499428f,
+ -0.575116f, -0.239035f, -0.0588276f, 0.599722f, 0.541932f,
+ 0.437433f, 0.716268f, 0.193207f, 0.548351f, 0.326951f,
+ -0.197124f, 0.0355353f, -0.0952009f, -0.217265f, -0.389789f,
+ 0.0528124f, -0.21334f, -0.190296f, -1.17367f, 0.108905f,
+ 0.109397f, -0.0192577f, 0.0343813f, 0.085004f, -0.0556737f,
+ -0.0411158f, -0.534989f, 0.0361896f, 0.124415f, 0.291603f,
+ -0.0311974f, -0.326726f, 0.343131f, 0.0276456f, -0.231827f,
+ -0.373894f, -0.208898f, -0.273011f, 0.061323f, -0.0910538f,
+ -0.30746f, -0.108644f, -0.190736f, 1.58048f, -0.0739711f,
+ -0.0623489f, -0.137967f, -0.0601359f, -0.133004f, -0.0857153f,
+ 0.00955987f, -0.365561f, -0.0329051f, 0.463463f, 0.14758f,
+ -0.512256f, -0.227463f, -0.26008f, -0.567777f, 0.0646234f,
+ 1.02161f, 0.66157f, -0.16733f, 0.264921f, -0.242036f,
+ 0.214622f, 0.0712054f, -0.260377f, 0.0849665f, 0.735094f,
+ 0.11001f, 0.297301f, -0.333342f, 0.066978f, -0.123625f,
+ 1.07596f, 0.401263f, 0.0800875f, -0.340862f, -0.115587f,
+ -0.32692f, -0.300842f, 0.0277397f, 0.0630788f, -0.261198f,
+ 0.428695f, -0.0544757f, -0.124511f, 0.036992f, 0.126322f,
+ 0.0317603f, 0.0820762f, 0.117277f, -1.14594f, -0.108076f,
+ -0.0258198f, -0.00337525f, -0.00512531f, 0.1274f, -0.0660535f,
+ -0.640733f, 0.197142f, 0.147278f, 0.489271f, 0.226507f,
+ -0.0668414f, 0.0946318f, 0.0994164f, -0.820516f, 0.512939f,
+ -0.305172f, -0.715187f, -0.195125f, 0.279346f, 0.462144f,
+ 0.913882f, -0.453879f, 0.0582033f, -0.462866f, 0.0538736f,
+ 0.0115737f, 0.00626993f, -0.0185185f, 0.0114601f, -0.0181164f,
+ 0.41588f, -0.0447331f, 0.611756f, 0.43385f, 0.834465f,
+ 0.122019f, -0.352983f, 0.340429f, -0.245425f, -0.365328f,
+ -0.521825f, 0.0371057f, 0.172188f, -0.387949f, 0.221054f,
+ 0.0126359f, 0.422958f, 0.584198f, -0.581498f, -0.019466f,
+ -0.0271737f, -0.0740885f, 0.00540879f, 0.186086f, -0.0324402f,
+ -0.563462f, -0.458759f, -0.425296f, -0.0118862f, -0.641508f,
+ 0.0132084f, 0.0581128f, 0.0231444f, 0.468587f, 0.258838f,
+ 0.0296665f, 0.0562801f, 0.630014f, 0.381816f, -0.269761f,
+ -0.135515f, 0.046186f, 1.07632f, -0.050616f, 0.104987f,
+ 0.29991f, 0.119316f, 0.117248f, 0.0795009f, 0.242573f,
+ 0.0416634f, -0.0577639f, -0.0974078f, 0.106255f, -0.13098f,
+ 0.0141486f, -0.00418257f, 0.144848f, -0.463934f, 0.0452591f,
+ 0.252617f, 0.205222f, -0.189843f, 0.0652245f, -0.135386f,
+ 0.0500646f, -0.200368f, -0.0142312f, -0.0286832f, -0.254355f,
+ -1.02752f, -0.73549f, 0.0364518f, 0.0416227f, -0.13185f,
+ -0.0886515f, -0.502314f, -0.102916f, 0.410911f, -0.355655f,
+ 0.400416f, -0.340217f, 0.208829f, 0.245972f, 0.149739f,
+ -0.49458f, 0.589482f, 0.550827f, 0.912709f, -0.351275f,
+ -0.128076f, -0.285172f, -0.672752f, 0.090583f, -0.245286f,
+ -0.737297f, -0.201515f, -0.025122f, -0.109854f, 0.36738f
+};
+
+static const float av1_simple_motion_search_prune_part_logits_bias_64[] = {
+ 0.346819f, 0.442965f, -0.0216032f, 0.0229235f, -0.402797f,
+ -0.666074f, -0.455388f, -0.00353411f, -0.595511f, -0.845667f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_64 = {
+ NUM_FEATURES_64,
+ NUM_LOGITS_64,
+ NUM_HIDDEN_LAYERS_64,
+ {
+ NUM_LAYER_0_UNITS_64,
+ },
+ {
+ av1_simple_motion_search_prune_part_layer_0_kernel_64,
+ av1_simple_motion_search_prune_part_logits_kernel_64,
+ },
+ {
+ av1_simple_motion_search_prune_part_layer_0_bias_64,
+ av1_simple_motion_search_prune_part_logits_bias_64,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_64
+#undef NUM_FEATURES_64
+#undef NUM_LAYER_0_UNITS_64
+#undef NUM_LOGITS_64
+
+#define NUM_HIDDEN_LAYERS_32 1
+#define NUM_FEATURES_32 25
+#define NUM_LAYER_0_UNITS_32 28
+#define NUM_LOGITS_32 10
+
+static const float av1_simple_motion_search_prune_part_logits_kernel_32[] = {
+ 0.486581f, 0.340847f, -0.109226f, 0.467224f, -0.541561f,
+ 0.0943619f, -0.429442f, -0.207442f, 0.959963f, 0.618666f,
+ -0.0636751f, 0.144508f, -0.0278289f, 0.332293f, -0.751493f,
+ 0.245438f, -0.917758f, 0.612128f, -0.32648f, 0.534618f,
+ -0.615239f, 2.71641f, 0.233759f, 0.820558f, -0.249758f,
+ -0.427783f, -0.359361f, 0.0375732f, 0.806973f, 0.352512f,
+ -0.0532192f, 0.0576861f, -0.464178f, -0.334877f, -0.697042f,
+ 0.0538218f, 0.0919659f, -0.00765812f, 0.0603847f, -0.460315f,
+ 0.37979f, -0.0867612f, -0.670683f, -0.188619f, -0.570586f,
+ 0.233418f, 0.153581f, 0.290905f, -0.624885f, -0.557842f,
+ -0.555567f, 0.463773f, -0.123909f, -0.277731f, 0.0374468f,
+ 0.409903f, 0.287638f, -0.593066f, -0.223434f, 0.154263f,
+ -0.250464f, -0.077696f, 0.229652f, -0.304174f, 0.308053f,
+ 0.33155f, -0.502825f, 0.361216f, -0.499294f, 0.00595444f,
+ -0.307201f, 0.5766f, -0.438384f, -0.093701f, -0.118586f,
+ 0.202337f, -0.486623f, 0.261552f, 0.139756f, -0.655642f,
+ -0.0627001f, -0.213053f, -0.243037f, 0.205918f, 0.0718368f,
+ 0.188041f, 0.141529f, -0.132239f, 0.425827f, -0.218353f,
+ 0.153114f, 0.33268f, 0.0226116f, 0.167394f, 0.269854f,
+ -0.457001f, 0.1973f, -0.526087f, 0.467528f, 0.290934f,
+ 1.16267f, 0.0823663f, -0.754389f, -0.83716f, 0.270157f,
+ -1.41229f, 0.148511f, -0.286832f, 0.664796f, 0.492254f,
+ 0.360567f, -0.533993f, 0.0435672f, -0.103001f, 0.220668f,
+ 0.594621f, -0.0213356f, -0.347638f, -0.694457f, 0.0759505f,
+ 0.161358f, -0.389384f, -0.0455192f, -0.61252f, -0.174173f,
+ -0.00788878f, -1.22487f, 0.332233f, -0.0457021f, -0.225918f,
+ -0.197657f, -0.115408f, -0.240589f, -2.05681f, 0.00914629f,
+ -1.92213f, 0.0268578f, -0.49076f, -0.0120123f, 0.291157f,
+ 0.267116f, -0.0775724f, 0.181115f, -0.392441f, -0.488114f,
+ -0.28842f, -0.115465f, 0.128974f, -0.0829899f, -0.14096f,
+ -0.140145f, -0.700281f, 0.0368945f, -0.437598f, 0.243485f,
+ -1.00301f, 0.332324f, 0.125014f, -0.0604481f, -0.0652028f,
+ -0.207295f, -1.0209f, -0.341525f, 0.191326f, -0.147578f,
+ 0.0878327f, 0.129827f, -0.0848319f, 0.187381f, -1.28663f,
+ 0.00537885f, -0.134277f, -0.0411126f, -0.3434f, -0.0456494f,
+ 0.37861f, 0.409095f, 0.237177f, -0.396855f, -0.205418f,
+ -1.31701f, -0.319032f, -0.123404f, -0.240005f, -0.305206f,
+ -0.0258176f, -0.26367f, -0.142396f, 0.191672f, -1.44061f,
+ 0.0554776f, -0.571839f, -0.284789f, -0.425677f, -0.0307376f,
+ 0.20275f, -0.223146f, 0.144612f, 0.0212636f, 0.0238303f,
+ -0.253802f, -0.188922f, -0.0637066f, -0.340836f, 0.124774f,
+ 0.130474f, -0.154099f, -0.0292733f, 0.158148f, -0.246989f,
+ -0.259059f, 0.220224f, 0.228449f, -0.41956f, -0.321848f,
+ -0.2396f, -0.316449f, -1.3363f, 0.0264099f, -1.46865f,
+ 0.113073f, 0.0722885f, -0.166986f, -0.164877f, 0.0360911f,
+ 0.534472f, -0.551152f, -0.328501f, 0.0781121f, -0.378112f,
+ -0.459502f, 0.28015f, -0.212302f, -0.521641f, 0.618993f,
+ -0.347709f, 0.266253f, -0.0280894f, 0.348511f, -0.0155031f,
+ -0.100693f, 0.0447673f, 0.277519f, -0.233998f, -0.0796738f,
+ -1.73644f, -0.160776f, 0.53092f, -0.180406f, 0.056447f,
+ 0.385356f, -0.262337f, -0.241479f, -0.271426f, -0.457354f,
+ -0.266788f, 0.367371f, -0.103065f, 0.47783f, -0.188327f,
+ -0.159636f, 0.00142907f, -0.409756f, 0.454889f, -0.24566f,
+ -0.0760084f, 0.286355f, 0.462102f, 0.0431695f, -0.127395f,
+ -0.200476f, -0.350557f, 0.217275f, -0.23975f, 0.255148f,
+ -0.280626f, 0.42476f, 0.157411f, 0.0358675f, -0.192591f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_bias_32[] = {
+ 0.940498f, 0.15602f, -0.234831f, 0.0268585f, 0.144769f, 0.243081f,
+ 0.611406f, 0.366093f, 0.361868f, 0.39668f, 0.401479f, 0.369467f,
+ 0.0909503f, 0.710595f, 0.032786f, 0.525891f, -1.0232f, 0.732557f,
+ -0.064425f, 0.865222f, -0.042917f, -0.237191f, -0.527006f, -0.0172101f,
+ 0.59681f, -0.472405f, 0.0969218f, -0.250624f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_kernel_32[] = {
+ 0.355607f, 0.126701f, -0.0825159f, 0.200675f, -0.011308f,
+ -0.280057f, 0.559816f, 0.142689f, 0.0422419f, -0.151692f,
+ -0.0275637f, -0.283101f, -0.20822f, -0.200394f, 0.465427f,
+ 0.344491f, -0.525319f, -0.358813f, -0.39767f, 0.0974486f,
+ 0.00559058f, -0.00546089f, 0.0506486f, 0.114475f, -0.0436463f,
+ -0.574152f, -0.376294f, 0.16563f, -0.0967032f, 0.00579838f,
+ 0.0639909f, -0.037129f, 0.407574f, -0.231428f, 0.489326f,
+ -0.221566f, -0.270382f, -0.784628f, -0.155502f, 0.481698f,
+ -0.0296057f, 0.431855f, 0.840807f, 0.112291f, 0.773874f,
+ -0.0610936f, -0.012892f, 0.365154f, 0.0267687f, -0.0751114f,
+ 0.25043f, 0.516472f, -0.186133f, -0.12762f, -0.168804f,
+ -0.146309f, 0.139314f, -0.367113f, -0.601079f, 0.0559856f,
+ 0.176081f, 0.22397f, 0.434113f, 0.0363256f, 0.313051f,
+ 0.0143976f, 0.190076f, 0.474607f, -0.681134f, -0.0709097f,
+ -0.253289f, -0.216277f, -0.0593789f, -0.107795f, -0.194842f,
+ 0.513945f, 0.239171f, -0.720561f, 0.0136723f, -0.391147f,
+ -0.272043f, -0.164766f, 0.124248f, 0.147178f, -0.35497f,
+ 0.397725f, -0.117603f, 0.262937f, -0.331964f, 0.182418f,
+ 0.315671f, -0.0385649f, 0.488769f, -0.334568f, 0.00596018f,
+ 0.0661557f, -0.0446985f, -0.0928255f, -0.0221032f, -0.019045f,
+ -0.20881f, 0.197907f, -0.381881f, 0.0598071f, -0.0434551f,
+ 0.159283f, -0.110631f, 0.266996f, -0.0265494f, 0.135199f,
+ -0.00833162f, 0.804482f, -0.114698f, -0.15066f, -0.479553f,
+ 0.448407f, -0.344069f, -0.0280952f, -0.208211f, -0.102269f,
+ -0.679066f, -0.37476f, -0.0228875f, 0.0535049f, 0.111015f,
+ -0.18125f, -0.167584f, 0.0110497f, 0.262723f, -0.413839f,
+ -0.0611238f, 0.358499f, 0.0807514f, 0.208254f, 0.214499f,
+ 0.11137f, -0.14262f, -0.0513973f, 0.243718f, -0.373716f,
+ -0.00413366f, 0.216501f, -0.164149f, -0.064935f, -0.0840282f,
+ 0.0566148f, 0.0377686f, 0.289835f, 0.769388f, 0.891198f,
+ -0.592739f, 0.40744f, -0.153095f, 0.657311f, 0.140737f,
+ 0.28209f, 0.158344f, 0.353546f, 0.0868246f, 0.116887f,
+ 0.402004f, 0.437184f, 0.589219f, 0.760594f, -0.575419f,
+ -0.754308f, -0.709219f, -0.297814f, -0.418609f, -0.0262104f,
+ 0.0411959f, 0.0597708f, -0.143728f, -0.136642f, 0.099614f,
+ -0.257601f, -0.2404f, 0.305893f, 0.254009f, -0.0301398f,
+ -0.0653091f, -0.459002f, -0.163404f, 0.123152f, -0.0284252f,
+ -0.457272f, 0.00788622f, -0.828399f, -0.0534199f, 0.586877f,
+ 0.982728f, 0.424581f, 0.0891856f, 0.383182f, -0.122053f,
+ 0.0808408f, -0.00384914f, -0.0560201f, -0.0524772f, -0.263444f,
+ -0.239287f, -0.882777f, 0.0180592f, -0.0948711f, -0.177946f,
+ 0.0296473f, 0.096082f, 0.0455604f, -0.108608f, 0.00777951f,
+ -0.140896f, 0.117187f, -0.342467f, -0.0691604f, 0.0761611f,
+ -0.0892053f, 0.111386f, -0.167456f, 1.40616f, -0.00478793f,
+ 0.00547665f, -0.0441829f, 0.0151323f, -0.0674099f, -0.0380578f,
+ 0.16072f, 0.31882f, 0.245486f, -0.424318f, 0.101845f,
+ -0.203343f, -0.197402f, -0.163025f, -0.0771961f, -0.264435f,
+ 0.319429f, 0.250076f, 0.782726f, 0.386003f, 0.00700673f,
+ -0.375715f, 0.151453f, -0.296265f, -0.560183f, -0.00767249f,
+ -0.109593f, -0.119419f, -0.0161516f, 0.0380283f, -0.156417f,
+ 0.131708f, 0.396268f, -0.221796f, 0.232099f, 0.128852f,
+ 0.0567268f, 0.297297f, 0.173269f, 0.213411f, 0.0384426f,
+ -0.290985f, -0.0426841f, -0.488292f, -0.087101f, -0.311582f,
+ 0.83009f, -0.153163f, 0.903335f, -1.15644f, -0.0378635f,
+ -0.0552129f, -0.126362f, -0.176945f, 0.0653115f, 0.0989368f,
+ -0.333543f, -0.330586f, 0.29775f, -0.103535f, 0.210824f,
+ -0.00300509f, 0.317105f, 0.216852f, 0.479718f, 0.0485808f,
+ -0.15662f, 0.718199f, 0.327513f, 0.115169f, -0.423598f,
+ -0.456633f, -0.575814f, -0.494454f, 0.304411f, 0.0493055f,
+ -0.381171f, 0.467251f, -0.122872f, -0.167441f, 0.017253f,
+ -0.0583646f, -0.1586f, 0.214046f, -0.0284424f, -0.217112f,
+ 0.606567f, -0.107533f, 0.36615f, -0.0709227f, 0.604761f,
+ -0.244657f, -0.296651f, -0.595611f, -0.156629f, -0.693468f,
+ -0.310603f, 0.499272f, 0.282941f, 0.295043f, -0.178704f,
+ 0.281186f, 0.014329f, -0.120819f, 0.154234f, 0.0131325f,
+ -0.472231f, -0.631281f, 0.422955f, 0.711432f, -0.118025f,
+ 0.0864996f, 0.343971f, -0.301477f, -0.246638f, 0.165068f,
+ 0.218044f, 0.224236f, -0.0848522f, 0.00671216f, 0.401141f,
+ -0.218857f, -0.0298495f, -0.135725f, -0.377618f, 0.022473f,
+ 0.106955f, -0.0582005f, 0.0468484f, -0.0217442f, 0.130911f,
+ -0.0926905f, 0.383007f, -0.159353f, -0.222711f, -0.0286419f,
+ 0.372315f, -0.469095f, 0.797571f, -0.301315f, 0.239327f,
+ -0.997507f, -0.363409f, 0.353717f, 0.676686f, -0.0500028f,
+ 0.0638539f, -0.431927f, 0.243852f, 0.000884826f, -0.00166585f,
+ 0.0613292f, -0.029558f, -0.0248432f, -0.0125607f, -0.0309674f,
+ -0.743308f, 0.0409806f, 0.0921015f, 0.167816f, 0.406849f,
+ 0.095677f, 0.0308913f, 0.139956f, -0.400472f, 0.396617f,
+ 0.936517f, 0.355057f, -0.423816f, -0.232472f, -0.220188f,
+ -0.399746f, -0.409623f, -0.158797f, 0.361153f, 0.0327019f,
+ 0.0690844f, -0.032197f, 0.0248558f, 0.00438518f, 0.0222724f,
+ -0.326832f, -0.314295f, 0.156563f, 0.0562703f, 0.332694f,
+ 0.299424f, 0.228206f, 0.322038f, 0.0136098f, 0.0060297f,
+ -0.165851f, -0.306512f, 0.0796508f, -0.37158f, 0.239395f,
+ -0.349442f, 0.198515f, -0.253854f, -1.13694f, 0.0202873f,
+ -0.0504009f, -0.130528f, -0.017126f, -0.0370001f, -0.087458f,
+ -0.119952f, -0.130404f, 0.0333733f, -0.184736f, 0.182162f,
+ 0.227776f, -0.166563f, -0.156162f, 0.118215f, -0.220183f,
+ 0.00474779f, -0.107792f, 0.260493f, 0.11884f, 0.156587f,
+ 0.303936f, -0.131788f, -0.314774f, 0.310606f, 0.0935523f,
+ 0.790767f, 0.26461f, 0.0236426f, 0.0629469f, 0.0344072f,
+ -0.151513f, 0.211498f, 0.0245435f, 0.0629973f, 0.052019f,
+ -0.03308f, 0.123487f, 0.0885027f, 0.159172f, -0.0510615f,
+ 0.0298033f, -0.130515f, -0.121799f, -0.104915f, 0.208822f,
+ -0.310496f, -0.314106f, 0.303307f, -0.0196736f, 0.0420045f,
+ 0.461777f, -0.433699f, 0.00345407f, 0.703139f, -0.655637f,
+ -0.210767f, -0.201278f, 0.163694f, -0.236534f, 0.300877f,
+ 0.0769982f, -0.282453f, 0.149721f, -0.0303466f, -0.191473f,
+ -0.406056f, -0.213472f, 0.1619f, -0.245953f, 0.00544399f,
+ -0.121434f, 0.193012f, -0.307165f, 1.45431f, -0.161468f,
+ -0.12444f, -0.146129f, -0.0528212f, -0.0925165f, -0.134528f,
+ -0.479475f, 0.315525f, 0.133845f, 0.382158f, -0.0799693f,
+ -0.151041f, 0.255772f, 0.409536f, -0.240663f, -0.323741f,
+ -0.205876f, 0.03699f, -0.217541f, 0.108511f, 0.640628f,
+ 0.705993f, -0.423899f, -0.78314f, -0.100733f, -0.00859087f,
+ 0.0251879f, 0.0458335f, 0.00210128f, -0.047576f, -0.0560518f,
+ -1.23869f, -0.829914f, 0.0346551f, 0.350505f, 0.193688f,
+ 0.459154f, 0.137898f, 0.503818f, 0.260867f, 0.649539f,
+ 0.0150802f, 0.0239274f, -0.276069f, -0.0621478f, -0.193106f,
+ -0.0375665f, -0.654529f, 0.189493f, 0.446625f, -0.0208265f,
+ 0.019838f, -0.0201955f, 0.00180428f, -0.0110678f, -0.0172414f,
+ 0.0276489f, -0.252882f, -0.0351807f, -0.0518874f, 0.279098f,
+ -0.245122f, 0.101287f, -0.114202f, -0.0812187f, 0.572429f,
+ -0.0821731f, 0.564183f, 0.0222552f, 0.190111f, -0.0417497f,
+ -0.00385925f, -0.182995f, -0.240482f, -0.291572f, -0.0450444f,
+ 0.0962974f, -0.165973f, -0.0954637f, -0.163841f, -0.833405f,
+ -1.31541f, -0.336473f, -0.0920702f, 0.816105f, 0.393377f,
+ 0.0340241f, -0.0844545f, 0.61729f, -0.17596f, 0.241149f,
+ -0.42825f, -0.59091f, -0.290702f, 0.0796465f, 0.0982819f,
+ 0.466934f, 0.261666f, 0.0373333f, 0.332509f, -0.0266694f,
+ -0.0476951f, -0.00642167f, -0.0132542f, -0.000320841f, 0.00475532f,
+ 0.000502778f, 0.296534f, -0.13297f, -0.113082f, -0.327923f,
+ 0.35901f, -0.302246f, 0.189799f, -0.37994f, 0.16107f,
+ -0.20414f, 0.548575f, -0.460821f, 0.591878f, -0.213113f,
+ -0.169373f, -0.07332f, 0.228841f, 0.682302f, -0.0665316f,
+ -0.142456f, -0.0873117f, 0.00607451f, 0.0376443f, 0.0536673f,
+ -0.0109536f, -0.400279f, 0.550058f, 0.820871f, -0.666373f,
+ -0.471962f, -0.315925f, -0.313142f, 0.952742f, 0.473928f,
+ -0.119006f, 0.153241f, -0.0383078f, 0.631869f, -0.343423f,
+ -0.233473f, -0.218195f, -0.077688f, -0.728291f, 0.0382408f,
+ -0.00662886f, -0.0419666f, 0.0309776f, -0.0281592f, 0.0154229f,
+ -0.198534f, 0.0206324f, 0.0152272f, -0.235067f, 0.0330486f,
+ 0.139198f, -0.0612118f, 0.133154f, -0.258675f, 0.0900275f,
+ -0.127771f, 0.157322f, -0.00767807f, -0.329258f, 0.327458f,
+ 0.0528581f, -0.181125f, 0.409995f, -0.162979f, -0.0193475f,
+ 0.186009f, 0.0519501f, 0.651877f, -0.37821f, -1.10341f,
+ -0.189776f, -0.0922788f, 0.460256f, 0.168011f, 0.440295f,
+ 0.478135f, 0.374573f, 0.384048f, 0.116953f, 0.68886f,
+ -0.427727f, -0.36676f, -0.500013f, -0.228685f, -0.218859f,
+ 0.208396f, -0.0173765f, -0.0680241f, -0.00538013f, -0.0674409f,
+ -0.092764f, 0.0295707f, -0.0462887f, -0.00636006f, 0.0334169f
+};
+
+static const float av1_simple_motion_search_prune_part_logits_bias_32[] = {
+ 0.176459f, 0.154405f, 0.281821f, 0.375264f, -0.882863f,
+ -0.240261f, -1.17075f, -0.280216f, -0.743836f, -0.317511f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_32 = {
+ NUM_FEATURES_32,
+ NUM_LOGITS_32,
+ NUM_HIDDEN_LAYERS_32,
+ {
+ NUM_LAYER_0_UNITS_32,
+ },
+ {
+ av1_simple_motion_search_prune_part_layer_0_kernel_32,
+ av1_simple_motion_search_prune_part_logits_kernel_32,
+ },
+ {
+ av1_simple_motion_search_prune_part_layer_0_bias_32,
+ av1_simple_motion_search_prune_part_logits_bias_32,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_32
+#undef NUM_FEATURES_32
+#undef NUM_LAYER_0_UNITS_32
+#undef NUM_LOGITS_32
+
+#define NUM_HIDDEN_LAYERS_16 1
+#define NUM_FEATURES_16 25
+#define NUM_LAYER_0_UNITS_16 32
+#define NUM_LOGITS_16 10
+
+static const float av1_simple_motion_search_prune_part_logits_kernel_16[] = {
+ -0.520913f, 0.395611f, 0.0369091f, -0.318591f, -0.463252f,
+ 0.134992f, -0.43154f, -0.0739112f, -0.118817f, 0.476373f,
+ -0.281406f, 0.3413f, 0.456255f, 0.33307f, 0.2942f,
+ 0.1317f, 0.498113f, 1.95406f, -0.165726f, -0.219306f,
+ -0.302656f, -1.31157f, -0.433662f, 0.151716f, -0.214817f,
+ 0.504523f, -0.710049f, 0.359616f, -0.412695f, -0.103193f,
+ 0.341912f, 0.351378f, -0.181486f, 0.573862f, -0.0396254f,
+ -0.17855f, -0.276163f, 0.0367465f, -0.353905f, -0.204689f,
+ 0.309581f, -0.0439686f, -0.147855f, 0.152745f, 0.290871f,
+ 0.131049f, -0.27808f, -0.142997f, 0.207843f, -1.23074f,
+ -0.267714f, -0.336923f, 0.313781f, -0.61488f, -0.161984f,
+ 0.238059f, -0.0879942f, -0.085543f, -0.260156f, -0.13614f,
+ -0.242196f, 0.201216f, -0.248691f, 0.0936671f, -0.350522f,
+ -0.35002f, -0.156583f, -0.00579001f, 0.300578f, -0.341269f,
+ -0.290712f, 0.354802f, -0.31629f, 0.509107f, -0.236953f,
+ -0.0923519f, 0.544509f, -0.280991f, -0.017437f, -0.202721f,
+ -0.116388f, -0.7191f, 0.324586f, 0.254249f, 0.125505f,
+ 0.00658697f, -0.333322f, -0.126537f, -0.140004f, -0.0241202f,
+ -0.172466f, 0.210035f, -0.270833f, 0.0579044f, 0.0950352f,
+ -0.120382f, 0.063292f, -0.394925f, 0.482165f, 0.147753f,
+ 0.331465f, -0.187444f, 0.1083f, 0.414028f, 0.279238f,
+ -0.486889f, -0.674349f, -0.313656f, -0.131186f, -0.100662f,
+ 0.238191f, -1.19083f, -0.30667f, -2.4324f, 0.235311f,
+ 0.108605f, 1.67197f, 0.476157f, 0.30055f, 0.0839538f,
+ 0.408469f, -0.473517f, 0.560283f, -0.0188136f, 0.273824f,
+ -0.43707f, -0.0346978f, -0.438315f, -0.0196275f, -0.0567921f,
+ -0.220166f, 0.216175f, -0.0180461f, 0.0116429f, -0.0096949f,
+ -0.32613f, 0.176829f, -0.243563f, -0.240972f, -0.621819f,
+ -0.00619648f, -0.145525f, 0.124324f, -0.0306925f, 0.172208f,
+ -2.04631f, -0.200087f, -0.594135f, -0.352303f, -0.309826f,
+ 0.0922786f, -0.698371f, -0.0366823f, 0.0244036f, 0.338775f,
+ -0.115947f, 0.144971f, -0.0607037f, -0.762412f, 0.0125584f,
+ -0.262427f, -0.0830273f, -0.291252f, -0.176059f, -0.203983f,
+ 0.0871455f, -0.0894925f, 0.0426263f, -0.060001f, -0.542355f,
+ -0.407837f, -0.0419273f, 0.226608f, -0.114844f, 0.158733f,
+ -0.187237f, 0.113163f, -1.86337f, -0.367544f, -0.547048f,
+ -0.24192f, -0.226764f, 0.090912f, 0.819604f, 0.433766f,
+ -0.841657f, 0.446987f, -0.622761f, -0.0296385f, -0.130176f,
+ -0.0518136f, -0.640326f, -0.330107f, -0.137832f, -0.0119033f,
+ 0.39401f, 0.111331f, -0.141367f, -0.230289f, 0.171054f,
+ -0.924059f, -0.107317f, -0.347983f, 0.0261109f, 0.423002f,
+ -0.305817f, 0.247696f, 0.0436002f, 0.0305862f, -1.52448f,
+ -0.595587f, -0.155552f, -1.11949f, -0.513937f, 0.138347f,
+ -0.301487f, 0.352144f, -0.615801f, 0.0326701f, -0.215322f,
+ -0.0608176f, -0.416557f, -0.306073f, -0.441512f, -0.0569277f,
+ -0.709768f, -0.602527f, -0.311134f, 0.152471f, -0.255299f,
+ 0.354505f, 0.194464f, 0.0144251f, 0.110732f, -0.4452f,
+ -0.804814f, 0.205325f, -0.0957486f, 0.502684f, 0.09112f,
+ -0.533087f, -1.77979f, 0.556992f, -0.176157f, -0.642633f,
+ 0.11553f, -0.232561f, 0.161277f, -0.0631125f, -0.20759f,
+ 0.489253f, -0.067533f, 0.0231024f, -0.179831f, -0.272985f,
+ -0.390059f, 0.3089f, 0.185733f, -0.257065f, -0.508838f,
+ -0.550028f, 0.0665621f, -0.138288f, -0.413188f, 0.191193f,
+ -1.32969f, -0.431025f, 0.270242f, -0.340062f, 0.0817257f,
+ 0.0376051f, -0.18633f, 0.0828274f, 0.00670051f, -0.431295f,
+ -0.450316f, -0.173042f, -0.322248f, 0.370628f, 0.10019f,
+ 0.317293f, -0.266613f, 0.0752441f, -0.425656f, -0.112223f,
+ 0.557991f, -0.324368f, -0.195261f, -0.0526129f, -0.807472f,
+ -0.387466f, 0.192186f, 0.353213f, -0.120238f, 0.107686f,
+ 0.200678f, -0.75363f, 0.466857f, -0.282345f, -0.0849236f,
+ -0.0490695f, -0.00643182f, 0.123047f, -0.207805f, -0.130456f,
+ -1.09455f, 0.340973f, 0.334784f, 0.0706643f, -1.65681f,
+ -0.319952f, -0.198514f, -0.0787972f, 0.089524f, 0.0531034f,
+ -0.202705f, -0.0852339f, -0.62572f, -0.0734234f, -0.838088f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_bias_16[] = {
+ -0.0616197f, 0.939947f, 0.521161f, 0.213886f, 0.130324f, -0.127443f,
+ -0.0538715f, 0.708746f, 0.445031f, 0.418781f, -0.114539f, 0.521941f,
+ 1.13719f, 0.606545f, -0.32193f, -0.150788f, 0.158487f, -0.224005f,
+ 0.654715f, 0.115729f, -0.286506f, -2.06223f, 0.0117697f, 0.503905f,
+ -0.102339f, 0.653256f, -0.813561f, 0.905235f, -0.417269f, -0.206265f,
+ 0.661496f, 0.95533f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_kernel_16[] = {
+ -0.203489f, 0.00686229f, -0.161414f, 0.0637276f, 0.27516f,
+ 0.512219f, 0.164205f, 0.00326062f, -0.41914f, -0.400334f,
+ 0.554419f, 0.715772f, -0.295569f, -0.703503f, 0.0137744f,
+ -0.0934259f, 0.174234f, -0.148618f, -0.0360558f, -0.0986598f,
+ -0.138502f, -0.0770713f, 0.122922f, -0.00784415f, 0.0953234f,
+ -0.255754f, -0.310967f, 0.185306f, 0.464554f, 0.147338f,
+ -0.0612304f, 0.164783f, 0.301097f, 0.161364f, -0.12723f,
+ -0.0265984f, -0.471361f, 0.0578776f, -0.362865f, 0.425789f,
+ 0.402758f, -0.190235f, 0.00549738f, -0.570908f, 1.27206f,
+ 0.048868f, -0.0097675f, 0.0708324f, 0.0456103f, 0.0149062f,
+ -0.563032f, -0.420573f, 0.107278f, 0.0938258f, 0.142712f,
+ -0.00251036f, -0.250583f, 0.522272f, 0.0113175f, 0.126751f,
+ -0.433028f, -0.035542f, -0.536686f, -0.0668722f, 0.253094f,
+ 0.254007f, -0.435505f, 0.343001f, 0.0531542f, -0.361914f,
+ -0.102664f, 0.0404874f, 0.132686f, 0.0762298f, 0.0236971f,
+ -0.419454f, 0.230877f, -0.223714f, 0.037813f, 0.0818604f,
+ 0.383705f, -0.235028f, -0.0554801f, 0.429851f, 0.0845829f,
+ 0.166295f, 0.355111f, -0.421197f, 0.298949f, 0.0218224f,
+ 0.445705f, -0.392217f, -0.429578f, -0.076276f, -0.0963531f,
+ -0.631425f, -0.225977f, 8.06349e-06f, 0.0676679f, 0.0779651f,
+ 0.0706891f, 0.101377f, 0.517103f, 0.0945502f, -0.52522f,
+ -0.312022f, 0.0358089f, 0.616509f, -0.0507444f, -0.465814f,
+ -0.0326024f, 0.591298f, 0.188544f, -0.0633316f, -0.199987f,
+ 0.403118f, -0.511281f, -0.696263f, 0.112996f, 0.103875f,
+ 0.0495595f, -0.0107449f, 0.521539f, -0.0123823f, -0.0642751f,
+ 0.08548f, -0.0679207f, 0.526558f, 0.0651114f, -0.342643f,
+ -0.349934f, 0.307437f, 0.368763f, -0.194851f, -0.134117f,
+ 0.102448f, -0.0520666f, 0.0415824f, -0.175085f, 0.272685f,
+ 0.0675856f, 0.120627f, 0.391408f, -0.135249f, -0.357024f,
+ 0.019666f, -0.0622677f, 0.407427f, 0.22655f, -0.129432f,
+ -0.165327f, 0.004893f, 0.5479f, 0.0613981f, -0.479682f,
+ -0.144228f, -0.130106f, 0.206458f, -0.342086f, 0.12691f,
+ -0.113554f, 0.231164f, -0.051419f, 0.0401286f, -0.560429f,
+ -0.070609f, 0.420232f, 0.442465f, -0.237501f, -0.000293732f,
+ -1.017f, -0.210222f, 0.0157063f, 0.0488178f, 0.0734721f,
+ -0.52626f, -0.276441f, -0.521579f, 0.443532f, -0.0819051f,
+ -0.0732633f, -0.17999f, 0.258525f, -0.0374872f, 0.150115f,
+ 0.0510939f, 0.168116f, 0.473372f, 0.824489f, 0.302195f,
+ -0.348613f, 0.238569f, 0.176444f, -0.633945f, -0.0567195f,
+ -0.0305827f, -0.0551851f, 0.85822f, -0.0628099f, 0.0364294f,
+ -0.234823f, 0.179067f, 0.143208f, -0.0511014f, -0.404191f,
+ 0.428035f, 0.0235506f, 0.371991f, -0.312909f, 0.550933f,
+ -0.389265f, -0.271813f, -0.293461f, -0.583752f, 0.179991f,
+ 0.191698f, 0.659094f, 1.07941f, -0.509555f, -0.100638f,
+ 0.079988f, -0.0519107f, -0.112723f, -0.0663326f, 0.0353569f,
+ -0.795055f, -0.465999f, 0.283579f, 0.340913f, 0.152738f,
+ 0.294664f, 0.527839f, 0.187735f, 0.359461f, 0.164629f,
+ 0.107512f, 0.390402f, 0.236702f, 0.114674f, -0.525655f,
+ -0.555476f, -0.6589f, -0.266601f, -0.0946547f, 0.6306f,
+ 0.0248513f, 0.038497f, 0.432706f, -0.0715465f, 0.0410172f,
+ -0.115313f, -0.428684f, 0.136283f, 0.0913185f, 0.11277f,
+ 0.0968689f, -0.00437052f, 0.0888981f, 0.10304f, 0.02442f,
+ -0.211315f, 0.00981596f, -0.0974827f, 0.208611f, 0.140644f,
+ 0.0315567f, 0.350332f, -0.291049f, -0.0715449f, -0.352992f,
+ -0.858004f, 0.828658f, 0.439092f, 0.0151291f, 0.0503828f,
+ 0.0656112f, -0.710749f, -0.0951757f, 0.193908f, 0.00908018f,
+ 0.141486f, -0.0657711f, 0.099791f, 0.153729f, -0.419576f,
+ -0.892636f, -0.0449268f, -0.170786f, -0.156564f, 0.384511f,
+ 0.296565f, 0.0569815f, -0.103938f, 1.27479f, -0.0406475f,
+ 0.154083f, -0.186442f, 0.0282588f, 0.0312102f, -0.188994f,
+ 0.284243f, -0.564693f, 0.425525f, -0.00924596f, 0.810003f,
+ 0.233812f, -0.0180273f, 0.121082f, -0.209096f, 0.151437f,
+ 0.286921f, -0.348095f, 0.174813f, -0.413798f, 0.108994f,
+ -0.34266f, -0.0337981f, -0.459f, -0.409812f, -0.0890104f,
+ 0.0834802f, -0.00259191f, -0.105914f, -0.164207f, 0.0697689f,
+ -0.312098f, -0.00650536f, -0.486758f, -0.248486f, 0.24314f,
+ -0.0857144f, 0.0884781f, -0.65615f, -0.121744f, 0.0709335f,
+ -0.0237193f, 0.10764f, -0.0409452f, -0.0824305f, 0.42329f,
+ 0.138258f, 0.502607f, 0.228545f, 0.0687789f, 0.0361586f,
+ 0.39074f, 0.0722654f, -0.0133148f, 0.283278f, 0.0743384f,
+ 0.310292f, -0.297675f, -0.359935f, 0.521021f, -0.10082f,
+ -0.272333f, 0.0120283f, 0.138118f, -0.123711f, -0.0711386f,
+ 0.0170747f, 0.831039f, 0.0509626f, 0.790608f, -0.0863406f,
+ -0.31962f, 0.0631013f, 0.0873453f, -0.472331f, -0.0826027f,
+ -0.241722f, 0.148835f, -0.131611f, 0.000195347f, -0.0615804f,
+ -0.838663f, -0.586979f, 0.247713f, 0.362254f, 0.492727f,
+ -0.132163f, 0.0516545f, 0.477838f, -0.0395182f, 0.0124993f,
+ -0.771514f, 0.0386912f, -0.118525f, -0.346172f, -0.265905f,
+ -0.175257f, -0.406287f, 0.393837f, 0.409096f, -0.408501f,
+ -0.0207146f, 0.0487809f, 0.0636982f, 0.0276368f, 0.0878249f,
+ 0.0425889f, 0.0868633f, 0.17423f, -0.128217f, -0.477068f,
+ -0.321294f, 0.0393771f, 0.00812823f, -0.350529f, -0.129012f,
+ 0.439953f, 0.396662f, 0.410475f, -0.123129f, -0.565966f,
+ 0.0298635f, -0.614611f, -0.477514f, 0.453651f, 0.0617068f,
+ 0.0530563f, 0.0479074f, 0.213551f, 0.039034f, 0.0449095f,
+ -1.06868f, -1.2654f, -0.175482f, 0.595068f, -0.230095f,
+ 0.719838f, -0.272148f, 0.696564f, 0.0485396f, 0.468584f,
+ 0.0695439f, -0.0842122f, -0.228978f, 0.161397f, -0.000441421f,
+ -0.0297514f, -0.250599f, 0.196656f, 0.608423f, -0.0112096f,
+ 0.0236881f, -0.00167311f, 0.0040709f, 0.015495f, 0.00757698f,
+ -0.165886f, 0.359767f, -0.0214696f, 0.377208f, 0.0303547f,
+ 0.0657094f, 0.140775f, 0.21867f, -0.203922f, 0.263878f,
+ -0.0529099f, 0.202438f, -0.243226f, 0.156659f, -0.627056f,
+ -0.845036f, -0.500873f, 0.172588f, 0.402972f, -0.147734f,
+ 0.151792f, -0.075579f, 0.443519f, 0.0311335f, -0.0328222f,
+ -0.0299781f, 0.435956f, -0.0987376f, 0.288402f, 0.135902f,
+ -0.173584f, -0.186255f, 0.224524f, -0.249645f, 0.123702f,
+ -0.0846244f, 0.491317f, 0.544846f, 0.338677f, -0.258885f,
+ -0.617434f, -0.629003f, -0.347233f, 0.181262f, -0.0606015f,
+ -0.537766f, 0.215089f, -0.334527f, 0.0488534f, 0.0577997f,
+ -1.12431f, -0.932292f, -0.11559f, 0.573715f, 0.151128f,
+ 0.693818f, -0.16956f, 0.802591f, -0.231531f, 1.04318f,
+ -0.476417f, 0.293452f, -0.610136f, 0.27506f, -0.384012f,
+ 0.305366f, -0.0540464f, -0.337583f, -0.174285f, 0.157248f,
+ 0.0477345f, -0.0229535f, 0.0475766f, -0.00603319f, 0.00856119f,
+ -0.702893f, -0.0579673f, 0.183024f, -0.166222f, 0.109763f,
+ -0.148019f, -0.258873f, -0.0820157f, -0.186716f, -0.449265f,
+ -0.0534138f, 0.15732f, 0.46357f, 0.00502591f, -0.0282085f,
+ 0.152277f, -0.855199f, -0.357115f, 0.0366159f, 0.0131101f,
+ -0.0407758f, 0.0462835f, 0.146309f, -0.00276278f, -0.0591814f,
+ -0.109437f, 0.506764f, -0.044421f, 0.465907f, 0.114444f,
+ -0.241053f, -0.362649f, -0.432615f, 0.199989f, -0.00635866f,
+ -0.521886f, 0.0958924f, -0.485725f, 0.0430527f, 0.069746f,
+ 0.681091f, -0.288144f, 0.505671f, 0.0489065f, -0.0373836f,
+ 0.266079f, 0.145173f, -0.011481f, -0.225074f, -0.754501f,
+ -0.122939f, -0.294213f, 0.334738f, 0.281561f, 0.558977f,
+ -0.21551f, -0.346507f, -0.0625635f, 0.0782034f, -0.236999f,
+ -0.803783f, -0.601117f, 0.091192f, 0.636122f, -0.250626f,
+ 0.0354961f, 0.103915f, 0.508571f, 0.329911f, -0.0425999f,
+ -0.0867587f, -0.0385824f, 1.13914f, -0.0261992f, 0.00484478f,
+ 0.124603f, -0.012173f, -0.377358f, -0.243563f, 0.236094f,
+ 0.145663f, -0.132752f, 0.347497f, -0.529315f, 0.271632f,
+ -0.372805f, 0.0261836f, 0.126169f, 0.0941008f, 0.283773f,
+ 0.765701f, -0.226477f, -0.181549f, -0.306896f, 0.110165f,
+ -0.0784234f, -0.0827892f, -0.0374252f, -0.0950872f, -0.451015f,
+ -0.995793f, -0.452663f, 0.293338f, -0.380865f, 0.032683f,
+ 0.0178248f, 0.0699194f, -0.0811722f, -0.0866096f, 0.139289f,
+ 0.296604f, 0.192293f, -0.0589607f, -0.179878f, 0.00360266f,
+ -0.0905794f, 0.136744f, -0.191555f, 1.31877f, -0.0592033f,
+ -0.158766f, 0.0214746f, -0.190113f, -0.116671f, 0.0449292f,
+ -0.109533f, -0.709307f, 0.386424f, 0.40201f, 0.262211f,
+ -0.155244f, 0.233988f, -0.0166317f, 0.462665f, 0.0484462f,
+ 0.210902f, -0.352798f, 0.38698f, -0.228261f, -0.084309f,
+ -0.220751f, -0.170879f, -0.352617f, -1.24277f, 0.266004f,
+ -0.0125749f, -0.0380073f, 0.101838f, -0.0483024f, -0.0629178f,
+ -0.0695577f, -0.103439f, 0.242131f, -0.0796858f, 0.349718f,
+ -0.332045f, 0.0138352f, -0.380235f, -0.28717f, -0.176276f,
+ 0.865903f, 0.36593f, 0.243925f, -0.422289f, -0.117327f,
+ 0.21876f, 0.245393f, -0.426134f, -0.186077f, 0.0352515f,
+ -0.123742f, 0.249376f, 1.3281f, 0.0707771f, 0.071415f,
+ -0.286827f, -0.131691f, -0.270881f, -0.434378f, 0.376064f,
+ 0.35966f, 0.513374f, 0.439378f, -0.222716f, -0.5874f,
+ 0.487997f, -0.293271f, -0.184245f, -0.037256f, 0.17723f,
+ -0.438651f, 0.428184f, 0.112983f, -0.449287f, -0.0451963f,
+ 0.0854929f, 0.0735442f, -0.0148642f, -0.0586782f, -0.176455f,
+ -0.438979f, -0.127109f, 0.211478f, 0.388035f, -0.0372021f,
+ 0.220575f, 0.382144f, 0.302121f, 0.0857121f, 0.193445f,
+ -0.488858f, -0.195288f, -0.316184f, -0.314026f, -0.111956f,
+ 0.0744768f, 0.292709f, 0.30187f, -0.285506f, -0.105006f,
+ 0.0851402f, -0.082318f, 0.277518f, 0.725294f, -0.756304f,
+ 0.0155309f, -0.378542f, 0.293377f, -0.347252f, -0.338458f,
+ 0.221449f, -0.176443f, -0.131972f, 0.0129163f, -0.290649f,
+ 0.198596f, -0.0721333f, 0.620591f, 0.568736f, 0.174001f,
+ -0.205186f, -0.265606f, -0.249155f, 0.299163f, 1.11842f,
+ 0.17423f, 0.196417f, -0.014484f, 0.0735422f, 0.26329f,
+ 0.12284f, -0.750305f, -0.351337f, 0.121994f, -0.00542878f,
+ -0.295707f, -0.094124f, 0.300993f, 0.412408f, -0.170761f,
+ -0.0676329f, -0.106638f, -0.419785f, -0.43878f, 0.22421f,
+ 0.0339903f, 0.619851f, 0.0615381f, 0.514631f, 1.35424f,
+ -0.0679228f, -0.203457f, 0.131948f, -0.0041251f, -0.209054f
+};
+
+static const float av1_simple_motion_search_prune_part_logits_bias_16[] = {
+ 0.304025f, 0.131887f, 0.259279f, -0.561564f, -0.161729f,
+ -0.208036f, 0.102206f, -0.162937f, -1.42311f, -0.708305f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_16 = {
+ NUM_FEATURES_16,
+ NUM_LOGITS_16,
+ NUM_HIDDEN_LAYERS_16,
+ {
+ NUM_LAYER_0_UNITS_16,
+ },
+ {
+ av1_simple_motion_search_prune_part_layer_0_kernel_16,
+ av1_simple_motion_search_prune_part_logits_kernel_16,
+ },
+ {
+ av1_simple_motion_search_prune_part_layer_0_bias_16,
+ av1_simple_motion_search_prune_part_logits_bias_16,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_16
+#undef NUM_FEATURES_16
+#undef NUM_LAYER_0_UNITS_16
+#undef NUM_LOGITS_16
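+
+// The NN_CONFIG tables above and below all describe the same network shape:
+// one fully connected hidden layer followed by a linear logits layer. As a
+// rough illustration only, the sketch below shows one plausible forward pass
+// for such a config, assuming ReLU hidden activations and a node-major
+// weight layout; the function name is hypothetical and this is not the
+// encoder's actual evaluator (libaom has its own NN prediction helper,
+// av1_nn_predict()).
+static inline void hypothetical_nn_forward_1layer(
+    const float *features, int num_features, const float *hidden_kernel,
+    const float *hidden_bias, int num_hidden, const float *logits_kernel,
+    const float *logits_bias, int num_logits, float *hidden_buf,
+    float *logits_out) {
+  // Hidden layer: dense matrix-vector product plus bias, clamped at zero.
+  for (int h = 0; h < num_hidden; ++h) {
+    float v = hidden_bias[h];
+    for (int f = 0; f < num_features; ++f)
+      v += hidden_kernel[h * num_features + f] * features[f];
+    hidden_buf[h] = v > 0.0f ? v : 0.0f;  // assumed ReLU activation
+  }
+  // Logits layer: linear, no output activation.
+  for (int o = 0; o < num_logits; ++o) {
+    float v = logits_bias[o];
+    for (int h = 0; h < num_hidden; ++h)
+      v += logits_kernel[o * num_hidden + h] * hidden_buf[h];
+    logits_out[o] = v;
+  }
+}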
+
+#define NUM_HIDDEN_LAYERS_8 1
+#define NUM_FEATURES_8 25
+#define NUM_LAYER_0_UNITS_8 32
+#define NUM_LOGITS_8 4
+
+static const float av1_simple_motion_search_prune_part_logits_kernel_8[] = {
+ -0.266303f, -0.387676f, 0.204501f, -0.120842f, -0.0752326f, 0.0337739f,
+ 0.0243477f, -0.356748f, 0.0143051f, -0.16403f, -0.139013f, 0.175003f,
+ -0.206754f, 0.349059f, 0.181763f, 0.212768f, -0.313783f, 0.182829f,
+ 0.00205376f, -0.939525f, -0.0992424f, 0.306254f, 0.083329f, -0.133137f,
+ -0.179022f, -0.0237902f, 0.0601026f, -0.216698f, -0.551149f, 0.081711f,
+ -0.442191f, 0.0680832f, -0.0353678f, 0.237704f, 0.23155f, -0.36097f,
+ 0.123389f, -0.288927f, 0.178133f, -0.152222f, -0.235648f, -0.0495293f,
+ -0.316522f, 0.034207f, 0.0463139f, -0.817825f, 0.417443f, -0.110984f,
+ -0.402371f, 0.0341694f, -0.37383f, 0.414532f, 0.093993f, 0.0039505f,
+ 0.0803175f, -0.511859f, -0.0154802f, 0.0979595f, 0.0909049f, -0.120938f,
+ -0.577382f, -0.155041f, -0.404295f, 0.122223f, -0.084703f, 0.00415336f,
+ 0.149135f, 0.113219f, 0.124236f, -0.240905f, 0.163909f, -0.154202f,
+ -0.208917f, 0.00200158f, -0.71796f, 0.105984f, -0.131996f, -0.539603f,
+ 0.223768f, -0.0710733f, -0.346679f, -0.0745909f, 0.171032f, 0.215701f,
+ 0.218519f, 0.105981f, -0.096209f, -0.166453f, -0.468894f, -0.401578f,
+ -0.239222f, 0.111382f, 0.38747f, -0.164734f, -0.175955f, 0.336621f,
+ -0.0305501f, -0.0576765f, 0.0672671f, -0.183692f, 0.412082f, -0.262951f,
+ -0.153429f, -0.128589f, -0.530472f, 0.0936412f, -1.08296f, -0.45147f,
+ 0.0714904f, -3.96842f, 0.438125f, -0.313945f, 0.231104f, -0.00183851f,
+ -0.0192768f, -0.637531f, -0.109296f, 0.0531702f, 0.00262162f, -0.615951f,
+ -0.546241f, -0.635305f, -0.0762367f, 0.0122019f, 0.423693f, -0.129142f,
+ -0.112242f, 0.295184f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_bias_8[] = {
+ -2.16023f, -3.12831f, -0.213206f, -2.97875f, -1.83791f, -2.84713f,
+ -0.909636f, -2.05893f, 0.00525274f, -1.51672f, -3.95017f, 1.82847f,
+ -0.853224f, -3.29503f, -0.537517f, 0.923106f, -3.18665f, -1.29905f,
+ 1.64506f, -1.99848f, -2.24315f, 0.408613f, 0.503671f, -3.83393f,
+ -2.88388f, -3.52337f, 1.46818f, -1.67169f, -3.83253f, 1.52644f,
+ -0.490783f, -0.415782f
+};
+
+static const float av1_simple_motion_search_prune_part_layer_0_kernel_8[] = {
+ -0.702198f, -0.102148f, 0.0564545f, -0.0555548f, 0.16184f,
+ 0.0950792f, 0.136974f, -0.00824146f, 0.05746f, 0.0447542f,
+ 0.145978f, 0.0855769f, -0.041449f, 0.301347f, -0.0206691f,
+ -0.0662514f, -0.0525079f, -0.0998387f, -0.0891438f, 0.110545f,
+ -0.863098f, -1.83798f, 0.238818f, 0.127797f, 0.116872f,
+ -0.270655f, -0.21057f, 0.197013f, -0.123332f, 0.137104f,
+ -0.174766f, -0.00803025f, 0.0234369f, -0.0894175f, -0.0380927f,
+ 0.00827928f, -0.134148f, 0.110575f, -0.250173f, 0.116273f,
+ 0.0197749f, 0.270391f, 0.108437f, 0.173197f, -0.0650348f,
+ 0.0884626f, 0.262792f, 0.0649228f, 0.5573f, -2.81315f,
+ -0.479801f, -1.15825f, 0.0807932f, -0.19144f, 0.404016f,
+ -0.211521f, 0.233269f, -0.391414f, 0.160381f, -0.277233f,
+ 0.426354f, 0.156839f, 0.494315f, -0.214259f, -0.0132062f,
+ 0.148628f, -0.0899568f, 0.161845f, 0.467689f, 0.229474f,
+ 0.590634f, -0.705793f, -0.0486113f, -0.439088f, 0.994566f,
+ 0.679065f, 0.777869f, -0.225291f, -0.0303006f, -0.638782f,
+ -0.0824632f, -0.128561f, -0.327603f, 0.105624f, 0.567581f,
+ -0.396135f, -0.471028f, 0.181286f, 0.274604f, 0.180169f,
+ 0.0612144f, -0.865004f, 0.0306804f, 0.142985f, -0.0914358f,
+ -0.243284f, 0.358359f, -0.443847f, -0.371978f, 0.606933f,
+ -0.900408f, -0.52076f, 0.472118f, 0.0610973f, 0.152526f,
+ -0.550379f, 0.309331f, -0.141573f, 0.203046f, -0.231485f,
+ 0.505156f, 0.393224f, 0.435487f, -0.218681f, 0.123707f,
+ -0.270383f, -0.033565f, 0.210373f, -2.33967f, 0.367434f,
+ 0.0308118f, -0.205771f, 0.546141f, 0.19837f, 0.035648f,
+ -0.467007f, -1.50995f, -0.0314176f, 0.11762f, -0.15307f,
+ 0.618257f, -0.139502f, 0.303386f, -0.00758681f, 0.228107f,
+ -0.594499f, -0.201984f, -0.239666f, 0.114878f, -0.922174f,
+ -0.530137f, -0.379366f, -0.319582f, 0.0889624f, -0.00544663f,
+ 0.316264f, -0.204262f, -0.0959358f, 0.23552f, 0.141369f,
+ -0.207129f, -1.04067f, -0.0780501f, 0.226768f, -0.246752f,
+ 0.0823105f, 0.114783f, 0.49315f, 0.0197732f, 0.705433f,
+ 0.158076f, -0.250584f, -0.157326f, -0.0439547f, -0.139047f,
+ 0.090531f, -0.38833f, 0.743143f, -1.47418f, -0.155009f,
+ 0.511466f, -0.726716f, -0.181075f, 0.450133f, -0.390204f,
+ 0.292725f, 0.00811462f, -0.347738f, 0.613381f, -0.237124f,
+ 0.750748f, -0.383123f, 0.410309f, -0.204166f, 0.667199f,
+ -0.313197f, 0.436059f, -0.607571f, 0.193681f, 0.409399f,
+ 0.631747f, -0.0454149f, 0.198232f, 0.345591f, -0.0137374f,
+ -0.307014f, -0.535515f, 0.764678f, -0.225686f, -0.451621f,
+ -2.75564f, -1.52877f, 0.0511933f, 0.905979f, 0.145029f,
+ 0.759615f, 0.130166f, 0.83827f, 0.0655081f, 1.07555f,
+ -0.529777f, 0.682967f, -0.412052f, 0.611947f, -0.83676f,
+ 0.940695f, -0.465681f, 0.51505f, -0.883659f, -0.105524f,
+ -0.0344173f, -0.0683618f, -0.00698688f, -0.139349f, 0.135741f,
+ -0.294455f, -0.377834f, -0.602084f, -1.00128f, 0.483291f,
+ 1.25327f, 0.178987f, 0.75068f, -0.520731f, -0.325517f,
+ 0.272032f, 0.144144f, -0.279453f, 0.564907f, 0.144036f,
+ 0.297448f, -0.504243f, -0.250508f, -1.26395f, 0.4816f,
+ 0.392771f, -0.389961f, -0.261585f, -0.127124f, -0.202945f,
+ -0.709716f, -0.174719f, 0.113613f, 0.477753f, -0.226659f,
+ 0.0697828f, -0.177994f, 0.300726f, -0.185504f, 0.339424f,
+ -0.316746f, 0.369693f, -0.339723f, -0.143886f, -0.0326589f,
+ -0.268761f, -0.241094f, 0.284876f, -0.0270867f, -0.207397f,
+ -1.42738f, 0.495612f, -0.0277732f, 0.199675f, 1.48638f,
+ -0.659257f, -1.28199f, 0.498702f, 0.140695f, 0.571152f,
+ 0.416368f, 0.14153f, 0.126876f, 0.521114f, -0.00150571f,
+ 0.375581f, 0.00537624f, 0.1286f, -0.332227f, 0.417663f,
+ -0.539023f, 0.217124f, -0.787111f, -0.0335266f, 1.56751f,
+ 0.0640563f, -0.158791f, 0.118195f, 0.000970493f, -0.0403852f,
+ -0.0572557f, -0.0201181f, -0.10255f, 0.63237f, 0.156662f,
+ 0.418696f, -0.274802f, -0.663923f, -0.375232f, -0.40846f,
+ 0.462092f, 1.2176f, -0.301532f, -0.779704f, -0.112876f,
+ 0.0806591f, -0.0141923f, 0.00960801f, -0.663557f, 0.0979948f,
+ -0.0575999f, -0.012847f, 0.0403853f, -0.133666f, -0.00330217f,
+ -0.931518f, -0.774599f, -0.21391f, 0.377601f, -0.183365f,
+ 0.299094f, 0.0238552f, 0.206716f, -0.18959f, 0.346013f,
+ -0.150991f, -0.192817f, -0.293962f, -0.0537604f, -0.0648171f,
+ -0.275941f, -0.144854f, -0.224092f, 2.43113f, 0.0422494f,
+ -0.047236f, -0.0262028f, 0.0282119f, -0.175553f, 0.0888502f,
+ 0.580682f, 0.951055f, -0.284441f, -0.120133f, -0.268058f,
+ -0.312083f, -0.411556f, 0.21431f, -0.28033f, 0.324851f,
+ -1.02787f, -0.936816f, -0.577628f, 0.544743f, 0.295807f,
+ 0.406157f, 0.447927f, 0.25369f, -0.811421f, -0.0424979f,
+ -0.189867f, 0.00778673f, -0.113587f, -0.116175f, -0.0542222f,
+ -1.80089f, -1.44175f, -0.35332f, 0.191314f, -0.236691f,
+ -0.0261926f, -0.502363f, 0.252278f, -0.485478f, 0.296495f,
+ 0.455612f, -0.0489631f, 0.227255f, 0.170975f, 0.473487f,
+ 0.257812f, 0.178048f, 0.2506f, 2.04637f, -0.173857f,
+ 0.0583379f, 0.00765589f, -0.025772f, -0.162666f, -0.016214f,
+ -0.607486f, -0.0808025f, 0.0551611f, -0.0772291f, 0.126421f,
+ 0.10869f, -0.0877463f, -0.111527f, -0.0775766f, 0.503886f,
+ -0.002757f, -0.0421354f, -0.247857f, 0.140827f, 0.383576f,
+ 0.228232f, -0.157877f, -0.0927911f, 0.344687f, 0.191181f,
+ 0.236533f, 0.00102869f, -0.0184502f, -1.4509f, -1.15945f,
+ -0.521978f, -0.643225f, 0.133139f, 0.0660321f, 0.0851957f,
+ 0.0303648f, 0.0296239f, 0.0455713f, 0.175647f, 0.080532f,
+ 0.0445691f, -0.257356f, -0.125602f, -0.138829f, -0.167057f,
+ -0.0992552f, -0.13944f, 0.507531f, 0.444997f, 0.221452f,
+ -0.308384f, -0.327554f, 0.13235f, 2.1487f, -1.15453f,
+ -0.280239f, -0.363582f, -0.00358745f, 0.012866f, 0.251088f,
+ 0.0676416f, 0.178492f, -0.136631f, 0.197938f, -0.078198f,
+ 0.812439f, 1.1173f, 0.712113f, 1.10124f, -0.836503f,
+ -1.22433f, -1.07894f, -1.29215f, 0.56057f, 2.23928f,
+ -0.419029f, 0.282178f, -0.0719266f, -0.172192f, 0.28034f,
+ -2.99124f, -2.01481f, 0.0688982f, 0.697466f, 0.00635555f,
+ 0.566069f, 0.047534f, 0.507755f, -0.00690707f, 0.712594f,
+ -0.191467f, 0.355733f, -0.480016f, 0.664669f, -0.390619f,
+ 0.351199f, -0.482342f, 0.325005f, 1.9089f, 0.155987f,
+ 0.17032f, 0.132729f, 0.0402649f, 0.146991f, 0.0314905f,
+ -0.775316f, -0.208892f, -0.105993f, 0.0181653f, -0.12735f,
+ 0.0897852f, 0.0470231f, 0.25807f, 0.127406f, -0.0893252f,
+ -0.279776f, 0.190844f, 0.110384f, -0.148833f, 0.025293f,
+ 0.239838f, 0.00932245f, 0.35103f, -0.128268f, -0.0536754f,
+ 0.506899f, -0.16793f, 0.0955582f, -2.01108f, 0.721433f,
+ -2.31413f, -2.08646f, 0.033315f, 0.689828f, -0.271213f,
+ 0.790425f, -0.114234f, 0.755325f, -0.211533f, 0.774544f,
+ -0.263268f, 0.795762f, -0.551455f, 0.953602f, -0.168454f,
+ 0.529055f, -0.768991f, 0.882371f, 0.29763f, -0.155017f,
+ 0.00464101f, 0.121093f, 0.948271f, 0.113138f, -0.110332f,
+ -2.0492f, -1.31322f, -0.129212f, 0.464778f, -0.181465f,
+ 0.618403f, 0.0627984f, 0.465228f, 0.165729f, 0.278277f,
+ -0.563276f, -0.358358f, -0.590638f, 0.0104993f, 0.731206f,
+ 0.752569f, 0.631615f, 0.811822f, 0.129804f, -0.0558327f,
+ 0.570081f, -0.417922f, -0.168275f, 0.0703671f, 0.269127f,
+ 0.240457f, -0.197159f, -0.00179261f, 0.220065f, 0.463511f,
+ 0.0714626f, -0.716477f, -0.441865f, -0.717028f, -0.149176f,
+ 0.452182f, 0.662699f, -0.906534f, -0.817133f, 0.237747f,
+ 0.26024f, -7.7441e-05f, 0.0934616f, 0.824641f, -0.0404494f,
+ -0.088297f, -0.157899f, 0.037408f, 0.132435f, -0.316155f,
+ -0.276785f, 0.0117868f, 0.185008f, 0.32369f, -0.465855f,
+ -0.302127f, 0.303289f, 0.338597f, -0.665408f, -0.507594f,
+ 0.526979f, 0.532091f, 0.234395f, 0.754063f, 0.116769f,
+ 0.0800309f, -0.939344f, -1.51269f, 1.4583f, 0.178444f,
+ 0.0106756f, -0.213468f, -0.00369439f, 0.071015f, -0.192798f,
+ -0.0933147f, -0.129901f, -0.368279f, -0.246564f, 0.126966f,
+ 0.478565f, -0.476246f, -0.762863f, 0.168883f, 0.536136f,
+ -0.272969f, 0.2573f, -0.161577f, 0.311428f, -0.777994f,
+ -1.29752f, 0.216046f, 0.329016f, 1.57265f, 0.168075f,
+ -0.192518f, 0.0829308f, -0.073533f, -0.0202034f, 0.114716f,
+ -0.34888f, -0.519215f, 0.190809f, 0.0138507f, 0.133635f,
+ 0.14194f, 0.410618f, -0.165106f, 0.214438f, 0.0438265f,
+ -0.8481f, -1.19182f, -1.07878f, -0.882217f, 0.45616f,
+ 0.977385f, 0.74929f, 0.918466f, 0.904704f, 0.041938f,
+ 0.0362776f, 0.0757255f, 1.14007f, 0.0516825f, -0.160068f,
+ 0.219535f, 0.638634f, -0.0284544f, -0.222849f, -0.0344915f,
+ -0.0350256f, -0.0504452f, -0.0458416f, 0.146099f, 0.0783083f,
+ 0.206579f, 0.241264f, 0.28401f, 0.0425312f, -0.802049f,
+ -0.746271f, -0.578969f, -0.078218f, 0.436176f, -0.281465f,
+ -2.5539f, 0.237868f, -0.121796f, 0.0715619f, 0.106992f,
+ -0.621862f, -0.167142f, 0.153716f, 0.0570912f, -0.06525f,
+ -0.923773f, 0.130759f, 0.0517066f, 0.0729862f, -0.873064f,
+ 0.0403328f, -0.186499f, -0.0831918f, -0.223723f, 0.144697f,
+ 0.212845f, 0.416876f, 0.361598f, 0.138229f, 0.0728777f,
+ -1.95419f, -0.00382816f, -0.0440387f, 0.433627f, 0.44781f,
+ -1.05229f, -1.54506f, 0.564827f, -0.263456f, 0.296105f,
+ -0.158055f, 0.388274f, -0.366639f, 0.212006f, -0.245619f,
+ 0.593064f, 0.088727f, 0.410632f, -0.263462f, 0.507075f,
+ -0.0974155f, 0.275268f, -0.1293f, 0.136679f, 1.98276f,
+ 0.411766f, 0.391987f, 0.34283f, -0.114077f, 0.258462f,
+ -0.302443f, 0.301138f, -0.00726621f, 0.276441f, -0.291582f,
+ 0.66498f, -0.321451f, -0.332805f, 0.0943272f, 0.572253f,
+ -0.45818f, -0.0219593f, -0.151679f, 0.402033f, -1.15502f,
+ -0.882955f, 0.772904f, 0.88126f, -0.149555f, 0.709525f,
+ 0.350116f, -0.21531f, 0.797893f, 0.0230234f, 0.0203034f,
+ 0.2744f, 1.08273f, 0.039349f, 0.503909f, -0.45892f,
+ -0.579516f, -0.344058f, 0.390628f, -0.386941f, -0.430317f,
+ -0.0807066f, 0.435906f, 0.522996f, 0.724476f, -0.74371f,
+ -0.05376f, -0.340898f, -0.962646f, -0.0278005f, 0.0981149f,
+ -0.0811161f, 0.00237994f, 0.850042f, 0.0665473f, 0.134413f
+};
+
+static const float av1_simple_motion_search_prune_part_logits_bias_8[] = {
+ 1.63404f, -0.715866f, -1.0132f, -2.08745f
+};
+
+static const NN_CONFIG av1_simple_motion_search_prune_part_nn_config_8 = {
NUM_FEATURES_8,
NUM_LOGITS_8,
NUM_HIDDEN_LAYERS_8,
@@ -2883,22 +3967,839 @@ static const NN_CONFIG full_pixel_motion_search_based_split_nn_config_8 = {
NUM_LAYER_0_UNITS_8,
},
{
- full_pixel_motion_search_based_split_layer_0_kernel_8,
- full_pixel_motion_search_based_split_logits_kernel_8,
+ av1_simple_motion_search_prune_part_layer_0_kernel_8,
+ av1_simple_motion_search_prune_part_logits_kernel_8,
},
{
- full_pixel_motion_search_based_split_layer_0_bias_8,
- full_pixel_motion_search_based_split_logits_bias_8,
+ av1_simple_motion_search_prune_part_layer_0_bias_8,
+ av1_simple_motion_search_prune_part_logits_bias_8,
},
};
-static const float full_pixel_motion_search_based_split_thresh_8 = 2.0f;
+#undef NUM_HIDDEN_LAYERS_8
+#undef NUM_FEATURES_8
+#undef NUM_LAYER_0_UNITS_8
+#undef NUM_LOGITS_8
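+
+// The prune-part models emit one logit per candidate partition type. A
+// minimal sketch, assuming the usual softmax mapping from logits to pruning
+// probabilities, follows; the function name is hypothetical, and the
+// numerically stable max-subtraction is a standard trick rather than
+// something the tables themselves mandate.
+#include <math.h>  // expf(), needed only by the sketch below
+static inline void hypothetical_prune_part_softmax(const float *logits,
+                                                   float *probs, int n) {
+  float max_logit = logits[0];
+  for (int i = 1; i < n; ++i)
+    if (logits[i] > max_logit) max_logit = logits[i];
+  float sum = 0.0f;
+  for (int i = 0; i < n; ++i) {
+    probs[i] = expf(logits[i] - max_logit);  // shift for numerical stability
+    sum += probs[i];
+  }
+  for (int i = 0; i < n; ++i) probs[i] /= sum;
+}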
+
+#define FEATURE_SIZE 19
+static const float av1_2pass_split_partition_weights_128[FEATURE_SIZE + 1] = {
+ 2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f,
+ 0.125296f, -1.134961f, 0.862757f, -0.418799f, -0.637666f,
+ 0.016232f, 0.345013f, 0.018823f, -0.393394f, -1.130700f,
+ 0.695357f, 0.112569f, -0.341975f, -0.513882f, 5.7488966f,
+};
+
+static const float av1_2pass_split_partition_weights_64[FEATURE_SIZE + 1] = {
+ 2.990993f, 0.423273f, -0.926544f, 0.454646f, -0.292698f,
+ -1.311632f, -0.284432f, 0.717141f, -0.419257f, -0.574760f,
+ -0.674444f, 0.669047f, -0.374255f, 0.380624f, -0.804036f,
+ 0.264021f, 0.004163f, 1.896802f, 0.924287f, 0.13490619f,
+};
+
+static const float av1_2pass_split_partition_weights_32[FEATURE_SIZE + 1] = {
+ 2.795181f, -0.136943f, -0.924842f, 0.405330f, -0.463505f,
+ -0.584076f, -0.831472f, 0.382985f, -0.597544f, -0.138915f,
+ -1.354350f, 0.466035f, -0.553961f, 0.213202f, -1.166429f,
+ 0.010776f, -0.096236f, 2.335084f, 1.699857f, -0.58178353f,
+};
+
+static const float av1_2pass_split_partition_weights_16[FEATURE_SIZE + 1] = {
+ 1.987888f, -0.431100f, -1.687703f, 0.262602f, -0.425298f,
+ -0.463870f, -1.493457f, 0.470917f, -0.528457f, -0.087700f,
+ -1.815092f, 0.152883f, -0.337908f, 0.093679f, -1.548267f,
+ -0.042387f, -0.000861f, 2.556746f, 1.619192f, 0.03643292f,
+};
+
+static const float av1_2pass_split_partition_weights_8[FEATURE_SIZE + 1] = {
+ 2.188344f, -0.817528f, -2.119219f, 0.000000f, -0.348167f,
+ -0.658074f, -1.960362f, 0.000000f, -0.403080f, 0.282699f,
+ -2.061088f, 0.000000f, -0.431919f, -0.127960f, -1.099550f,
+ 0.000000f, 0.121622f, 2.017455f, 2.058228f, -0.15475988f,
+};
+
+static const float av1_2pass_none_partition_weights_128[FEATURE_SIZE + 1] = {
+ -1.006689f, 0.777908f, 4.461072f, -0.395782f, -0.014610f,
+ -0.853863f, 0.729997f, -0.420477f, 0.282429f, -1.194595f,
+ 3.181220f, -0.511416f, 0.117084f, -1.149348f, 1.507990f,
+ -0.477212f, 0.202963f, -1.469581f, 0.624461f, -0.89081228f,
+};
+
+static const float av1_2pass_none_partition_weights_64[FEATURE_SIZE + 1] = {
+ -1.241117f, 0.844878f, 5.638803f, -0.489780f, -0.108796f,
+ -4.576821f, 1.540624f, -0.477519f, 0.227791f, -1.443968f,
+ 1.586911f, -0.505125f, 0.140764f, -0.464194f, 1.466658f,
+ -0.641166f, 0.195412f, 1.427905f, 2.080007f, -1.98272777f,
+};
+
+static const float av1_2pass_none_partition_weights_32[FEATURE_SIZE + 1] = {
+ -2.130825f, 0.476023f, 5.907343f, -0.516002f, -0.097471f,
+ -2.662754f, 0.614858f, -0.576728f, 0.085261f, -0.031901f,
+ 0.727842f, -0.600034f, 0.079326f, 0.324328f, 0.504502f,
+ -0.547105f, -0.037670f, 0.304995f, 0.369018f, -2.66299987f,
+};
+
+static const float av1_2pass_none_partition_weights_16[FEATURE_SIZE + 1] = {
+ -1.626410f, 0.872047f, 5.414965f, -0.554781f, -0.084514f,
+ -3.020550f, 0.467632f, -0.382280f, 0.199568f, 0.426220f,
+ 0.829426f, -0.467100f, 0.153098f, 0.662994f, 0.327545f,
+ -0.560106f, -0.141610f, 0.403372f, 0.523991f, -3.02891231f,
+};
+
+static const float av1_2pass_none_partition_weights_8[FEATURE_SIZE + 1] = {
+ -1.463349f, 0.375376f, 4.751430f, 0.000000f, -0.184451f,
+ -1.655447f, 0.443214f, 0.000000f, 0.127961f, 0.152435f,
+ 0.083288f, 0.000000f, 0.143105f, 0.438012f, 0.073238f,
+ 0.000000f, -0.278137f, 0.186134f, 0.073737f, -1.6494962f,
+};
+#undef FEATURE_SIZE
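+
+// Each av1_2pass_*_partition_weights_* table holds FEATURE_SIZE per-feature
+// weights followed by one trailing bias term. A minimal sketch of how such a
+// table is plausibly applied (the function name is hypothetical; whether the
+// caller thresholds the raw score or a sigmoid of it is an assumption):
+static inline float hypothetical_linear_partition_score(const float *features,
+                                                        const float *weights,
+                                                        int num_features) {
+  float score = weights[num_features];  // trailing entry acts as the bias
+  for (int i = 0; i < num_features; ++i) score += weights[i] * features[i];
+  return score;
+}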
+
+// NN model for predicting the maximum square partition level of a superblock
+#define NUM_HIDDEN_LAYERS 1
+#define NUM_FEATURES 13
+#define NUM_LAYER_0_UNITS 48
+#define NUM_LOGITS 4
+
+static const float av1_max_part_pred_logits_kernel[] = {
+ -0.304561f, 0.0885596f, -0.988539f, 1.08147f, 0.215213f,
+ 0.202965f, -0.828457f, -0.233945f, -0.0866977f, -0.115521f,
+ 0.02079f, 0.196491f, -0.0285075f, 0.05067f, -0.00872862f,
+ 0.00281844f, -0.238954f, 0.0253801f, 0.0257775f, 0.339269f,
+ 0.176174f, -0.152545f, -0.0588704f, -1.62275f, -0.189329f,
+ 0.0808033f, 0.233844f, -4.53798f, 0.674968f, -0.0361688f,
+ -0.0754075f, 1.16129f, -0.0188879f, 0.113255f, -3.04378f,
+ 0.814728f, -0.568517f, -0.00179383f, -3.61223f, -1.67535f,
+ -2.20417f, -0.197196f, 0.0507745f, -0.0909394f, -0.0507879f,
+ -1.27999f, -0.055623f, 0.0318497f, 0.192867f, 0.138726f,
+ 0.0443392f, -0.595075f, -0.166774f, 0.0882958f, -0.348161f,
+ 0.0214428f, -0.0599275f, -0.0995385f, -0.82358f, 0.141205f,
+ -0.053232f, 0.00508296f, -1.90872f, 1.15004f, -0.194219f,
+ 0.0229019f, -0.00354318f, 0.22016f, 0.154101f, -0.159231f,
+ -0.0446647f, -0.197503f, 0.0408453f, 0.197659f, 0.797858f,
+ -0.189722f, 0.343653f, 0.124666f, -1.03083f, 0.603059f,
+ 0.101565f, 0.0932993f, 0.462484f, 0.295984f, 1.11198f,
+ 0.143709f, -0.846232f, -0.464392f, -1.06058f, -0.124889f,
+ 0.0727475f, 1.18446f, -0.100302f, 0.0641918f, -0.101622f,
+ 0.10219f, 0.130189f, 0.0915623f, -0.166904f, -1.10606f,
+ -0.16726f, -0.146152f, 0.145443f, -0.177091f, -0.0215214f,
+ 0.0158506f, -0.553294f, 0.0784749f, -0.0416628f, -0.027785f,
+ 0.280027f, 0.484898f, -0.164225f, 0.0238317f, -0.0345254f,
+ 0.0410244f, 0.131529f, 0.0239622f, -0.0749436f, -0.0224914f,
+ 0.128926f, 0.224539f, 0.413297f, 0.0638572f, 0.103308f,
+ 0.0913242f, -0.119274f, 0.0163103f, 0.113828f, 0.119809f,
+ 0.297057f, -0.124889f, -0.533108f, -0.181408f, -0.129896f,
+ 0.0221064f, -0.0773281f, -0.0386467f, 0.0342961f, 0.126575f,
+ -0.24114f, 0.0735576f, 0.0524791f, 0.246896f, -0.130674f,
+ -0.03979f, 0.173639f, 1.95193f, -0.113029f, -0.0305852f,
+ -0.00671737f, 0.157159f, -0.00102858f, -0.543688f, 0.566772f,
+ 0.124124f, -0.0294064f, -0.0699021f, -0.0704103f, -0.766097f,
+ -0.0625802f, -0.0906173f, -0.0520414f, -0.0272724f, 0.283064f,
+ 0.236213f, -0.127319f, 0.019392f, 0.170042f, -0.0214542f,
+ 0.0740938f, 0.356578f, -0.236257f, 0.269021f, 0.114759f,
+ -0.641166f, 0.136308f, -0.0386959f, -0.112024f, -0.361209f,
+ 0.686095f, 0.183906f, 0.288656f, 0.182007f, 0.337458f,
+ 0.058974f, -0.305512f, -0.841708f, -0.243779f, -0.0614058f,
+ 0.208747f, 0.448697f
+};
+
+static const float av1_max_part_pred_layer_0_bias[] = {
+ -0.776544f, -2.0022f, -0.330294f, 2.47665f, 1.90206f, -1.61571f,
+ 0.536246f, 1.00455f, 5.24561f, 1.55111f, -0.816399f, -4.88703f,
+ -1.06417f, -1.15359f, -0.145289f, 1.91831f, 0.630915f, -1.94256f,
+ -3.35239f, -1.05007f, -1.05186f, 1.36824f, -5.2878f, 1.10482f,
+ -5.00077f, -0.0445198f, 3.41427f, 2.3439f, -0.413306f, -1.88152f,
+ -2.28638f, 8.24783f, -1.91961f, -1.49324f, 1.96599f, -6.32309f,
+ -0.332426f, -0.425506f, 4.06511f, 5.84386f, 4.15747f, 1.22402f,
+ 2.8512f, 2.53027f, 0.0170272f, -1.43966f, -0.997785f, 5.43064f
+};
+
+static const float av1_max_part_pred_logits_bias[] = { -4.25432f, 0.144758f,
+ 1.96217f, 0.728905f };
+
+static const float av1_max_part_pred_layer_0_kernel[] = {
+ 0.992471f, 0.533006f, 0.143743f, -2.51788f, -0.468337f,
+ -0.201376f, -0.151834f, 0.479883f, 1.16061f, -0.278878f,
+ -0.814954f, -0.152405f, -0.0521608f, 0.797104f, -2.08912f,
+ 0.385839f, -2.22889f, -0.106858f, -0.239766f, -0.951128f,
+ -0.698753f, 0.0831051f, 1.1702f, 0.342834f, -0.0352795f,
+ -0.0847639f, -0.802086f, 0.258982f, 1.14174f, 0.645885f,
+ -1.19226f, -0.592888f, -0.343659f, 1.1912f, 1.45411f,
+ -1.22927f, 0.152858f, 0.00373585f, -1.60637f, 0.592611f,
+ 0.0857475f, -0.346147f, -0.150784f, -0.0817408f, -0.189918f,
+ -0.804952f, -1.33036f, -1.03307f, 0.0248769f, 0.16607f,
+ -2.896f, -2.1293f, 0.12293f, -0.173179f, -0.212128f,
+ -6.76221f, 0.033188f, 0.0231787f, 0.905957f, 0.0551327f,
+ -0.356276f, 0.0181795f, 0.0977523f, -0.0352873f, -0.0396386f,
+ 2.3241f, 0.0632874f, -0.11804f, -6.32521f, 0.0224659f,
+ -0.00188896f, 0.267992f, 0.272337f, 0.00936963f, 0.659969f,
+ -2.25707f, -0.0278229f, -0.0185089f, -1.14466f, 0.104827f,
+ 0.0435885f, 0.558586f, -0.00697004f, 0.0312611f, 0.540574f,
+ -0.568625f, 0.218608f, 0.378911f, -0.0289192f, -0.0734742f,
+ -1.08782f, -2.42069f, -0.0127239f, 0.0493651f, -1.15837f,
+ 0.261831f, 0.401824f, -1.04545f, 0.284173f, 0.784972f,
+ -0.511243f, -0.982599f, -0.106134f, -0.325964f, -1.44107f,
+ -1.42434f, -1.02402f, -1.52034f, 0.0737116f, 0.0462242f,
+ 0.628722f, -1.0405f, -0.113718f, 2.20573f, -4.33951f,
+ -0.0192695f, -0.0229314f, -1.89156f, 0.645942f, 0.375708f,
+ -1.97447f, -0.267014f, 0.0989443f, -0.450534f, -1.01737f,
+ -0.642416f, -0.0897288f, -2.08724f, -0.190965f, -0.279135f,
+ -0.830178f, 0.808754f, -0.139091f, 1.11004f, -0.454439f,
+ -0.479238f, -1.44001f, 0.0888059f, 0.885689f, -0.642505f,
+ -0.00773651f, -0.0265721f, -0.906346f, 1.68504f, 0.084257f,
+ -0.951101f, -8.06495f, 0.19231f, 0.16389f, -0.193678f,
+ 0.729837f, -1.98392f, -5.98513f, 3.32638f, -0.0658378f,
+ -0.0910426f, -0.666567f, -0.315339f, 0.123124f, -2.66375f,
+ -0.714852f, -0.136176f, -0.460166f, -0.567551f, -1.06193f,
+ -1.21389f, -0.83865f, 0.00280695f, -0.199519f, -0.534704f,
+ 0.419311f, -0.149008f, -3.68707f, 0.00285113f, -0.0718198f,
+ -1.41026f, -1.34155f, -0.538687f, -0.623666f, -2.56462f,
+ -0.0183333f, -0.323532f, -1.27141f, -0.0212039f, 0.198633f,
+ 0.459554f, -4.65103f, -1.01293f, -1.39512f, -0.289026f,
+ 0.208724f, -0.665226f, 1.13369f, -1.96734f, -1.45442f,
+ -3.46172f, 0.810681f, -0.603973f, 0.842764f, -3.90371f,
+ -0.394561f, -3.61363f, -2.88085f, 0.031645f, -0.23125f,
+ -2.63898f, -1.35314f, -0.46726f, 1.33145f, 1.20269f,
+ 1.38682f, -0.331637f, 0.069021f, 0.149523f, -1.24957f,
+ -0.878857f, -0.200368f, 0.465744f, 1.01365f, -0.0122221f,
+ -0.550586f, -1.12581f, -0.422132f, -0.0744868f, -2.4804f,
+ -1.07072f, -0.479006f, 0.101817f, -0.118947f, 0.341576f,
+ -1.0538f, -0.812346f, -1.13727f, -0.00939806f, 10.1571f,
+ -0.0441302f, 0.00280407f, -21.5044f, 0.0181152f, -0.0143246f,
+ 3.23462f, -1.38624f, -1.80416f, 4.89763f, -2.67364f,
+ 2.31771e-05f, 0.000393989f, 0.352204f, -0.193455f, 0.531455f,
+ 0.488757f, -0.442555f, -0.518528f, 0.431482f, -2.67727f,
+ -2.00626f, -0.39729f, -0.221494f, -0.0188888f, -0.0377649f,
+ -1.80169f, 0.0810332f, -0.0408335f, -1.28675f, -0.0353824f,
+ -0.666723f, -1.07281f, 0.252912f, -1.24547f, -1.7831f,
+ -1.14354f, -0.137662f, 0.00230182f, 0.736862f, 0.175872f,
+ -0.187556f, 0.43963f, -0.796524f, 0.056219f, -0.387874f,
+ 0.0710224f, -0.16548f, -0.100993f, 0.931481f, -3.20738f,
+ -0.0197576f, 0.266148f, -0.173909f, -0.337795f, -0.0682381f,
+ 0.176844f, 0.140286f, 1.12033f, 0.429064f, -2.24192f,
+ -1.54682f, 2.23646f, -0.0371138f, -0.0475339f, -3.21766f,
+ 0.0412858f, 0.387811f, 6.6711f, 0.140649f, 0.0559547f,
+ -0.802839f, 0.599977f, 0.64552f, -2.08103f, -0.503401f,
+ -0.0407036f, -0.0299199f, 0.0849445f, -0.111657f, -1.63462f,
+ 3.33762f, 0.0441394f, 0.0466889f, -0.951806f, 0.0723954f,
+ 0.00348661f, -1.36903f, 2.24625f, -0.0348915f, -0.0508893f,
+ -0.240891f, -0.120143f, -0.17991f, -2.09137f, 0.0150871f,
+ 0.0480333f, 1.72012f, 0.0309551f, -0.0370507f, -0.377075f,
+ 0.103916f, -0.0169255f, -0.0145395f, -4.02144f, 0.83193f,
+ -0.316502f, 6.3832f, -1.70038f, -1.97215f, -1.94501f,
+ 1.45479f, 0.711725f, -0.348496f, -0.279056f, -1.13396f,
+ -1.51744f, -0.853307f, 1.53131f, -0.0032358f, 1.41808f,
+ -1.32989f, -0.245221f, -0.161614f, -0.500845f, -0.449252f,
+ 0.0724151f, -0.116333f, -0.0946182f, -2.0945f, 0.0564572f,
+ 0.393261f, -1.06861f, -0.111458f, -0.839943f, -0.0880348f,
+ 0.0365742f, 0.415339f, -1.57494f, -0.713697f, 1.02349f,
+ -0.221371f, -0.0446281f, 1.89223f, -0.0811754f, -0.402773f,
+ -0.930987f, 0.0243194f, 0.0678332f, -0.0233014f, 0.165372f,
+ -0.44083f, -1.2404f, 0.35675f, -0.040916f, -0.0512548f,
+ -2.9071f, 0.861174f, -0.778133f, 2.14436f, -0.688427f,
+ -0.480371f, -1.69032f, 0.706687f, -0.281982f, -2.30451f,
+ 1.61541f, -0.0213638f, -0.740509f, -0.266677f, 0.0268434f,
+ -0.0116908f, -3.17595f, 0.0114825f, 0.0196997f, -0.144005f,
+ 0.0550181f, -0.851459f, -0.000285073f, -0.538441f, -0.0254868f,
+ -0.0104454f, -0.0661998f, -0.196469f, -0.346372f, -5.52892f,
+ -0.643683f, -0.622224f, -0.31463f, -0.555956f, -0.520132f,
+ -0.843166f, -2.59479f, -0.750195f, 0.00635995f, -0.338615f,
+ -0.216676f, -0.391544f, -1.62185f, -0.718471f, -0.475406f,
+ -0.782041f, -0.608824f, -1.09633f, -1.27308f, -0.560719f,
+ -0.207539f, -0.0196445f, -1.05519f, -0.575249f, -1.0642f,
+ 1.01615f, -0.873633f, -0.417953f, -0.428051f, 0.350259f,
+ -2.53833f, -2.72203f, 0.672846f, -0.503094f, -1.1374f,
+ 0.214291f, 0.013305f, 0.0112064f, 1.10532f, 0.030455f,
+ 0.0239614f, 0.628072f, 0.0539135f, -0.472441f, -0.688439f,
+ -0.32044f, -0.0234867f, -0.0158436f, -0.949314f, -0.0453161f,
+ -1.18306f, 0.626845f, -0.426925f, -0.688371f, 0.415062f,
+ 0.0640985f, -0.638387f, -2.01399f, -0.209744f, -0.762892f,
+ -0.0753296f, -0.879315f, -0.520433f, -0.111375f, 0.389742f,
+ -0.398862f, -0.643227f, -0.246396f, 0.0317051f, 1.06973f,
+ 0.413617f, 0.180506f, -0.0507897f, -0.00650435f, 0.620892f,
+ 0.046312f, 0.475032f, 0.906993f, -0.0388061f, -0.256271f,
+ -1.03323f, 0.0125266f, -0.31116f, -0.377611f, -0.0386407f,
+ -0.0232745f, -0.353644f, -2.27289f, 0.0571779f, -0.00865006f,
+ 1.65101f, 0.0175711f, 0.0184585f, 0.558458f, 0.2213f,
+ -0.285089f, 0.433445f, -0.427177f, -0.0103682f, -0.0101273f,
+ 0.214085f, -0.0459885f, 0.00761981f, 0.836381f, 0.0175293f,
+ 0.02508f, -1.51778f, 0.0143956f, -0.162589f, 0.595418f,
+ 0.21445f, -0.0335848f, -0.0136684f, -0.16686f, -0.14612f,
+ 0.0816238f, 0.499636f, 0.12458f, -2.41673f, -0.261721f,
+ -0.676805f, -1.88366f, 0.730462f, 0.69196f, -0.0288489f,
+ -2.38272f, 0.329876f, 0.014517f, -0.115145f, -3.48151f,
+ -0.00209072f, -0.0732377f, 0.820443f, -0.0118701f, 0.112145f,
+ 0.272315f, 0.137531f, -0.0200997f, -0.0397883f, -2.19458f,
+ 0.183554f, -0.639716f, 0.481605f, -0.621639f, -0.0980299f,
+ -0.710534f, -0.143105f, -6.77626f, -1.65139f, -2.37718f,
+ -0.533127f, -1.12574f, 3.34182f, -0.0758663f, 0.0334238f,
+ -9.48647f, 0.0674974f, 0.0507665f, 0.523007f, -0.0668f,
+ 0.5736f, -0.589761f, -1.1692f, -0.0236497f, -0.00828928f,
+ -0.265823f, 1.15284f, 0.307927f, -0.695308f, 0.13725f,
+ -0.20394f, -0.363965f, -0.331159f, -1.50927f, -1.20051f,
+ -0.0205825f, -0.0381859f, -0.0579876f, -1.6913f, -1.94626f,
+ 3.4214f, 3.3922f, -2.13798f, -0.679848f, -0.890735f,
+ 0.235017f, -0.253202f, -1.0571f, 1.40354f, 0.00719052f,
+ -1.54365f, -0.7289f, -1.05492f, 0.0238169f, -0.00543592f,
+ -0.0510353f, -0.175386f, -0.724207f, -0.788936f, 0.039976f,
+ 1.36966f, 0.869475f, -0.0302774f, -0.0537556f
+};
+
+static const NN_CONFIG av1_max_part_pred_nn_config = {
+ NUM_FEATURES,
+ NUM_LOGITS,
+ NUM_HIDDEN_LAYERS,
+ {
+ NUM_LAYER_0_UNITS,
+ },
+ {
+ av1_max_part_pred_layer_0_kernel,
+ av1_max_part_pred_logits_kernel,
+ },
+ {
+ av1_max_part_pred_layer_0_bias,
+ av1_max_part_pred_logits_bias,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS
+#undef NUM_FEATURES
+#undef NUM_LAYER_0_UNITS
+#undef NUM_LOGITS
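+
+// A hypothetical end-to-end use of the max-part tables above, chaining the
+// forward-pass and softmax sketches defined earlier in this file. The buffer
+// sizes mirror the 13-feature / 48-unit / 4-logit shape of
+// av1_max_part_pred_nn_config; returning the argmax index as the predicted
+// level is an assumption (the softmax step only makes the scores readable as
+// probabilities and does not change the argmax).
+static inline int hypothetical_max_part_level(const float *features) {
+  float hidden[48], logits[4], probs[4];
+  hypothetical_nn_forward_1layer(
+      features, 13, av1_max_part_pred_layer_0_kernel,
+      av1_max_part_pred_layer_0_bias, 48, av1_max_part_pred_logits_kernel,
+      av1_max_part_pred_logits_bias, 4, hidden, logits);
+  hypothetical_prune_part_softmax(logits, probs, 4);
+  int best = 0;
+  for (int i = 1; i < 4; ++i)
+    if (probs[i] > probs[best]) best = i;
+  return best;
+}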
+
+// Early termination in the second pass
+static const float av1_simple_motion_search_term_none_mean_128[28] = {
+ 12.661922f, 12.638062f, 10.896497f, 10.865719f, 10.978963f, 10.940105f,
+ 11.012235f, 10.972760f, 11.069924f, 11.018533f, 11.773865f, 11.747426f,
+ 11.891315f, 11.858107f, 11.793916f, 11.766356f, 11.874997f, 11.840164f,
+ 5.940535f, 0.770746f, 4.292692f, 4.309581f, 0.848423f, 4.292334f,
+ 4.298179f, 8.514713f, 14.911736f, 19.825352f,
+};
+
+static const float av1_simple_motion_search_term_none_std_128[28] = {
+ 1.796731f, 1.797056f, 1.898383f, 1.900753f, 1.846624f, 1.846953f, 1.906632f,
+ 1.908089f, 1.836533f, 1.835967f, 1.840262f, 1.840671f, 1.816836f, 1.817103f,
+ 1.879846f, 1.881333f, 1.803102f, 1.802654f, 2.263402f, 0.420354f, 1.117165f,
+ 1.083779f, 0.358611f, 1.101183f, 1.084938f, 2.462638f, 1.577009f, 1.574711f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_64[28] = {
+ 10.904455f, 10.853546f, 9.247903f, 9.184479f, 9.251985f, 9.186686f,
+ 9.253490f, 9.190190f, 9.270079f, 9.204357f, 10.086511f, 10.031060f,
+ 10.100875f, 10.045429f, 10.069688f, 10.013173f, 10.082980f, 10.024640f,
+ 4.888378f, 0.878113f, 3.598450f, 3.628491f, 0.925833f, 3.560971f,
+ 3.573322f, 8.807137f, 13.348477f, 18.269117f,
+};
+
+static const float av1_simple_motion_search_term_none_std_64[28] = {
+ 1.789300f, 1.787061f, 1.823519f, 1.820226f, 1.794643f, 1.788620f, 1.797194f,
+ 1.795135f, 1.777795f, 1.773634f, 1.794000f, 1.790377f, 1.772197f, 1.769692f,
+ 1.819050f, 1.817139f, 1.793577f, 1.789333f, 1.998251f, 0.327156f, 0.885748f,
+ 0.853767f, 0.262043f, 0.902435f, 0.860033f, 1.224865f, 1.603411f, 1.589296f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_32[28] = {
+ 9.818970f, 9.751199f, 8.015079f, 7.927318f, 8.029113f, 7.938330f, 8.012570f,
+ 7.923719f, 8.033508f, 7.941911f, 8.933057f, 8.857422f, 8.935639f, 8.859187f,
+ 8.905495f, 8.829741f, 8.929428f, 8.851351f, 4.114069f, 0.954752f, 2.645082f,
+ 2.709703f, 0.964678f, 2.652077f, 2.673393f, 9.430499f, 11.922798f, 16.942251f,
+};
+
+static const float av1_simple_motion_search_term_none_std_32[28] = {
+ 1.737107f, 1.734327f, 1.727923f, 1.720244f, 1.721570f, 1.712775f, 1.718028f,
+ 1.710370f, 1.711612f, 1.702596f, 1.754856f, 1.748855f, 1.741871f, 1.736304f,
+ 1.722428f, 1.717380f, 1.713563f, 1.707582f, 1.761170f, 0.207847f, 0.900058f,
+ 0.862356f, 0.184593f, 0.903822f, 0.856120f, 1.529199f, 1.412085f, 1.453153f,
+};
+
+static const float av1_simple_motion_search_term_none_mean_16[28] = {
+ 8.998877f, 8.912468f, 7.085255f, 6.953476f, 7.086386f, 6.954091f, 7.088727f,
+ 6.955747f, 7.093955f, 6.960635f, 8.065050f, 7.961432f, 8.071631f, 7.967233f,
+ 8.041699f, 7.937715f, 8.046791f, 7.942183f, 3.833521f, 0.978421f, 1.901347f,
+ 1.950124f, 0.979418f, 1.928000f, 1.936727f, 9.773951f, 10.735227f, 15.949769f,
+};
+
+static const float av1_simple_motion_search_term_none_std_16[28] = {
+ 1.641193f, 1.640172f, 1.614794f, 1.608906f, 1.609571f, 1.603580f, 1.606928f,
+ 1.601246f, 1.599230f, 1.593529f, 1.633747f, 1.630219f, 1.625695f, 1.622547f,
+ 1.633827f, 1.630182f, 1.626607f, 1.622777f, 1.548838f, 0.145303f, 0.744550f,
+ 0.736552f, 0.141980f, 0.742979f, 0.736977f, 1.366255f, 1.258794f, 1.294309f,
+};
+
+static const float av1_simple_motion_search_term_none_model_128[] = {
+ -0.6106842357f, -1.0402954455f, 0.6054417656f, -0.2116623578f,
+ 0.2447714930f, 0.3782256209f, 0.5095592479f, -0.3275620904f,
+ 0.3886188013f, 0.2629499420f, -0.1979599415f, -0.5389565605f,
+ 0.1209207902f, -0.4913347466f, 0.3798542731f, -0.2812861709f,
+ -0.1049824167f, -0.1088672020f, 0.4059596517f, -0.1347896613f,
+ 0.2276868621f, 0.0506386970f, 0.0071088411f, 0.0467952100f,
+ 0.2091247458f, -0.7371964736f, 0.1368935545f, 0.3175247786f,
+ -0.5493146094f,
+};
+
+static const float av1_simple_motion_search_term_none_model_64[] = {
+ -0.4150046575f, -0.3954358561f, 0.1997997444f, 0.3395826831f,
+ 0.2827215753f, 0.3395683652f, 0.2483140395f, 0.2722216476f,
+ 0.2610308009f, 0.3724974359f, -0.0551479654f, -0.1721616359f,
+ -0.3459358629f, -0.0952524186f, -0.1428993840f, -0.0415654914f,
+ -0.3169539902f, -0.0269429900f, 0.9891530919f, -0.0125084982f,
+ 0.0972182377f, 0.0008889801f, 0.0205418050f, 0.0057237854f,
+ 0.1005222691f, -0.2851321920f, -1.5150336445f, 0.1893942436f,
+ -0.4337360901f,
+};
+
+static const float av1_simple_motion_search_term_none_model_32[] = {
+ -0.4667392852f, -0.3893302767f, 0.1603498635f, 0.2304974726f,
+ 0.1404975592f, 0.2505516225f, 0.1423053884f, 0.2189318406f,
+ 0.1379765409f, 0.2638241296f, -0.1342865463f, -0.0549054345f,
+ -0.1925223436f, -0.1142702769f, 0.0127811659f, 0.0868639997f,
+ -0.0643197251f, 0.0279496470f, 0.9904395769f, -0.0095178685f,
+ 0.1179410649f, -0.0013411972f, 0.0095060660f, 0.0195730400f,
+ 0.0779717771f, -0.2498860763f, -0.8168817125f, -0.4798397348f,
+ -0.6609679881f,
+};
+
+static const float av1_simple_motion_search_term_none_model_16[] = {
+ -0.3021081992f, -0.4620153673f, 0.0448577479f, 0.1738455035f,
+ 0.0663209177f, 0.1629614573f, 0.0555168744f, 0.1631870212f,
+ 0.0425805150f, 0.1688564954f, 0.0434083772f, -0.0046603915f,
+ -0.0271580056f, -0.0183879127f, 0.1073730471f, 0.0314201476f,
+ 0.0576891756f, 0.0119723753f, 0.9084332022f, -0.0188429077f,
+ 0.0755089811f, -0.0172550234f, 0.0037663075f, 0.0022094472f,
+ 0.0500247894f, -0.2944572004f, -0.8908521199f, -0.2555515792f,
+ -0.5396254205f,
+};
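+
+// The *_mean_*/*_std_* tables above pair with the 29-entry *_model_* arrays:
+// a plausible reading is that each of the 28 features is z-score normalized
+// before a dot product with the model weights, with the final model entry
+// acting as a bias. A minimal sketch under that assumption (the name and the
+// caller-side thresholding are hypothetical):
+static inline float hypothetical_term_none_score(const float *features,
+                                                 const float *mean,
+                                                 const float *std,
+                                                 const float *model,
+                                                 int num_features) {
+  float score = model[num_features];  // trailing entry acts as the bias
+  for (int i = 0; i < num_features; ++i)
+    score += model[i] * (features[i] - mean[i]) / std[i];
+  return score;  // caller would compare this against a tuned threshold
+}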
+
+// Early termination in the first pass
+static const float av1_fp_simple_motion_search_term_none_mean_32[20] = {
+ 10.216787f, 10.167575f, 8.405353f, 8.340786f, 8.436503f,
+ 8.373259f, 8.444113f, 8.379074f, 8.448215f, 8.384669f,
+ 4.107491f, 0.923902f, 2.702687f, 2.712742f, 0.953166f,
+ 2.703244f, 2.707070f, 9.549801f, 12.013671f, 17.059454f,
+};
+
+static const float av1_fp_simple_motion_search_term_none_std_32[20] = {
+ 1.886182f, 1.886638f, 1.884324f, 1.883410f, 1.851800f, 1.851652f, 1.847129f,
+ 1.848014f, 1.832187f, 1.832360f, 1.758185f, 0.265155f, 0.939592f, 0.932395f,
+ 0.211284f, 0.950024f, 0.945295f, 1.846744f, 1.453674f, 1.505994f,
+};
+
+static const float av1_fp_simple_motion_search_term_none_mean_16[20] = {
+ 9.131485f, 9.065489f, 7.254479f, 7.158092f, 7.274240f, 7.178158f, 7.278780f,
+ 7.182110f, 7.278793f, 7.182714f, 3.981902f, 0.964040f, 2.080875f, 2.087185f,
+ 0.973397f, 2.088189f, 2.090166f, 9.386505f, 10.826546f, 15.985614f,
+};
+
+static const float av1_fp_simple_motion_search_term_none_std_16[20] = {
+ 1.681172f, 1.688587f, 1.710854f, 1.717533f, 1.684010f, 1.691476f, 1.683537f,
+ 1.691523f, 1.674699f, 1.682130f, 1.639731f, 0.186191f, 0.796448f, 0.795075f,
+ 0.160921f, 0.791005f, 0.790048f, 1.430960f, 1.337976f, 1.370498f,
+};
+
+static const float av1_fp_simple_motion_search_term_none_mean_8[20] = {
+ 7.821461f, 7.714526f, 5.799360f, 5.606948f, 5.805885f, 5.614357f, 5.794252f,
+ 5.599669f, 5.798780f, 5.605399f, 4.069016f, 0.977720f, 1.577513f, 1.581266f,
+ 0.983371f, 1.524603f, 1.524952f, 9.221803f, 9.508886f, 14.972815f,
+};
+
+static const float av1_fp_simple_motion_search_term_none_std_8[20] = {
+ 1.618036f, 1.634415f, 1.652861f, 1.672006f, 1.646337f, 1.664935f, 1.650876f,
+ 1.670476f, 1.645141f, 1.664301f, 1.502258f, 0.147592f, 0.760353f, 0.762547f,
+ 0.127879f, 0.741096f, 0.742186f, 1.042003f, 1.292524f, 1.250398f,
+};
+
+#define NUM_HIDDEN_LAYERS_32 1
+#define NUM_FEATURES_32 20
+#define NUM_LAYER_0_UNITS_32 20
+#define NUM_LOGITS_32 1
+
+static const float
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_32[] = {
+ -0.293987f, 0.796773f, -0.0888487f, -0.00796495f, -0.343768f,
+ 0.0783252f, 0.0596814f, -0.235432f, -0.0780005f, -0.409017f,
+ -0.256821f, -0.281654f, 1.00889f, 0.701893f, -0.0181661f,
+ 0.119718f, 0.0956582f, 0.76792f, 0.235693f, 0.351628f,
+ -1.28111f, -1.45847f, 0.387732f, 0.476054f, 0.384561f,
+ 0.427465f, 0.11875f, -0.0176598f, -0.0528453f, 0.395589f,
+ -0.331994f, 0.0442108f, 0.195171f, -0.0377402f, -0.0736457f,
+ -0.0490903f, 0.116165f, -0.549512f, 0.12968f, 0.641055f,
+ -1.03066f, -0.601979f, 0.351981f, -0.122019f, 0.00869275f,
+ 0.399222f, -0.343995f, -0.444257f, -0.160805f, -0.537537f,
+ 0.261478f, -0.163785f, 0.218916f, 0.106506f, -0.103819f,
+ 0.0121841f, 0.284757f, -0.362989f, 1.10793f, 0.477236f,
+ -0.424117f, -0.884156f, -0.468291f, -0.510531f, 0.791441f,
+ 0.75243f, 0.839871f, 0.604127f, -0.182956f, -0.246703f,
+ -1.25861f, 0.0546303f, 0.0811323f, 0.00655988f, 0.0286305f,
+ -0.00938366f, -0.0291418f, -0.231632f, -0.331077f, 1.12479f,
+ -0.635514f, -0.146066f, 0.853122f, 0.923699f, 0.180011f,
+ -0.252973f, 0.1474f, -0.454344f, 0.354736f, 0.576872f,
+ -1.43275f, 0.0327868f, 0.140849f, -0.102523f, 0.0524867f,
+ 0.007091f, -0.00232578f, -0.536116f, -0.700144f, 0.166646f,
+ 0.0636548f, 0.44645f, -0.346062f, -0.685779f, -1.0792f,
+ -0.999219f, 0.442744f, 0.371198f, 0.777914f, 0.719409f,
+ -0.417984f, 0.0602868f, 0.0225539f, 0.0457407f, 0.0249501f,
+ 0.0126021f, 0.00450792f, 0.0485095f, 0.203485f, 0.584116f,
+ -0.599426f, -0.244633f, 0.168231f, -0.00134934f, -0.106987f,
+ -0.0490239f, -0.22029f, 0.138017f, 0.373674f, 0.00638684f,
+ -2.08003f, 0.106453f, 0.124456f, -0.0286108f, 0.0422698f,
+ 0.013734f, 0.0780971f, -0.40173f, 0.473453f, 1.16836f,
+ -0.251035f, 0.0119074f, 0.319241f, 0.0422023f, -0.730454f,
+ -0.745948f, 0.796709f, 0.277634f, 0.09711f, -0.212224f,
+ 0.825348f, 0.0208521f, -0.0238098f, 0.00929265f, 0.0516351f,
+ -0.02329f, 0.0983163f, -0.180721f, 0.0122096f, -0.246159f,
+ 0.61468f, 0.923765f, 0.240435f, -0.294845f, -0.495317f,
+ -0.0563837f, -0.417936f, 0.154874f, -0.604407f, -0.0681337f,
+ -0.65738f, -0.0270073f, 0.0920023f, -0.0742724f, 0.820862f,
+ -0.602758f, -1.20617f, -0.201707f, 0.869499f, -0.0539076f,
+ 0.403097f, 0.429168f, -0.938227f, -0.830894f, -0.362462f,
+ -0.0658648f, 0.471469f, -0.264827f, 0.610275f, 0.367995f,
+ 0.735662f, -0.0473157f, -0.0380545f, -0.0848067f, -0.146108f,
+ -0.125875f, -0.0576117f, -0.296198f, -0.100443f, -0.212971f,
+ 0.593524f, 1.23111f, -0.810009f, -0.604572f, 0.203021f,
+ 0.256285f, -1.17049f, -1.19156f, 0.24365f, 0.727876f,
+ -0.466826f, 0.0298762f, -0.0331735f, -0.0109056f, 0.0114862f,
+ 0.00396703f, 0.0385985f, -0.0587946f, 0.821079f, 0.0582033f,
+ 0.349156f, 1.03529f, -0.407036f, 0.200308f, -0.265649f,
+ -0.104567f, 0.161149f, -0.0717528f, -0.0112724f, 0.0681578f,
+ 0.103809f, -0.0807997f, 0.0316814f, -0.332323f, 0.112254f,
+ -0.163981f, 0.118988f, -0.777055f, -1.34047f, -0.910482f,
+ 0.74599f, -0.59633f, 0.165649f, -0.594998f, 0.0845802f,
+ 0.00440975f, 0.122606f, -0.463991f, 0.418502f, -0.339126f,
+ 1.41847f, -0.109594f, -0.411879f, -0.444865f, -0.0404821f,
+ -0.0607352f, -0.663753f, -0.724327f, -0.138642f, 0.834144f,
+ -0.811695f, -0.930264f, 0.150993f, -0.325565f, 0.0615853f,
+ -0.473993f, 0.0966587f, 0.315197f, 1.0345f, 0.35441f,
+ 0.703234f, -0.335715f, 0.783153f, 0.467976f, -0.0234736f,
+ 0.549724f, 0.539107f, -0.510182f, -0.154442f, 0.0126656f,
+ 1.66711f, 0.884555f, 0.118675f, -0.341705f, 0.195316f,
+ -0.0366564f, -0.619244f, -0.634092f, -0.559951f, 0.0564255f,
+ 0.765917f, 0.0510238f, 0.0667615f, 0.0699302f, -0.0351751f,
+ -0.0484402f, -0.000792665f, -0.10775f, -0.337121f, -0.983947f,
+ 0.517793f, 1.34977f, -0.567602f, 0.129921f, -0.443722f,
+ -0.276277f, -0.501404f, -0.183234f, -0.553055f, -0.447434f,
+ -0.35529f, -0.0444689f, 0.0192031f, 0.0372702f, -0.195202f,
+ -0.020753f, -0.0247035f, 0.420298f, 1.39373f, 0.203699f,
+ -0.218818f, 0.250734f, -0.0282348f, 0.411986f, -0.262946f,
+ 0.526339f, 0.242769f, -0.159857f, -0.546788f, -0.0410147f,
+ 0.954238f, -0.0252765f, 0.639488f, -0.491367f, -0.0572638f,
+ 0.285763f, -0.45764f, 0.121657f, -1.24374f, -0.372479f,
+ -0.111521f, 0.194134f, -0.271364f, 0.179678f, 0.121237f,
+ -0.14305f, -0.205662f, 0.216891f, 0.344568f, -0.523745f,
+ -1.00908f, 0.180965f, 0.0263031f, -0.0556144f, 0.0831083f,
+ -0.0623274f, 0.112748f, 0.597137f, -0.502616f, -1.10624f,
+ -0.0487462f, -1.10744f, -0.125653f, 0.277049f, -0.141329f,
+ -0.00457003f, -0.161038f, 0.588462f, 0.323317f, 0.49762f,
+ 0.477561f, 0.901705f, -0.264511f, 0.256557f, 0.076023f,
+ -0.0460696f, 0.0830666f, -0.0651269f, -0.881245f, -0.285999f,
+ 0.53127f, 0.914533f, 0.0505795f, -0.3054f, -0.0988696f,
+ -0.0658403f, 0.15979f, -0.453316f, -0.824834f, -0.280222f,
+ -0.686952f, -0.0768344f, -1.12235f, -0.815408f, 0.0202134f,
+ -0.111892f, 0.0847659f, -0.18763f, 0.597782f, 0.364016f
+ };
+
+static const float
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_32[] = {
+ -1.541f, -0.00935641f, -1.50754f, -0.638648f, -0.679403f,
+ -0.0387804f, -0.714791f, -1.69522f, 0.435677f, -1.5846f,
+ 0.108788f, 0.614982f, 0.111048f, -0.465826f, -0.611358f,
+ 0.637197f, 0.929621f, -1.20889f, 0.954558f, 0.716529f
+ };
+
+static const float av1_fp_simple_motion_search_term_none_logits_kernel_32[] = {
+ 0.396195f, -0.791364f, -0.881893f, 1.0542069f, 0.772562f,
+ 0.60815647f, 1.117405f, -1.272638f, 0.483183f, -0.917147f,
+ 0.690799f, -0.601466f, -0.545536f, -0.416353f, -0.927874f,
+ 0.972198f, -0.3770457f, 0.542694f, -0.591889f, 0.464565f
+};
+
+static const float av1_fp_simple_motion_search_term_none_logits_bias_32[] = {
+ -0.590318f
+};
+
+static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_32 = {
+ NUM_FEATURES_32,
+ NUM_LOGITS_32,
+ NUM_HIDDEN_LAYERS_32,
+ {
+ NUM_LAYER_0_UNITS_32,
+ },
+ {
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_32,
+ av1_fp_simple_motion_search_term_none_logits_kernel_32,
+ },
+ {
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_32,
+ av1_fp_simple_motion_search_term_none_logits_bias_32,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_32
+#undef NUM_FEATURES_32
+#undef NUM_LAYER_0_UNITS_32
+#undef NUM_LOGITS_32
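+
+// A hypothetical use of the firstpass 32x32 tables just defined: z-score
+// normalize the 20 raw features with the mean/std arrays above, then run the
+// one-hidden-layer network (20 hidden units, 1 logit) through the
+// forward-pass sketch defined earlier. The name and the overall wiring are
+// assumptions for illustration, not the encoder's actual call sequence.
+static inline float hypothetical_fp_term_none_logit_32(const float *raw) {
+  float norm[20], hidden[20], logit;
+  for (int i = 0; i < 20; ++i)
+    norm[i] = (raw[i] - av1_fp_simple_motion_search_term_none_mean_32[i]) /
+              av1_fp_simple_motion_search_term_none_std_32[i];
+  hypothetical_nn_forward_1layer(
+      norm, 20,
+      av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_32,
+      av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_32, 20,
+      av1_fp_simple_motion_search_term_none_logits_kernel_32,
+      av1_fp_simple_motion_search_term_none_logits_bias_32, 1, hidden, &logit);
+  return logit;
+}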
+
+#define NUM_HIDDEN_LAYERS_16 1
+#define NUM_FEATURES_16 20
+#define NUM_LAYER_0_UNITS_16 24
+#define NUM_LOGITS_16 1
+
+static const float
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_16[] = {
+ -0.315922f, 0.74455f, -0.0196939f, 0.238336f, 0.288554f,
+ 0.0845902f, -0.0121831f, 0.455303f, 0.0235902f, 0.218997f,
+ -0.0445164f, 0.0752211f, 0.0539915f, -0.0439682f, -0.397139f,
+ -0.0030004f, -0.106365f, 0.845384f, 0.684638f, -0.965702f,
+ 0.307643f, -0.0433377f, -0.0644826f, -0.214946f, -0.44467f,
+ 0.142967f, 0.0109982f, -0.344458f, -0.42947f, 0.269175f,
+ -0.88534f, -0.28077f, -1.36018f, -0.33725f, -0.0885953f,
+ -0.123887f, 0.218107f, -0.0759977f, 0.739124f, 0.684048f,
+ 0.577964f, -0.328481f, -0.247837f, 0.00546713f, 0.191895f,
+ -0.145274f, 0.320121f, -0.482379f, 0.534585f, -0.1582f,
+ 0.944784f, 0.944665f, 0.0494451f, -0.0399724f, -0.170375f,
+ -0.0869746f, 0.106216f, -0.120556f, -1.57849f, -0.752895f,
+ 0.424454f, -0.0269515f, 0.00398589f, 0.214165f, -0.142986f,
+ 0.199223f, 0.049624f, -0.116783f, -0.648119f, -0.311599f,
+ 0.122629f, -0.0338422f, 0.345092f, -0.408254f, 0.601037f,
+ -0.00146985f, 0.00133926f, 0.0392668f, -0.931156f, 0.31429f,
+ -0.150243f, 0.0755763f, -0.32177f, 0.258521f, -0.104078f,
+ -0.144506f, 0.0199566f, -0.454723f, -0.292959f, -0.0953681f,
+ -1.24843f, 0.446814f, -0.311363f, 0.0590878f, -0.0568717f,
+ -0.421585f, 0.179852f, 0.668763f, 0.48914f, 0.290584f,
+ -1.14053f, -1.37576f, 0.420112f, -0.158582f, 0.268231f,
+ 0.252999f, 0.276423f, 0.529033f, 0.141127f, 0.702762f,
+ 0.181407f, -0.0279289f, -0.0194757f, 0.0752152f, -0.136963f,
+ 0.00902489f, 0.125334f, 0.0680212f, -0.370449f, 0.438003f,
+ -0.600869f, 0.154209f, -0.36306f, -0.484209f, 0.140093f,
+ 0.0743079f, -0.143317f, 0.0442872f, 0.272089f, 0.601531f,
+ 1.20687f, -0.280695f, 0.222235f, -0.0106747f, -0.017026f,
+ 0.204008f, -0.0316111f, -0.64679f, -0.866749f, -0.774231f,
+ 0.306231f, -0.0940114f, -0.56555f, -0.34399f, 0.425142f,
+ 0.424064f, -0.50189f, -0.146558f, 0.544899f, 0.141728f,
+ 1.14592f, -0.0124826f, 0.111613f, -0.0862228f, 0.0211737f,
+ 0.0614017f, 0.0245077f, -0.454523f, -0.0766391f, -0.436808f,
+ 0.251409f, -0.13354f, -0.242447f, -0.311807f, -0.844505f,
+ -0.671486f, 0.0946297f, 0.241702f, 0.856521f, 0.529763f,
+ -0.869772f, -0.0016341f, 0.14511f, 0.0136254f, -0.0359721f,
+ -0.0454713f, 0.00664495f, 0.0373555f, 0.653991f, -0.075867f,
+ -0.102728f, -0.947685f, -0.119479f, -0.145413f, 0.148364f,
+ 0.310885f, -0.266837f, 0.354087f, 0.299469f, 0.603911f,
+ 0.257161f, 0.0190527f, 0.152862f, -0.0987196f, -0.293369f,
+ 0.139026f, -0.128421f, 0.0505933f, -0.703803f, 1.08628f,
+ -0.562294f, -0.818943f, 0.102178f, 0.727399f, -0.228433f,
+ 0.484057f, 0.0595919f, -0.0559087f, -0.549447f, 0.176168f,
+ 1.41744f, -0.126284f, 0.0987251f, -0.00123073f, 0.00510827f,
+ 0.105209f, 0.0671775f, -0.438525f, 0.211028f, -0.782459f,
+ 0.286411f, -0.459887f, 0.0633669f, 0.329958f, -0.0736945f,
+ 0.45188f, -0.2447f, 0.676601f, 0.600321f, -0.0336198f,
+ 0.108531f, 0.0452834f, -0.0848577f, 0.0731281f, 1.32381f,
+ -0.118349f, 0.129497f, -0.840938f, -1.45444f, -0.559047f,
+ -0.248109f, -0.491559f, -0.139812f, 0.175964f, 0.168687f,
+ 0.123031f, 0.201625f, 0.422849f, 0.34436f, 0.0426694f,
+ 0.558045f, -0.246772f, 0.679483f, -0.0959578f, -0.102879f,
+ 0.391029f, 0.280906f, 0.0867408f, -1.10932f, 0.402526f,
+ -0.227285f, 0.336087f, -0.237765f, 0.185619f, -0.309732f,
+ 0.0781132f, -0.0234955f, 0.0828806f, 0.19966f, -0.241288f,
+ -0.224634f, 0.0638918f, -0.143521f, -0.0206692f, -0.27131f,
+ 0.973051f, 1.12031f, 0.262846f, 0.471585f, 0.105231f,
+ -0.386434f, -0.355846f, 0.7359f, 0.567308f, 0.130768f,
+ 0.242369f, -0.0272523f, -0.118436f, 0.374145f, 0.24802f,
+ -1.00186f, -0.0241195f, 0.0140446f, 0.0202831f, 0.163197f,
+ 0.0399298f, -0.00912791f, -0.280572f, -0.309893f, -0.644495f,
+ 0.243838f, 0.731391f, 0.0725078f, 0.350308f, -0.136691f,
+ 0.208814f, 0.0218567f, -0.0805393f, -0.18681f, -0.214638f,
+ 0.273354f, -0.355047f, 0.242748f, 0.472951f, -0.202705f,
+ 0.405247f, 0.161622f, -0.284883f, -1.31181f, -0.661056f,
+ -0.248219f, -0.827307f, 0.289221f, 0.660529f, 0.48563f,
+ 0.407366f, 0.0327303f, -0.0610309f, -0.647064f, 0.0899991f,
+ 0.376267f, 1.27555f, 0.0264175f, 0.153931f, 1.07345f,
+ 0.0715052f, 0.174473f, 0.01322f, -0.715723f, 0.113909f,
+ 0.100968f, -0.457287f, -0.672022f, -0.20532f, 0.895176f,
+ 0.357034f, 0.5413f, 0.918393f, -0.455f, -0.499617f,
+ -1.21799f, 0.0634338f, 0.144944f, -0.106715f, 0.0227713f,
+ -0.0203213f, 0.030851f, -0.0726756f, 0.589192f, -0.060841f,
+ -0.198521f, 0.497179f, -0.0591156f, -0.135466f, -0.132638f,
+ -0.181333f, -0.332358f, 0.0349959f, 0.212885f, -0.536206f,
+ -0.425009f, -0.035525f, 0.0384449f, 0.0360549f, -0.0383953f,
+ -0.0263281f, -0.0228435f, 1.11771f, 0.928061f, -0.163923f,
+ -0.327868f, -0.894518f, 0.00448907f, 0.0805977f, 0.329559f,
+ 0.157429f, 0.292729f, 0.497688f, 0.188659f, 0.203724f,
+ -1.26001f, -0.0392533f, -0.0566088f, 0.000859925f, 0.125254f,
+ 0.054261f, 0.0357295f, -0.393813f, -0.275944f, 0.299657f,
+ -0.211421f, 0.038172f, -0.439829f, -0.913949f, 0.35642f,
+ 0.865473f, -0.472033f, -0.752376f, 0.995255f, 0.417965f,
+ -0.680645f, 0.0622027f, 0.128878f, -0.0357859f, 0.0793577f,
+ 0.203629f, -0.0600867f, 0.0512268f, 0.528584f, 0.23889f,
+ 0.38255f, -0.216407f, -0.0338828f, 0.0328103f, -0.885678f,
+ -0.716634f, 0.438663f, 0.320841f, -0.119656f, 0.626092f,
+ 0.8526f, -0.0325005f, -0.0275416f, -0.171131f, 0.0260563f,
+ -0.0162027f, 0.0879367f, -0.340473f, 0.0220265f, -0.1731f,
+ 0.512539f, 0.587822f, -0.175619f, 0.177215f, -0.35458f,
+ -0.159059f, -0.423754f, 0.0198413f, -0.336208f, -0.359052f,
+ -1.50819f, 0.0628184f, 0.054506f, 0.0048834f, 0.361657f,
+ 0.00986886f, -0.0721521f, -0.256765f, 1.41173f, 0.376196f,
+ -0.0783331f, 0.174803f, -0.00240091f, -0.306571f, -0.304654f,
+ -0.0348377f, 0.115569f, -0.20359f, -0.162341f, -0.0443526f,
+ -0.848317f, -0.228167f, 0.699534f, 0.482092f, -0.0921484f,
+ -0.172425f, -0.0610094f, -0.188327f, 0.836209f, 0.541725f
+ };
+
+static const float
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_16[] = {
+ -0.388147f, -0.0868767f, 0.702129f, 0.376659f, -0.709988f, 0.496603f,
+ -0.238442f, -1.35761f, -0.391887f, 0.235468f, -0.327982f, 0.731842f,
+ 1.0949f, -0.789218f, -0.881452f, 0.514341f, 0.727894f, -0.494498f,
+ -1.32304f, -1.22643f, -0.294287f, -1.3974f, -0.128148f, -0.0956137f
+ };
+
+static const float av1_fp_simple_motion_search_term_none_logits_kernel_16[] = {
+ 0.456147f, 0.248707f, -0.5205241f, -0.1506567f, 0.388359f, -0.6074409f,
+ -0.4719775f, -0.733864f, 0.5588447f, -0.4021345f, -1.140733f, -0.73399f,
+ -0.4299591f, 0.450688f, 0.817564f, -0.265486f, -0.3525806f, 0.55188314f,
+ 1.365457f, 1.180764f, 0.587772f, -0.870683f, 0.818839f, 0.318488f
+};
+
+static const float av1_fp_simple_motion_search_term_none_logits_bias_16[] = {
+ -0.1046478f
+};
+
+static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_16 = {
+ NUM_FEATURES_16,
+ NUM_LOGITS_16,
+ NUM_HIDDEN_LAYERS_16,
+ {
+ NUM_LAYER_0_UNITS_16,
+ },
+ {
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_16,
+ av1_fp_simple_motion_search_term_none_logits_kernel_16,
+ },
+ {
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_16,
+ av1_fp_simple_motion_search_term_none_logits_bias_16,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS_16
+#undef NUM_FEATURES_16
+#undef NUM_LAYER_0_UNITS_16
+#undef NUM_LOGITS_16
+
+#define NUM_HIDDEN_LAYERS_8 1
+#define NUM_FEATURES_8 20
+#define NUM_LAYER_0_UNITS_8 16
+#define NUM_LOGITS_8 1
+
+static const float
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_8[] = {
+ -1.11024f, -0.530449f, -0.164768f, 0.675431f, 0.456155f,
+ 0.711099f, -0.248095f, 0.112132f, -0.131481f, 0.234457f,
+ 0.128073f, 0.306214f, 0.175471f, 0.220189f, -0.270533f,
+ 0.293534f, -0.0795547f, 0.234901f, -0.191754f, 0.101171f,
+ -0.108621f, 0.395477f, -0.529459f, -0.354854f, -0.941334f,
+ -0.237689f, 0.39357f, 0.527129f, 0.174333f, -0.00520422f,
+ 1.22219f, -0.21815f, 0.0866816f, -0.29591f, -0.212968f,
+ 0.00431436f, -0.295382f, -0.582317f, -0.284654f, 0.486427f,
+ -0.202448f, -0.0421883f, -0.116346f, -0.345832f, -0.0471637f,
+ -0.149954f, -0.0969526f, -0.59491f, 0.594364f, 0.298285f,
+ -1.33301f, 0.149562f, 0.097433f, 0.157641f, -0.231132f,
+ -0.0191656f, 0.149396f, 0.811553f, 1.07336f, 0.140674f,
+ 1.02134f, 0.455909f, -0.0548795f, 0.0459996f, -0.0589837f,
+ -0.116328f, -0.607502f, -0.232595f, -0.517977f, -0.325901f,
+ 1.35047f, -0.148698f, 0.0313182f, 0.181634f, 0.06539f,
+ 0.00820322f, 0.0522113f, -1.06071f, -0.817999f, -0.527422f,
+ -1.39175f, -0.110088f, 0.0858626f, -0.247541f, 0.29043f,
+ 1.13767f, 0.185834f, 0.390613f, -0.501175f, -0.214176f,
+ -0.256376f, 0.496687f, 0.240471f, 0.218852f, 0.513543f,
+ 0.400559f, -0.249168f, -0.752987f, 0.430491f, -0.72299f,
+ 0.339754f, 0.396623f, -0.0638322f, 0.353122f, 0.355662f,
+ -0.0704821f, 0.195448f, 0.179396f, 0.486533f, 0.0815535f,
+ -0.503726f, -0.000321223f, 0.501591f, -0.117849f, 0.217667f,
+ -0.123391f, -0.4026f, 0.149756f, -0.0359276f, -0.0990213f,
+ -0.215278f, -0.293649f, 0.301629f, -0.11081f, -0.206725f,
+ -0.00147108f, 0.363644f, -0.430092f, 0.169524f, 0.116091f,
+ -0.583605f, -0.0974948f, 0.253256f, 0.22648f, 0.136902f,
+ -0.882541f, -0.75078f, -0.0629343f, 0.411035f, 0.265742f,
+ -0.360904f, -0.899324f, 0.605871f, 0.0318372f, 0.0735312f,
+ -0.00960722f, 0.691249f, 0.127449f, -0.133021f, -0.0793589f,
+ 0.665591f, -0.0682262f, -0.0437626f, 0.0783621f, 2.25727f,
+ 0.126529f, -0.0320763f, -0.261759f, -1.19987f, 0.216295f,
+ -0.253886f, -0.642908f, 0.1865f, 0.00299179f, 0.0246782f,
+ -0.00750628f, 0.566367f, 0.99916f, -0.0209625f, 0.273254f,
+ 1.09724f, 0.30026f, 0.21585f, -0.0276715f, 0.338996f,
+ 0.129884f, -0.00628438f, 0.0461783f, -1.36378f, -0.394756f,
+ -0.395261f, 0.215928f, 0.252803f, -0.207108f, -0.0506214f,
+ -0.0138889f, 0.124197f, -0.0522996f, 0.533803f, -0.25729f,
+ -0.463514f, 0.128322f, -1.04751f, -0.605498f, -0.107235f,
+ -0.00813289f, 0.539742f, -0.0524178f, 0.272101f, 0.151935f,
+ 0.607511f, -0.0608427f, 0.36342f, 0.0999134f, 0.69712f,
+ -0.152471f, 0.364244f, 0.410644f, 0.312606f, 0.405679f,
+ -0.371656f, -0.0492209f, -0.148911f, 0.214996f, -0.274749f,
+ -0.0372888f, 0.079023f, -0.429136f, -1.30393f, -0.833824f,
+ -1.31373f, -0.445343f, 0.526917f, 1.30569f, -0.0626746f,
+ 0.282353f, -0.28552f, 0.28084f, -0.234934f, 0.227076f,
+ 1.09919f, 0.33248f, -0.114933f, 0.40629f, 0.331031f,
+ 0.245334f, -0.0318782f, 0.00735305f, -1.58715f, 0.126443f,
+ -0.09472f, -0.182152f, 0.311673f, -0.186136f, 0.817743f,
+ 0.928961f, 0.117334f, -0.373644f, -0.0797864f, 0.205565f,
+ 0.0789797f, 0.0757131f, -0.152409f, 0.30301f, -0.0170824f,
+ -0.194496f, 0.485547f, 0.370124f, -0.802044f, -0.789671f,
+ 0.669258f, 0.55082f, -0.438853f, 0.0597597f, -0.0148101f,
+ -0.41603f, 0.0486339f, -0.464523f, -0.413725f, 0.00907629f,
+ 0.70351f, -0.136422f, -0.145957f, -0.0626726f, -0.115773f,
+ -0.333937f, 0.135474f, -0.379598f, -0.134422f, 0.227595f,
+ 0.908927f, 0.759504f, -0.0088258f, -0.349333f, 0.122667f,
+ -0.682175f, 0.2201f, -0.332003f, -0.44433f, -0.620308f,
+ -1.36716f, -0.0167907f, -0.538969f, 0.256824f, -0.0706724f,
+ -0.0392471f, -0.156312f, 0.153699f, 1.41967f, 0.0434739f,
+ 0.428178f, -0.0714879f, 0.0912104f, 0.00687985f, 0.341789f,
+ 0.217381f, 0.128288f, 0.0286751f, 0.527344f, -0.428139f,
+ 0.60908f, 1.02074f, -0.0977894f, 0.158067f, 0.28958f,
+ -0.065152f, 0.120616f, -0.882976f, -1.10413f, -1.37497f
+ };
+
+static const float
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_8[] = {
+ 1.37086f, -1.61858f, -1.32395f, 0.276031f, -0.124696f, -1.71489f,
+ -1.68429f, 1.79103f, -0.335306f, -1.81523f, 0.841083f, -0.542628f,
+ -1.82168f, 0.459829f, 0.0949306f, 0.918486f
+ };
+
+static const float av1_fp_simple_motion_search_term_none_logits_kernel_8[] = {
+ -0.283418f, -0.444453f, 0.4977782f, -0.4138758f, 0.41890771f, 0.22149438f,
+ 0.545079f, -0.729164f, 0.619389f, 0.5169534f, -0.4236282f, 0.7304213f,
+ 0.531938f, -0.14828f, 0.75119f, -0.464074f
+};
+
+static const float av1_fp_simple_motion_search_term_none_logits_bias_8[] = {
+ -2.22338f
+};
+
+static const NN_CONFIG av1_fp_simple_motion_search_term_none_nn_config_8 = {
+ NUM_FEATURES_8,
+ NUM_LOGITS_8,
+ NUM_HIDDEN_LAYERS_8,
+ {
+ NUM_LAYER_0_UNITS_8,
+ },
+ {
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_kernel_8,
+ av1_fp_simple_motion_search_term_none_logits_kernel_8,
+ },
+ {
+ av1_fp_simple_motion_search_term_none_hiddenlayer_0_bias_8,
+ av1_fp_simple_motion_search_term_none_logits_bias_8,
+ },
+};
#undef NUM_HIDDEN_LAYERS_8
#undef NUM_FEATURES_8
#undef NUM_LAYER_0_UNITS_8
#undef NUM_LOGITS_8
-#endif
+
+static const float av1_fp_simple_motion_search_term_none_thresh_32 =
+ -2.2884985045792563f;
+static const float av1_fp_simple_motion_search_term_none_thresh_16 =
+ -1.6656874577527165f;
+static const float av1_fp_simple_motion_search_term_none_thresh_8 =
+ -3.608804354309157f;
#ifdef __cplusplus
} // extern "C"
diff --git a/libaom/av1/encoder/partition_strategy.c b/libaom/av1/encoder/partition_strategy.c
new file mode 100644
index 0000000..e8270b3
--- /dev/null
+++ b/libaom/av1/encoder/partition_strategy.c
@@ -0,0 +1,727 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <float.h>
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/partition_model_weights.h"
+#include "av1/encoder/partition_strategy.h"
+#include "av1/encoder/rdopt.h"
+
+// Performs a simple_motion_search with a single reference frame and extracts
+// the variance of the residue. Here features is assumed to be a length 6
+// array. After this function is called, we will store the following into
+// features:
+// features[0] = log(1 + dc_q**2/256)
+// features[1] = log(1 + variance_of_residue)
+// for i in [2, 3, 4, 5]:
+//   features[i] = (1 + variance_of_residue_in_block[i]) /
+//                 (4 + variance_of_residue)
+static void get_res_var_features(AV1_COMP *const cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ float *features) {
+  // TODO(chiyotsai@google.com): The data this model was trained on did not
+  // use SIMPLE_TRANSLATION to build the inter_predictor, unlike the code
+  // here. Retraining and tuning the model with matching data should give
+  // better performance.
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ // Perform a single motion search in Y_PLANE to make a prediction
+ const int use_subpixel = 0;
+
+ // Start getting the features
+ int f_idx = 0;
+
+ // Q_INDEX
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ aom_clear_system_state();
+ features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+
+ // VARIANCE
+ unsigned int sse = 0;
+ unsigned int var = 0;
+ const MV ref_mv_full = { .row = 0, .col = 0 };
+ av1_simple_motion_sse_var(cpi, x, mi_row, mi_col, bsize, ref_mv_full,
+ use_subpixel, &sse, &var);
+ aom_clear_system_state();
+ features[f_idx++] = logf(1.0f + (float)var);
+
+  // Regional features: the variance ratio of each quadrant
+ const uint8_t *src = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *dst = xd->plane[0].dst.buf;
+ const int dst_stride = xd->plane[0].dst.stride;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ int r_idx = 0;
+ for (r_idx = 0; r_idx < 4; r_idx++) {
+ const int x_idx = (r_idx & 1) * bw / 2;
+ const int y_idx = (r_idx >> 1) * bh / 2;
+ const int src_offset = y_idx * src_stride + x_idx;
+ const int dst_offset = y_idx * dst_stride + x_idx;
+ const unsigned int sub_var = cpi->fn_ptr[subsize].vf(
+ src + src_offset, src_stride, dst + dst_offset, dst_stride, &sse);
+ aom_clear_system_state();
+ const float var_ratio = (1.0f + (float)sub_var) / (4.0f + (float)var);
+ features[f_idx++] = var_ratio;
+ }
+}
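+
+// A worked illustration of the feature vector (hypothetical numbers, not
+// taken from any real encode): with dc_q = 40 and a whole-block residue
+// variance of 1000 spread evenly over the four quadrants (sub_var ~ 250
+// each), we would get approximately:
+//   features[0]    = logf(1 + 1600 / 256.0f)  ~ 1.98
+//   features[1]    = logf(1 + 1000)           ~ 6.91
+//   features[2..5] = (1 + 250) / (4 + 1000)   ~ 0.25 each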
+
+void av1_simple_motion_search_based_split(
+ AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *partition_none_allowed, int *partition_horz_allowed,
+ int *partition_vert_allowed, int *do_rectangular_split,
+ int *do_square_split) {
+ const NN_CONFIG *nn_config = NULL;
+ float split_only_thresh = 0.0f;
+ if (bsize == BLOCK_128X128) {
+ nn_config = &av1_simple_motion_search_based_split_nn_config_128;
+ split_only_thresh = av1_simple_motion_search_based_split_thresh_128;
+ } else if (bsize == BLOCK_64X64) {
+ nn_config = &av1_simple_motion_search_based_split_nn_config_64;
+ split_only_thresh = av1_simple_motion_search_based_split_thresh_64;
+ } else if (bsize == BLOCK_32X32) {
+ nn_config = &av1_simple_motion_search_based_split_nn_config_32;
+ split_only_thresh = av1_simple_motion_search_based_split_thresh_32;
+ } else if (bsize == BLOCK_16X16) {
+ nn_config = &av1_simple_motion_search_based_split_nn_config_16;
+ split_only_thresh = av1_simple_motion_search_based_split_thresh_16;
+ } else if (bsize == BLOCK_8X8) {
+ // Disable BLOCK_8X8 for now
+#if !CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8
+ nn_config = &av1_simple_motion_search_based_split_nn_config_8;
+ split_only_thresh = av1_simple_motion_search_based_split_thresh_8;
+#endif
+ } else {
+ assert(0 && "Unexpected block size in simple_motion_based_split");
+ }
+ if (nn_config) {
+ float features[6] = { 0 };
+ float score = 0;
+ get_res_var_features(cpi, x, mi_row, mi_col, bsize, features);
+ av1_nn_predict(features, nn_config, &score);
+
+ if (score > split_only_thresh) {
+ *partition_none_allowed = 0;
+ *partition_horz_allowed = 0;
+ *partition_vert_allowed = 0;
+ *do_rectangular_split = 0;
+ }
+ if (cpi->sf.simple_motion_search_split_only >= 2) {
+ if (score < -split_only_thresh) *do_square_split = 0;
+      // Scores above split_only_thresh have already disabled the none and
+      // rectangular partitions. As the score decreases, a split becomes less
+      // likely, so for scores just below the threshold (between
+      // 0.875 * split_only_thresh and split_only_thresh) the none partition
+      // stays disabled but the rectangular partitions are evaluated as well.
+ if (score > (split_only_thresh * 0.875)) *partition_none_allowed = 0;
+ }
+ }
+}
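+
+// To summarize the decision bands above (with T = split_only_thresh):
+//   score >  T             : none and rectangular partitions are skipped, so
+//                            only the square split is searched
+//   0.875 * T < score <= T : the none partition is skipped (only when
+//                            simple_motion_search_split_only >= 2)
+//   score < -T             : the square split is skipped (only when
+//                            simple_motion_search_split_only >= 2)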
+
+// Given a list of ref frames in refs, performs simple_motion_search on each of
+// the refs and returns the ref with the smallest sse. Returns -1 if none of
+// the refs in the list is available. Also stores the best sse and var in
+// best_sse and best_var, respectively. If save_mv_code is -1, don't update
+// mv_ref_fulls in pc_tree. If save_mv_code is between 0 and 3, update
+// mv_ref_fulls under pc_tree->split[save_mv_code]. If save_mv_code is 4,
+// update mv_ref_fulls under pc_tree itself.
+static int simple_motion_search_get_best_ref(
+ AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, const int *const refs, int num_refs,
+ int use_subpixel, int save_mv_code, unsigned int *best_sse,
+ unsigned int *best_var) {
+ // TODO(chiyotsai@google.com): The calculation of variance currently uses
+ // bsize, so we might take area outside of the image into account. We need to
+ // modify the SIMD functions to fix this later.
+ const AV1_COMMON *const cm = &cpi->common;
+ int best_ref = -1;
+
+ if (mi_col >= cm->mi_cols || mi_row >= cm->mi_rows) {
+ // If the whole block is outside of the image, set the var and sse to 0.
+ *best_var = 0;
+ *best_sse = 0;
+
+ return best_ref;
+ }
+
+  // Otherwise, loop through the reference frames and find the one with the
+  // minimum SSE.
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const MV *mv_ref_fulls = pc_tree->mv_ref_fulls;
+
+ const int num_planes = 1;
+
+ *best_sse = INT_MAX;
+
+ for (int ref_idx = 0; ref_idx < num_refs; ref_idx++) {
+ const int ref = refs[ref_idx];
+
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref]) {
+ unsigned int curr_sse = 0, curr_var = 0;
+ av1_simple_motion_search(cpi, x, mi_row, mi_col, bsize, ref,
+ mv_ref_fulls[ref], num_planes, use_subpixel);
+ curr_var = cpi->fn_ptr[bsize].vf(
+ x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, &curr_sse);
+ if (curr_sse < *best_sse) {
+ *best_sse = curr_sse;
+ *best_var = curr_var;
+ best_ref = ref;
+ }
+
+ const int new_mv_row = x->best_mv.as_mv.row / 8;
+ const int new_mv_col = x->best_mv.as_mv.col / 8;
+ if (save_mv_code == 4) {
+ pc_tree->mv_ref_fulls[ref].row = new_mv_row;
+ pc_tree->mv_ref_fulls[ref].col = new_mv_col;
+ } else if (save_mv_code >= 0 && save_mv_code < 4) {
+ // Propagate the new motion vectors to a lower level
+ pc_tree->split[save_mv_code]->mv_ref_fulls[ref].row = new_mv_row;
+ pc_tree->split[save_mv_code]->mv_ref_fulls[ref].col = new_mv_col;
+ } else {
+ assert(save_mv_code == -1 &&
+ "Unknown code in simple_motion_search_get_best_ref.");
+ }
+ }
+ }
+
+ return best_ref;
+}
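+
+// A sketch of the calling pattern used by the feature-extraction functions
+// below, showing how save_mv_code selects where the refined mvs are written:
+//   // Whole block: write the new mvs back into pc_tree itself (code 4).
+//   simple_motion_search_get_best_ref(cpi, x, pc_tree, mi_row, mi_col, bsize,
+//                                     ref_list, num_refs, use_subpixel,
+//                                     /*save_mv_code=*/4, &sse, &var);
+//   // Split subblock r_idx: seed pc_tree->split[r_idx] (codes 0..3).
+//   simple_motion_search_get_best_ref(..., /*save_mv_code=*/r_idx, ...);
+//   // Rectangular subblocks: discard the refined mvs (code -1).
+//   simple_motion_search_get_best_ref(..., /*save_mv_code=*/-1, ...);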
+
+// Performs fullpixel simple_motion_search with LAST_FRAME and ALTREF_FRAME on
+// each subblock and extracts the variance and sse of the residues. The var
+// and sse from each partition subblock are then stored in features, along
+// with the DC qindex and the neighboring block sizes.
+// Here features is assumed to be a length FEATURE_SIZE_SMS_PRUNE_PART (25)
+// array. After this function is called, we will store the following into
+// features:
+// features[0..17]  = var and sse from the whole block and its subblocks
+// features[18]     = DC q_index
+// features[19..24] = availability and log2 sizes of the above and left
+//                    neighbor blocks
+static void simple_motion_search_prune_part_features(
+ AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, float *features) {
+ // TODO(chiyotsai@google.com): Cache the result of the motion search from the
+ // larger bsize.
+ const int w_mi = mi_size_wide[bsize];
+ const int h_mi = mi_size_high[bsize];
+ int f_idx = 0;
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+ assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
+ cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+
+ // Setting up motion search
+ const int ref_list[] = { LAST_FRAME, ALTREF_FRAME };
+ const int num_refs = 2;
+ const int use_subpixel = 1;
+
+ unsigned int int_features[FEATURE_SIZE_SMS_PRUNE_PART - 1];
+
+  // Do the whole block first to update the mvs.
+ simple_motion_search_get_best_ref(
+ cpi, x, pc_tree, mi_row, mi_col, bsize, ref_list, num_refs, use_subpixel,
+ 4, &int_features[f_idx], &int_features[f_idx + 1]);
+ f_idx += 2;
+
+ // Split subblocks
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ int r_idx = 0;
+ for (r_idx = 0; r_idx < 4; r_idx++) {
+ const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
+ const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
+
+ simple_motion_search_get_best_ref(
+ cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+ use_subpixel, r_idx, &int_features[f_idx], &int_features[f_idx + 1]);
+ f_idx += 2;
+ }
+
+ // Horz subblocks
+ subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+ for (r_idx = 0; r_idx < 2; r_idx++) {
+ const int sub_mi_col = mi_col + 0;
+ const int sub_mi_row = mi_row + r_idx * h_mi / 2;
+
+ simple_motion_search_get_best_ref(
+ cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+ use_subpixel, -1, &int_features[f_idx], &int_features[f_idx + 1]);
+
+ f_idx += 2;
+ }
+
+  // Vert subblocks
+ subsize = get_partition_subsize(bsize, PARTITION_VERT);
+ for (r_idx = 0; r_idx < 2; r_idx++) {
+ const int sub_mi_col = mi_col + r_idx * w_mi / 2;
+ const int sub_mi_row = mi_row + 0;
+
+ simple_motion_search_get_best_ref(
+ cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+ use_subpixel, -1, &int_features[f_idx], &int_features[f_idx + 1]);
+
+ f_idx += 2;
+ }
+
+ aom_clear_system_state();
+ for (int idx = 0; idx < f_idx; idx++) {
+ features[idx] = logf(1.0f + (float)int_features[idx]);
+ }
+
+ const MACROBLOCKD *xd = &x->e_mbd;
+ set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+ // Q_INDEX
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+
+  // Neighbor block features
+ const int has_above = !!xd->above_mbmi;
+ const int has_left = !!xd->left_mbmi;
+ const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->sb_type : bsize;
+ const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->sb_type : bsize;
+ features[f_idx++] = (float)has_above;
+ features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
+ features[f_idx++] = (float)mi_size_high_log2[above_bsize];
+ features[f_idx++] = (float)has_left;
+ features[f_idx++] = (float)mi_size_wide_log2[left_bsize];
+ features[f_idx++] = (float)mi_size_high_log2[left_bsize];
+
+ assert(f_idx == FEATURE_SIZE_SMS_PRUNE_PART);
+}
+
+void av1_simple_motion_search_prune_part(
+ AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
+ int *partition_horz_allowed, int *partition_vert_allowed,
+ int *do_square_split, int *do_rectangular_split, int *prune_horz,
+ int *prune_vert, float *features, int *valid) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // Get model parameters
+ const NN_CONFIG *nn_config = NULL;
+ const float *prune_thresh = NULL, *only_thresh = NULL;
+ const float *ml_mean = NULL, *ml_std = NULL;
+ float normalized_features[FEATURE_SIZE_SMS_PRUNE_PART] = { 0.0f };
+
+ if (bsize == BLOCK_128X128) {
+ nn_config = &av1_simple_motion_search_prune_part_nn_config_128;
+ ml_mean = av1_simple_motion_search_prune_part_mean_128;
+ ml_std = av1_simple_motion_search_prune_part_std_128;
+ prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_128;
+ only_thresh = av1_simple_motion_search_prune_part_only_thresh_128;
+ } else if (bsize == BLOCK_64X64) {
+ nn_config = &av1_simple_motion_search_prune_part_nn_config_64;
+ ml_mean = av1_simple_motion_search_prune_part_mean_64;
+ ml_std = av1_simple_motion_search_prune_part_std_64;
+ prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_64;
+ only_thresh = av1_simple_motion_search_prune_part_only_thresh_64;
+ } else if (bsize == BLOCK_32X32) {
+ nn_config = &av1_simple_motion_search_prune_part_nn_config_32;
+ ml_mean = av1_simple_motion_search_prune_part_mean_32;
+ ml_std = av1_simple_motion_search_prune_part_std_32;
+ prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_32;
+ only_thresh = av1_simple_motion_search_prune_part_only_thresh_32;
+ } else if (bsize == BLOCK_16X16) {
+ nn_config = &av1_simple_motion_search_prune_part_nn_config_16;
+ ml_mean = av1_simple_motion_search_prune_part_mean_16;
+ ml_std = av1_simple_motion_search_prune_part_std_16;
+ prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_16;
+ only_thresh = av1_simple_motion_search_prune_part_only_thresh_16;
+ } else if (bsize == BLOCK_8X8) {
+ nn_config = &av1_simple_motion_search_prune_part_nn_config_8;
+ ml_mean = av1_simple_motion_search_prune_part_mean_8;
+ ml_std = av1_simple_motion_search_prune_part_std_8;
+ prune_thresh = av1_simple_motion_search_prune_part_prune_thresh_8;
+ only_thresh = av1_simple_motion_search_prune_part_only_thresh_8;
+ } else {
+ assert(0 && "Unexpected block size in simple_motion_prune_part");
+ }
+
+ // If there is no valid threshold, return immediately.
+ if (!nn_config || (prune_thresh[PARTITION_HORZ] == 0.0f &&
+ prune_thresh[PARTITION_VERT] == 0.0f)) {
+ return;
+ }
+ if (bsize < BLOCK_8X8) {
+ return;
+ }
+
+ // Get features
+ simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+ bsize, features);
+ *valid = 1;
+ for (int f_idx = 0; f_idx < FEATURE_SIZE_SMS_PRUNE_PART; f_idx++) {
+ normalized_features[f_idx] =
+ (features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+ }
+
+ // Get probabilities
+ float scores[EXT_PARTITION_TYPES] = { 0.0f },
+ probs[EXT_PARTITION_TYPES] = { 0.0f };
+ const int num_classes = (bsize == BLOCK_128X128 || bsize == BLOCK_8X8)
+ ? PARTITION_TYPES
+ : EXT_PARTITION_TYPES;
+
+ av1_nn_predict(normalized_features, nn_config, scores);
+ aom_clear_system_state();
+
+ av1_nn_softmax(scores, probs, num_classes);
+
+ // Determine if we should prune rectangular partitions.
+ if (cpi->sf.simple_motion_search_prune_rect && !frame_is_intra_only(cm) &&
+ (*partition_horz_allowed || *partition_vert_allowed) &&
+ bsize >= BLOCK_8X8 && !av1_superres_scaled(cm)) {
+ *prune_horz = probs[PARTITION_HORZ] <= prune_thresh[PARTITION_HORZ];
+ *prune_vert = probs[PARTITION_VERT] <= prune_thresh[PARTITION_VERT];
+ }
+
+ // Silence compiler warnings
+ (void)only_thresh;
+ (void)partition_none_allowed;
+ (void)do_square_split;
+ (void)do_rectangular_split;
+}
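+
+// For reference, av1_nn_softmax above maps the raw logits to partition
+// probabilities using the standard numerically stable softmax (a sketch,
+// assuming the usual max-subtraction trick):
+//   probs[i] = exp(scores[i] - max_j scores[j]) /
+//              sum_k exp(scores[k] - max_j scores[j])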
+
+// Terminates the PARTITION_NONE search early using simple_motion_search
+// features and the rate, distortion, and rdcost of PARTITION_NONE. This is
+// only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(
+ AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc,
+ int *early_terminate, float *simple_motion_features,
+ int *simple_motion_features_are_valid) {
+ // TODO(chiyotsai@google.com): There are other features we can extract from
+ // PARTITION_NONE. Play with this later.
+ int f_idx = 0;
+ if (!*simple_motion_features_are_valid) {
+ simple_motion_search_prune_part_features(cpi, x, pc_tree, mi_row, mi_col,
+ bsize, simple_motion_features);
+ *simple_motion_features_are_valid = 1;
+ }
+  f_idx = FEATURE_SIZE_SMS_PRUNE_PART;
+
+ simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->rate);
+ simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->dist);
+ simple_motion_features[f_idx++] = logf(1.0f + (float)none_rdc->rdcost);
+
+ assert(f_idx == FEATURE_SIZE_SMS_TERM_NONE);
+
+ const float *ml_mean = NULL;
+ const float *ml_std = NULL;
+ const float *ml_model = NULL;
+
+ if (bsize == BLOCK_128X128) {
+ ml_mean = av1_simple_motion_search_term_none_mean_128;
+ ml_std = av1_simple_motion_search_term_none_std_128;
+ ml_model = av1_simple_motion_search_term_none_model_128;
+ } else if (bsize == BLOCK_64X64) {
+ ml_mean = av1_simple_motion_search_term_none_mean_64;
+ ml_std = av1_simple_motion_search_term_none_std_64;
+ ml_model = av1_simple_motion_search_term_none_model_64;
+ } else if (bsize == BLOCK_32X32) {
+ ml_mean = av1_simple_motion_search_term_none_mean_32;
+ ml_std = av1_simple_motion_search_term_none_std_32;
+ ml_model = av1_simple_motion_search_term_none_model_32;
+ } else if (bsize == BLOCK_16X16) {
+ ml_mean = av1_simple_motion_search_term_none_mean_16;
+ ml_std = av1_simple_motion_search_term_none_std_16;
+ ml_model = av1_simple_motion_search_term_none_model_16;
+ } else {
+ assert(0 && "Unexpected block size in simple_motion_term_none");
+ }
+
+ if (ml_model) {
+ float score = 0.0f;
+ for (f_idx = 0; f_idx < FEATURE_SIZE_SMS_TERM_NONE; f_idx++) {
+ score += ml_model[f_idx] *
+ (simple_motion_features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+ }
+ score += ml_model[FEATURE_SIZE_SMS_TERM_NONE];
+
+ if (score >= 0.0f) {
+ *early_terminate = 1;
+ }
+ }
+}
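+
+// The model applied above is a plain linear classifier on standardized
+// features:
+//   score = bias + sum_i w_i * (f_i - mean_i) / std_i
+// and the partition search is terminated after PARTITION_NONE whenever
+// score >= 0.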
+
+static void firstpass_simple_motion_search_features(
+ AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, float *features) {
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+ assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[LAST_FRAME] ||
+ cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+
+ // Setting up motion search
+ const int ref_list[] = { LAST_FRAME, ALTREF_FRAME };
+ const int num_refs = 2;
+ const int use_subpixel = 0;
+
+ unsigned int int_features[10] = { 0 };
+
+ int f_idx = 0;
+  // Do the whole block first to update the mvs.
+ simple_motion_search_get_best_ref(
+ cpi, x, pc_tree, mi_row, mi_col, bsize, ref_list, num_refs, use_subpixel,
+ 4, &int_features[f_idx], &int_features[f_idx + 1]);
+ f_idx += 2;
+
+ // Split subblocks
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ const int w_mi = mi_size_wide[bsize];
+ const int h_mi = mi_size_high[bsize];
+ for (int r_idx = 0; r_idx < 4; r_idx++) {
+ const int sub_mi_col = mi_col + (r_idx & 1) * w_mi / 2;
+ const int sub_mi_row = mi_row + (r_idx >> 1) * h_mi / 2;
+
+ simple_motion_search_get_best_ref(
+ cpi, x, pc_tree, sub_mi_row, sub_mi_col, subsize, ref_list, num_refs,
+ use_subpixel, r_idx, &int_features[f_idx], &int_features[f_idx + 1]);
+ f_idx += 2;
+ }
+
+ aom_clear_system_state();
+ for (int idx = 0; idx < f_idx; idx++) {
+ features[idx] = logf(1.0f + (float)int_features[idx]);
+ }
+
+ const MACROBLOCKD *xd = &x->e_mbd;
+ set_offsets_for_motion_search(cpi, x, mi_row, mi_col, bsize);
+
+ // Q_INDEX
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ features[f_idx++] = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+
+  // Neighbor block features
+ const int has_above = !!xd->above_mbmi;
+ const int has_left = !!xd->left_mbmi;
+ const BLOCK_SIZE above_bsize = has_above ? xd->above_mbmi->sb_type : bsize;
+ const BLOCK_SIZE left_bsize = has_left ? xd->left_mbmi->sb_type : bsize;
+ features[f_idx++] = (float)has_above;
+ features[f_idx++] = (float)mi_size_wide_log2[above_bsize];
+ features[f_idx++] = (float)mi_size_high_log2[above_bsize];
+ features[f_idx++] = (float)has_left;
+ features[f_idx++] = (float)mi_size_wide_log2[left_bsize];
+ features[f_idx++] = (float)mi_size_high_log2[left_bsize];
+}
+
+void av1_firstpass_simple_motion_search_early_term(AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ PC_TREE *pc_tree, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ const RD_STATS *none_rdc,
+ int *do_square_split) {
+ const NN_CONFIG *nn_config = NULL;
+ float thresh = 0.0f;
+ const float *ml_mean = NULL, *ml_std = NULL;
+ if (bsize == BLOCK_32X32) {
+ nn_config = &av1_fp_simple_motion_search_term_none_nn_config_32;
+ ml_mean = av1_fp_simple_motion_search_term_none_mean_32;
+ ml_std = av1_fp_simple_motion_search_term_none_std_32;
+ thresh = av1_fp_simple_motion_search_term_none_thresh_32;
+ } else if (bsize == BLOCK_16X16) {
+ nn_config = &av1_fp_simple_motion_search_term_none_nn_config_16;
+ ml_mean = av1_fp_simple_motion_search_term_none_mean_16;
+ ml_std = av1_fp_simple_motion_search_term_none_std_16;
+ thresh = av1_fp_simple_motion_search_term_none_thresh_16;
+ } else if (bsize == BLOCK_8X8) {
+ nn_config = &av1_fp_simple_motion_search_term_none_nn_config_8;
+ ml_mean = av1_fp_simple_motion_search_term_none_mean_8;
+ ml_std = av1_fp_simple_motion_search_term_none_std_8;
+ thresh = av1_fp_simple_motion_search_term_none_thresh_8;
+ } else {
+ assert(0 &&
+ "Unexpected bsize in firstpass_simple_motion_search_early_term");
+ return;
+ }
+
+ float ml_features[FEATURE_SIZE_FP_SMS_TERM_NONE] = { 0.0f };
+
+ firstpass_simple_motion_search_features(cpi, x, pc_tree, mi_row, mi_col,
+ bsize, ml_features);
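+  // The first 17 entries of ml_features were filled by
+  // firstpass_simple_motion_search_features() above; append the rd stats of
+  // PARTITION_NONE to complete the FEATURE_SIZE_FP_SMS_TERM_NONE (20)
+  // features.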
+ int f_idx = 17;
+
+ ml_features[f_idx++] = logf(1.0f + (float)none_rdc->rate);
+ ml_features[f_idx++] = logf(1.0f + (float)none_rdc->dist);
+ ml_features[f_idx++] = logf(1.0f + (float)none_rdc->rdcost);
+
+  for (f_idx = 0; f_idx < FEATURE_SIZE_FP_SMS_TERM_NONE; f_idx++) {
+ ml_features[f_idx] = (ml_features[f_idx] - ml_mean[f_idx]) / ml_std[f_idx];
+ }
+
+  // Get the model score
+ float score = 0.0f;
+
+ av1_nn_predict(ml_features, nn_config, &score);
+ aom_clear_system_state();
+
+ // Determine if we should prune square partitions.
+ if (score < thresh) {
+ *do_square_split = 0;
+ }
+}
+
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ float *features) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const BLOCK_SIZE sb_size = cm->seq_params.sb_size;
+
+ assert(sb_size == BLOCK_128X128);
+
+ int f_idx = 0;
+
+ const int dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd) >> (xd->bd - 8);
+ aom_clear_system_state();
+ const float log_q_sq = logf(1.0f + (float)(dc_q * dc_q) / 256.0f);
+
+  // Perform a full-pixel single motion search in the Y plane for each 16x16
+  // mb in the sb
+ float sum_mv_row_sq = 0;
+ float sum_mv_row = 0;
+ float min_abs_mv_row = FLT_MAX;
+ float max_abs_mv_row = 0;
+
+ float sum_mv_col_sq = 0;
+ float sum_mv_col = 0;
+ float min_abs_mv_col = FLT_MAX;
+ float max_abs_mv_col = 0;
+
+ float sum_log_sse_sq = 0;
+ float sum_log_sse = 0;
+ float min_log_sse = FLT_MAX;
+ float max_log_sse = 0;
+
+ const BLOCK_SIZE mb_size = BLOCK_16X16;
+ const int mb_rows = block_size_high[sb_size] / block_size_high[mb_size];
+ const int mb_cols = block_size_wide[sb_size] / block_size_wide[mb_size];
+ const int mb_in_mi_size_high_log2 = mi_size_high_log2[mb_size];
+ const int mb_in_mi_size_wide_log2 = mi_size_wide_log2[mb_size];
+
+ for (int mb_row = 0; mb_row < mb_rows; mb_row++)
+ for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
+ const int this_mi_row = mi_row + (mb_row << mb_in_mi_size_high_log2);
+ const int this_mi_col = mi_col + (mb_col << mb_in_mi_size_wide_log2);
+ unsigned int sse = 0;
+ unsigned int var = 0;
+ const MV ref_mv_full = { .row = 0, .col = 0 };
+
+ av1_simple_motion_sse_var(cpi, x, this_mi_row, this_mi_col, mb_size,
+ ref_mv_full, 0, &sse, &var);
+
+ aom_clear_system_state();
+ const float mv_row = (float)(x->best_mv.as_mv.row / 8);
+ const float mv_col = (float)(x->best_mv.as_mv.col / 8);
+ const float log_sse = logf(1.0f + (float)sse);
+ const float abs_mv_row = fabsf(mv_row);
+ const float abs_mv_col = fabsf(mv_col);
+
+ sum_mv_row_sq += mv_row * mv_row;
+ sum_mv_row += mv_row;
+ sum_mv_col_sq += mv_col * mv_col;
+ sum_mv_col += mv_col;
+
+ if (abs_mv_row < min_abs_mv_row) min_abs_mv_row = abs_mv_row;
+ if (abs_mv_row > max_abs_mv_row) max_abs_mv_row = abs_mv_row;
+ if (abs_mv_col < min_abs_mv_col) min_abs_mv_col = abs_mv_col;
+ if (abs_mv_col > max_abs_mv_col) max_abs_mv_col = abs_mv_col;
+
+ sum_log_sse_sq += log_sse * log_sse;
+ sum_log_sse += log_sse;
+ if (log_sse < min_log_sse) min_log_sse = log_sse;
+ if (log_sse > max_log_sse) max_log_sse = log_sse;
+ }
+ aom_clear_system_state();
+ const float avg_mv_row = sum_mv_row / 64.0f;
+ const float var_mv_row = sum_mv_row_sq / 64.0f - avg_mv_row * avg_mv_row;
+
+ const float avg_mv_col = sum_mv_col / 64.0f;
+ const float var_mv_col = sum_mv_col_sq / 64.0f - avg_mv_col * avg_mv_col;
+
+ const float avg_log_sse = sum_log_sse / 64.0f;
+ const float var_log_sse = sum_log_sse_sq / 64.0f - avg_log_sse * avg_log_sse;
+
+ features[f_idx++] = avg_log_sse;
+ features[f_idx++] = avg_mv_col;
+ features[f_idx++] = avg_mv_row;
+ features[f_idx++] = log_q_sq;
+ features[f_idx++] = max_abs_mv_col;
+ features[f_idx++] = max_abs_mv_row;
+ features[f_idx++] = max_log_sse;
+ features[f_idx++] = min_abs_mv_col;
+ features[f_idx++] = min_abs_mv_row;
+ features[f_idx++] = min_log_sse;
+ features[f_idx++] = var_log_sse;
+ features[f_idx++] = var_mv_col;
+ features[f_idx++] = var_mv_row;
+
+ assert(f_idx == FEATURE_SIZE_MAX_MIN_PART_PRED);
+}
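+
+// Note on the statistics above: the per-16x16 moments use the population
+// variance identity Var(x) = E[x^2] - E[x]^2, and the divisor 64 comes from
+// the (128 / 16)^2 = 64 macroblocks making up the asserted 128x128
+// superblock.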
+
+BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+ const float *features) {
+ float scores[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f },
+ probs[MAX_NUM_CLASSES_MAX_MIN_PART_PRED] = { 0.0f };
+ const NN_CONFIG *nn_config = &av1_max_part_pred_nn_config;
+
+ assert(cpi->sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE);
+
+ aom_clear_system_state();
+ av1_nn_predict(features, nn_config, scores);
+ av1_nn_softmax(scores, probs, MAX_NUM_CLASSES_MAX_MIN_PART_PRED);
+
+ int result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1;
+ if (cpi->sf.auto_max_partition_based_on_simple_motion == DIRECT_PRED) {
+ result = 0;
+ float max_prob = probs[0];
+ for (int i = 1; i < MAX_NUM_CLASSES_MAX_MIN_PART_PRED; ++i) {
+ if (probs[i] > max_prob) {
+ max_prob = probs[i];
+ result = i;
+ }
+ }
+ } else if (cpi->sf.auto_max_partition_based_on_simple_motion ==
+ RELAXED_PRED) {
+ for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0;
+ --result) {
+ if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) {
+ probs[result] += probs[result + 1];
+ }
+ if (probs[result] > 0.2) break;
+ }
+ } else if (cpi->sf.auto_max_partition_based_on_simple_motion == ADAPT_PRED) {
+ const BLOCK_SIZE sb_size = cpi->common.seq_params.sb_size;
+ MACROBLOCKD *const xd = &x->e_mbd;
+    // TODO(debargha): x->source_variance is unavailable at this point, so
+    // compute it here. The redundant recomputation later can be removed.
+ const unsigned int source_variance =
+ is_cur_buf_hbd(xd)
+ ? av1_high_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size,
+ xd->bd)
+ : av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, sb_size);
+ if (source_variance > 16) {
+ const double thresh = source_variance < 128 ? 0.05 : 0.1;
+ for (result = MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1; result >= 0;
+ --result) {
+ if (result < MAX_NUM_CLASSES_MAX_MIN_PART_PRED - 1) {
+ probs[result] += probs[result + 1];
+ }
+ if (probs[result] > thresh) break;
+ }
+ }
+ }
+
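+  // Map the class index onto the square BLOCK_SIZE values: (result + 2) * 3
+  // yields 6, 9, 12 and 15, i.e. BLOCK_16X16, BLOCK_32X32, BLOCK_64X64 and
+  // BLOCK_128X128 for classes 0..3.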
+ return (BLOCK_SIZE)((result + 2) * 3);
+}
diff --git a/libaom/av1/encoder/partition_strategy.h b/libaom/av1/encoder/partition_strategy.h
new file mode 100644
index 0000000..36b1e95
--- /dev/null
+++ b/libaom/av1/encoder/partition_strategy.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+#define AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encoder.h"
+
+#define FEATURE_SIZE_SMS_PRUNE_PART 25
+#define FEATURE_SIZE_SMS_TERM_NONE 28
+#define FEATURE_SIZE_FP_SMS_TERM_NONE 20
+#define FEATURE_SIZE_MAX_MIN_PART_PRED 13
+#define MAX_NUM_CLASSES_MAX_MIN_PART_PRED 4
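+
+// The 25 pruning features decompose as 18 log-sse/log-var values from the
+// whole block and its split/horz/vert subblocks, 1 DC q-index term, and 6
+// neighbor availability/size features. The term-none feature vector appends
+// the rate, distortion, and rdcost of PARTITION_NONE (25 + 3 = 28).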
+
+// Performs a simple_motion_search with a single reference frame and extracts
+// the variance of the residue. The features are then used to determine
+// whether we want to go straight to splitting without trying PARTITION_NONE.
+void av1_simple_motion_search_based_split(
+ AV1_COMP *const cpi, MACROBLOCK *x, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *partition_none_allowed, int *partition_horz_allowed,
+ int *partition_vert_allowed, int *do_rectangular_split,
+ int *do_square_split);
+
+// Performs a simple_motion_search with two reference frames and extracts
+// the variance of the residues. The features are then used to determine
+// whether we want to prune some partitions.
+void av1_simple_motion_search_prune_part(
+ AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *partition_none_allowed,
+ int *partition_horz_allowed, int *partition_vert_allowed,
+ int *do_square_split, int *do_rectangular_split, int *prune_horz,
+ int *prune_vert, float *features, int *valid);
+
+// Terminates the PARTITION_NONE search early using simple_motion_search
+// features and the rate, distortion, and rdcost of PARTITION_NONE. This is
+// only called when:
+// - The frame is a show frame
+// - The frame is not intra only
+// - The current bsize is > BLOCK_8X8
+// - blk_row + blk_height/2 < total_rows and blk_col + blk_width/2 < total_cols
+void av1_simple_motion_search_early_term_none(
+ AV1_COMP *const cpi, MACROBLOCK *x, PC_TREE *pc_tree, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, const RD_STATS *none_rdc,
+ int *early_terminate, float *simple_motion_features,
+ int *simple_motion_features_are_valid);
+
+// Terminates the search early after PARTITION_NONE in the firstpass of a
+// two-pass partition search.
+void av1_firstpass_simple_motion_search_early_term(AV1_COMP *const cpi,
+ MACROBLOCK *x,
+ PC_TREE *pc_tree, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ const RD_STATS *none_rdc,
+ int *do_square_split);
+
+// Gets the features for selecting the max and min partition size. Currently
+// this performs simple_motion_search on 16X16 subblocks of the current
+// superblock, and then extracts the sse and motion vector statistics as
+// features.
+void av1_get_max_min_partition_features(AV1_COMP *const cpi, MACROBLOCK *x,
+ int mi_row, int mi_col,
+ float *features);
+
+// Predicts the maximum BLOCK_SIZE to be used to encode the current superblock.
+BLOCK_SIZE av1_predict_max_partition(AV1_COMP *const cpi, MACROBLOCK *const x,
+ const float *features);
+
+// A simplified version of set_offsets meant to be used for
+// simple_motion_search.
+static INLINE void set_offsets_for_motion_search(const AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+
+ // Set up destination pointers.
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col, 0,
+ num_planes);
+
+ // Set up limit values for MV components.
+  // MVs beyond this range do not produce a new/different prediction block.
+ x->mv_limits.row_min =
+ -(((mi_row + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.col_min = -(((mi_col + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND;
+ x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND;
+
+ set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge = ((cm->mi_rows - mi_height - mi_row) * MI_SIZE) * 8;
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ((cm->mi_cols - mi_width - mi_col) * MI_SIZE) * 8;
+
+ // Set up source buffers.
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes, bsize);
+
+ // R/D setup.
+ x->rdmult = cpi->rd.RDMULT;
+}
+
+static INLINE void init_simple_motion_search_mvs(PC_TREE *pc_tree) {
+ for (int idx = 0; idx < REF_FRAMES; idx++) {
+ pc_tree->mv_ref_fulls[idx].row = 0;
+ pc_tree->mv_ref_fulls[idx].col = 0;
+ }
+ if (pc_tree->block_size >= BLOCK_8X8) {
+ init_simple_motion_search_mvs(pc_tree->split[0]);
+ init_simple_motion_search_mvs(pc_tree->split[1]);
+ init_simple_motion_search_mvs(pc_tree->split[2]);
+ init_simple_motion_search_mvs(pc_tree->split[3]);
+ }
+}
+
+static INLINE int is_full_sb(AV1_COMMON *const cm, int mi_row, int mi_col,
+ BLOCK_SIZE sb_size) {
+ const int sb_mi_wide = mi_size_wide[sb_size];
+ const int sb_mi_high = mi_size_high[sb_size];
+
+ return (mi_row + sb_mi_high) <= cm->mi_rows &&
+ (mi_col + sb_mi_wide) <= cm->mi_cols;
+}
+
+static INLINE int use_auto_max_partition(AV1_COMP *const cpi,
+ BLOCK_SIZE sb_size, int mi_row,
+ int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ return !frame_is_intra_only(cm) &&
+ cpi->sf.auto_max_partition_based_on_simple_motion != NOT_IN_USE &&
+ sb_size == BLOCK_128X128 && is_full_sb(cm, mi_row, mi_col, sb_size) &&
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index] !=
+ OVERLAY_UPDATE &&
+ cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index] !=
+ INTNL_OVERLAY_UPDATE;
+}
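+
+// A sketch of the intended call sequence (illustrative only; max_sq_size is a
+// hypothetical local holding the current cap, and the exact integration point
+// in the superblock partition search may differ):
+//   if (use_auto_max_partition(cpi, sb_size, mi_row, mi_col)) {
+//     float features[FEATURE_SIZE_MAX_MIN_PART_PRED];
+//     av1_get_max_min_partition_features(cpi, x, mi_row, mi_col, features);
+//     max_sq_size = AOMMIN(av1_predict_max_partition(cpi, x, features),
+//                          max_sq_size);
+//   }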
+
+#endif // AOM_AV1_ENCODER_PARTITION_STRATEGY_H_
diff --git a/libaom/av1/encoder/pass2_strategy.c b/libaom/av1/encoder/pass2_strategy.c
new file mode 100644
index 0000000..ac22b68
--- /dev/null
+++ b/libaom/av1/encoder/pass2_strategy.c
@@ -0,0 +1,1787 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_encoder.h"
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/onyxc_int.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/gop_structure.h"
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other zero-energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+double calculate_active_area(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *this_frame) {
+ double active_pct;
+
+ active_pct =
+ 1.0 -
+ ((this_frame->intra_skip_pct / 2) +
+ ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows));
+ return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Calculate a modified error used in distributing bits between easier and
+// harder frames.
+#define ACT_AREA_CORRECTION 0.5
+double calculate_modified_err(const AV1_COMP *cpi, const TWO_PASS *twopass,
+ const AV1EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame) {
+ const FIRSTPASS_STATS *const stats = &twopass->total_stats;
+ const double av_weight = stats->weight / stats->count;
+ const double av_err = (stats->coded_error * av_weight) / stats->count;
+ double modified_error =
+ av_err * pow(this_frame->coded_error * this_frame->weight /
+ DOUBLE_DIVIDE_CHECK(av_err),
+ oxcf->two_pass_vbrbias / 100.0);
+
+ // Correction for active area. Frames with a reduced active area
+ // (eg due to formatting bars) have a higher error per mb for the
+ // remaining active MBs. The correction here assumes that coding
+ // 0.5N blocks of complexity 2X is a little easier than coding N
+ // blocks of complexity X.
+ modified_error *=
+ pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
+ return fclamp(modified_error, twopass->modified_error_min,
+ twopass->modified_error_max);
+}
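+
+// In equation form, with b = two_pass_vbrbias / 100:
+//   modified_err = av_err * (coded_error * weight / av_err)^b
+//                  * active_area^ACT_AREA_CORRECTION
+// clamped to [modified_error_min, modified_error_max].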
+
+// Resets the first pass stats read position to the given location.
+static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
+ p->stats_in = position;
+}
+
+static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
+ if (p->stats_in >= p->stats_in_end) return EOF;
+
+ *fps = *p->stats_in;
+ ++p->stats_in;
+ return 1;
+}
+
+// Read frame stats at an offset from the current position.
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
+ if ((offset >= 0 && p->stats_in + offset >= p->stats_in_end) ||
+ (offset < 0 && p->stats_in + offset < p->stats_in_start)) {
+ return NULL;
+ }
+
+ return &p->stats_in[offset];
+}
+
+static void subtract_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame -= frame->frame;
+ section->weight -= frame->weight;
+ section->intra_error -= frame->intra_error;
+ section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
+ section->coded_error -= frame->coded_error;
+ section->sr_coded_error -= frame->sr_coded_error;
+ section->pcnt_inter -= frame->pcnt_inter;
+ section->pcnt_motion -= frame->pcnt_motion;
+ section->pcnt_second_ref -= frame->pcnt_second_ref;
+ section->pcnt_neutral -= frame->pcnt_neutral;
+ section->intra_skip_pct -= frame->intra_skip_pct;
+ section->inactive_zone_rows -= frame->inactive_zone_rows;
+ section->inactive_zone_cols -= frame->inactive_zone_cols;
+ section->MVr -= frame->MVr;
+ section->mvr_abs -= frame->mvr_abs;
+ section->MVc -= frame->MVc;
+ section->mvc_abs -= frame->mvc_abs;
+ section->MVrv -= frame->MVrv;
+ section->MVcv -= frame->MVcv;
+ section->mv_in_out_count -= frame->mv_in_out_count;
+ section->new_mv_count -= frame->new_mv_count;
+ section->count -= frame->count;
+ section->duration -= frame->duration;
+}
+
+// Calculate the linear size relative to a baseline of 1080P
+#define BASE_SIZE 2073600.0 // 1920x1080
+static double get_linear_size_factor(const AV1_COMP *cpi) {
+ const double this_area = cpi->initial_width * cpi->initial_height;
+ return pow(this_area / BASE_SIZE, 0.5);
+}
+
+// This function returns the maximum target rate per frame.
+static int frame_max_bits(const RATE_CONTROL *rc,
+ const AV1EncoderConfig *oxcf) {
+ int64_t max_bits = ((int64_t)rc->avg_frame_bandwidth *
+ (int64_t)oxcf->two_pass_vbrmax_section) /
+ 100;
+ if (max_bits < 0)
+ max_bits = 0;
+ else if (max_bits > rc->max_frame_bandwidth)
+ max_bits = rc->max_frame_bandwidth;
+
+ return (int)max_bits;
+}
+
+static double calc_correction_factor(double err_per_mb, double err_divisor,
+ double pt_low, double pt_high, int q,
+ aom_bit_depth_t bit_depth) {
+ const double error_term = err_per_mb / err_divisor;
+
+ // Adjustment based on actual quantizer to power term.
+ const double power_term =
+ AOMMIN(av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
+
+ // Calculate correction factor.
+ if (power_term < 1.0) assert(error_term >= 0.0);
+
+ return fclamp(pow(error_term, power_term), 0.05, 5.0);
+}
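+
+// Equivalently: factor = clamp((err_per_mb / err_divisor)^p, 0.05, 5.0) with
+// p = min(Q * 0.01 + pt_low, pt_high), where Q is the real quantizer value
+// corresponding to qindex q.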
+
+#define ERR_DIVISOR 100.0
+#define FACTOR_PT_LOW 0.70
+#define FACTOR_PT_HIGH 0.90
+
+// Similar to the find_qindex_by_rate() function in ratectrl.c, but includes
+// the calculation of a correction factor.
+static int find_qindex_by_rate_with_correction(
+ int desired_bits_per_mb, aom_bit_depth_t bit_depth, FRAME_TYPE frame_type,
+ double error_per_mb, double ediv_size_correction,
+ double group_weight_factor, int best_qindex, int worst_qindex) {
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const double mid_factor =
+ calc_correction_factor(error_per_mb, ERR_DIVISOR - ediv_size_correction,
+ FACTOR_PT_LOW, FACTOR_PT_HIGH, mid, bit_depth);
+ const int mid_bits_per_mb = av1_rc_bits_per_mb(
+ frame_type, mid, mid_factor * group_weight_factor, bit_depth);
+ if (mid_bits_per_mb > desired_bits_per_mb) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+#if CONFIG_DEBUG
+ assert(low == high);
+ const double low_factor =
+ calc_correction_factor(error_per_mb, ERR_DIVISOR - ediv_size_correction,
+ FACTOR_PT_LOW, FACTOR_PT_HIGH, low, bit_depth);
+ const int low_bits_per_mb = av1_rc_bits_per_mb(
+ frame_type, low, low_factor * group_weight_factor, bit_depth);
+ assert(low_bits_per_mb <= desired_bits_per_mb || low == worst_qindex);
+#endif // CONFIG_DEBUG
+ return low;
+}
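+
+// Note: the loop above is a binary search that relies on the corrected
+// bits-per-mb estimate decreasing monotonically as qindex rises. It returns
+// the lowest qindex whose estimated rate does not exceed desired_bits_per_mb,
+// or worst_qindex if no qindex reaches the target.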
+
+static int get_twopass_worst_quality(const AV1_COMP *cpi,
+ const double section_err,
+ double inactive_zone,
+ int section_target_bandwidth,
+ double group_weight_factor) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+ inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+
+ if (section_target_bandwidth <= 0) {
+ return rc->worst_quality; // Highest value allowed
+ } else {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.MBs;
+ const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+ const double av_err_per_mb = section_err / active_mbs;
+ const int target_norm_bits_per_mb =
+ (int)((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
+ active_mbs;
+
+ // Larger image formats are expected to be a little harder to code
+ // relatively given the same prediction error score. This in part at
+ // least relates to the increased size and hence coding overheads of
+ // motion vectors. Some account of this is made through adjustment of
+ // the error divisor.
+ double ediv_size_correction =
+ AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi)));
+ if (ediv_size_correction < 1.0)
+ ediv_size_correction = -(1.0 / ediv_size_correction);
+ ediv_size_correction *= 4.0;
+
+ // Try and pick a max Q that will be high enough to encode the
+ // content at the given rate.
+ int q = find_qindex_by_rate_with_correction(
+ target_norm_bits_per_mb, cpi->common.seq_params.bit_depth, INTER_FRAME,
+ av_err_per_mb, ediv_size_correction, group_weight_factor,
+ rc->best_quality, rc->worst_quality);
+
+ // Restriction on active max q for constrained quality mode.
+ if (cpi->oxcf.rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level);
+ return q;
+ }
+}
+
+#define SR_DIFF_PART 0.0015
+#define MOTION_AMP_PART 0.003
+#define INTRA_PART 0.005
+#define DEFAULT_DECAY_LIMIT 0.75
+#define LOW_SR_DIFF_THRESH 0.1
+#define SR_DIFF_MAX 128.0
+#define NCOUNT_FRAME_II_THRESH 5.0
+
+static double get_sr_decay_rate(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *frame) {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+ double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
+ double sr_decay = 1.0;
+ double modified_pct_inter;
+ double modified_pcnt_intra;
+ const double motion_amplitude_factor =
+ frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
+
+ modified_pct_inter = frame->pcnt_inter;
+ if ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+ (double)NCOUNT_FRAME_II_THRESH) {
+ modified_pct_inter = frame->pcnt_inter - frame->pcnt_neutral;
+ }
+ modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+  if (sr_diff > LOW_SR_DIFF_THRESH) {
+ sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX);
+ sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) -
+ (MOTION_AMP_PART * motion_amplitude_factor) -
+ (INTRA_PART * modified_pcnt_intra);
+ }
+ return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
+}
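+
+// In other words, once sr_diff exceeds LOW_SR_DIFF_THRESH the decay is
+//   sr_decay = 1 - 0.0015 * sr_diff - 0.003 * motion_amplitude
+//              - 0.005 * modified_pcnt_intra
+// floored at min(DEFAULT_DECAY_LIMIT, modified_pct_inter).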
+
+// This function gives an estimate of how badly we believe the prediction
+// quality is decaying from frame to frame.
+static double get_zero_motion_factor(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *frame) {
+ const double zero_motion_pct = frame->pcnt_inter - frame->pcnt_motion;
+ double sr_decay = get_sr_decay_rate(cpi, frame);
+ return AOMMIN(sr_decay, zero_motion_pct);
+}
+
+#define ZM_POWER_FACTOR 0.75
+
+static double get_prediction_decay_rate(const AV1_COMP *cpi,
+ const FIRSTPASS_STATS *next_frame) {
+ const double sr_decay_rate = get_sr_decay_rate(cpi, next_frame);
+ const double zero_motion_factor =
+ (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
+ ZM_POWER_FACTOR));
+
+ return AOMMAX(zero_motion_factor,
+ (sr_decay_rate + ((1.0 - sr_decay_rate) * zero_motion_factor)));
+}
+
+// Tests for a condition where a complex transition is followed by a static
+// section, for example in slide shows where there is a fade between slides.
+// This helps with more optimal kf and gf positioning.
+static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval,
+ int still_interval,
+ double loop_decay_rate,
+ double last_decay_rate) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+
+ // Break clause to detect very still sections after motion
+ // For example a static image after a fade or other transition
+ // instead of a clean scene cut.
+ if (frame_interval > rc->min_gf_interval && loop_decay_rate >= 0.999 &&
+ last_decay_rate < 0.9) {
+ int j;
+
+ // Look ahead a few frames to see if static condition persists...
+ for (j = 0; j < still_interval; ++j) {
+ const FIRSTPASS_STATS *stats = &twopass->stats_in[j];
+ if (stats >= twopass->stats_in_end) break;
+
+ if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+ }
+
+ // Only if it does do we signal a transition to still.
+ return j == still_interval;
+ }
+
+ return 0;
+}
+
+// This function detects a flash through the high relative pcnt_second_ref
+// score in the frame following a flash frame. The offset passed in should
+// reflect this.
+static int detect_flash(const TWO_PASS *twopass, int offset) {
+ const FIRSTPASS_STATS *const next_frame = read_frame_stats(twopass, offset);
+
+ // What we are looking for here is a situation where there is a
+ // brief break in prediction (such as a flash) but subsequent frames
+ // are reasonably well predicted by an earlier (pre flash) frame.
+ // The recovery after a flash is indicated by a high pcnt_second_ref
+ // compared to pcnt_inter.
+ return next_frame != NULL &&
+ next_frame->pcnt_second_ref > next_frame->pcnt_inter &&
+ next_frame->pcnt_second_ref >= 0.5;
+}
+
+// Accumulate the motion-related statistics used in the GF/ARF boost
+// calculation.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+ double *mv_in_out,
+ double *mv_in_out_accumulator,
+ double *abs_mv_in_out_accumulator,
+ double *mv_ratio_accumulator) {
+ const double pct = stats->pcnt_motion;
+
+ // Accumulate Motion In/Out of frame stats.
+ *mv_in_out = stats->mv_in_out_count * pct;
+ *mv_in_out_accumulator += *mv_in_out;
+ *abs_mv_in_out_accumulator += fabs(*mv_in_out);
+
+ // Accumulate a measure of how uniform (or conversely how random) the motion
+ // field is (a ratio of abs(mv) / mv).
+ if (pct > 0.05) {
+ const double mvr_ratio =
+ fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+ const double mvc_ratio =
+ fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+
+ *mv_ratio_accumulator +=
+ pct * (mvr_ratio < stats->mvr_abs ? mvr_ratio : stats->mvr_abs);
+ *mv_ratio_accumulator +=
+ pct * (mvc_ratio < stats->mvc_abs ? mvc_ratio : stats->mvc_abs);
+ }
+}
+
+#define BASELINE_ERR_PER_MB 1000.0
+#define BOOST_FACTOR 12.5
+
+static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame,
+ double this_frame_mv_in_out, double max_boost) {
+ double frame_boost;
+ const double lq = av1_convert_qindex_to_q(
+ cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth);
+ const double boost_q_correction = AOMMIN((0.5 + (lq * 0.015)), 1.5);
+ int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+
+ // Correct for any inactive region in the image
+ num_mbs = (int)AOMMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+
+ // Underlying boost factor is based on inter error ratio.
+ frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+ frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction;
+
+  // Increase the boost for frames where new data is coming into the frame
+  // (e.g. zoom out). Slightly reduce the boost if there is a net balance of
+  // motion out of the frame (zoom in). The range for this_frame_mv_in_out is
+  // -1.0 to +1.0.
+ if (this_frame_mv_in_out > 0.0)
+ frame_boost += frame_boost * (this_frame_mv_in_out * 2.0);
+ // In the extreme case the boost is halved.
+ else
+ frame_boost += frame_boost * (this_frame_mv_in_out / 2.0);
+
+ return AOMMIN(frame_boost, max_boost * boost_q_correction);
+}
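+
+// Roughly: boost = 12.5 * q_correction * (1000 * num_mbs / coded_error),
+// scaled by (1 + 2 * mv_in_out) when motion flows into the frame and by
+// (1 + mv_in_out / 2) when it flows out (halved in the extreme case of
+// mv_in_out = -1), then capped at max_boost * q_correction.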
+
+#define GF_MAX_BOOST 90.0
+#define MIN_ARF_GF_BOOST 240
+#define MIN_DECAY_FACTOR 0.01
+
+static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames,
+ int *f_boost, int *b_boost) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ int i;
+ double boost_score = 0.0;
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+ int arf_boost;
+ int flash_detected = 0;
+
+ // Search forward from the proposed arf/next gf position.
+ for (i = 0; i < f_frames; ++i) {
+ const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+ if (this_frame == NULL) break;
+
+    // Update the motion-related statistics used in the boost calculation.
+ accumulate_frame_motion_stats(
+ this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, i + offset) ||
+ detect_flash(twopass, i + offset + 1);
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+ decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : decay_accumulator;
+ }
+
+ boost_score +=
+ decay_accumulator *
+ calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ }
+
+ *f_boost = (int)boost_score;
+
+ // Reset for backward looking loop.
+ boost_score = 0.0;
+ mv_ratio_accumulator = 0.0;
+ decay_accumulator = 1.0;
+ this_frame_mv_in_out = 0.0;
+ mv_in_out_accumulator = 0.0;
+ abs_mv_in_out_accumulator = 0.0;
+
+ // Search backward towards last gf position.
+ for (i = -1; i >= -b_frames; --i) {
+ const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset);
+ if (this_frame == NULL) break;
+
+    // Update the motion-related statistics used in the boost calculation.
+ accumulate_frame_motion_stats(
+ this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+ // We want to discount the flash frame itself and the recovery
+ // frame that follows as both will have poor scores.
+ flash_detected = detect_flash(twopass, i + offset) ||
+ detect_flash(twopass, i + offset + 1);
+
+ // Cumulative effect of prediction quality decay.
+ if (!flash_detected) {
+ decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+ decay_accumulator = decay_accumulator < MIN_DECAY_FACTOR
+ ? MIN_DECAY_FACTOR
+ : decay_accumulator;
+ }
+
+ boost_score +=
+ decay_accumulator *
+ calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ }
+ *b_boost = (int)boost_score;
+
+ arf_boost = (*f_boost + *b_boost);
+ if (arf_boost < ((b_frames + f_frames) * 20))
+ arf_boost = ((b_frames + f_frames) * 20);
+ arf_boost = AOMMAX(arf_boost, MIN_ARF_GF_BOOST);
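+ // e.g. with f_frames = b_frames = 7, the floor applied above is
+ // (7 + 7) * 20 = 280, which already exceeds MIN_ARF_GF_BOOST (240).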
+
+ return arf_boost;
+}
+
+// Calculate a section intra ratio used in setting max loop filter.
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+ const FIRSTPASS_STATS *end,
+ int section_length) {
+ const FIRSTPASS_STATS *s = begin;
+ double intra_error = 0.0;
+ double coded_error = 0.0;
+ int i = 0;
+
+ while (s < end && i < section_length) {
+ intra_error += s->intra_error;
+ coded_error += s->coded_error;
+ ++s;
+ ++i;
+ }
+
+ return (int)(intra_error / DOUBLE_DIVIDE_CHECK(coded_error));
+}
+
+// Calculate the total bits to allocate in this GF/ARF group.
+static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
+ double gf_group_err) {
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const TWO_PASS *const twopass = &cpi->twopass;
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+ int64_t total_group_bits;
+
+ // Calculate the bits to be allocated to the group as a whole.
+ if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+ total_group_bits = (int64_t)(twopass->kf_group_bits *
+ (gf_group_err / twopass->kf_group_error_left));
+ } else {
+ total_group_bits = 0;
+ }
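+ // e.g. a group accounting for 25% of the remaining kf-group error is
+ // provisionally given 25% of the remaining kf-group bits.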
+
+ // Clamp odd edge cases.
+ total_group_bits = (total_group_bits < 0)
+ ? 0
+ : (total_group_bits > twopass->kf_group_bits)
+ ? twopass->kf_group_bits
+ : total_group_bits;
+
+ // Clip based on user supplied data rate variability limit.
+ if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval)
+ total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval;
+
+ return total_group_bits;
+}
+
+// Calculate the number of extra bits to assign to boosted frames in a group.
+static int calculate_boost_bits(int frame_count, int boost,
+ int64_t total_group_bits) {
+ int allocation_chunks;
+
+ // Return 0 for invalid inputs (which could arise e.g. through rounding errors).
+ if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0;
+
+ allocation_chunks = (frame_count * 100) + boost;
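+ // e.g. frame_count = 16 and boost = 400 give allocation_chunks = 2000, so
+ // the boosted frame(s) receive 400 / 2000 = 20% of total_group_bits.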
+
+ // Prevent overflow.
+ if (boost > 1023) {
+ int divisor = boost >> 10;
+ boost /= divisor;
+ allocation_chunks /= divisor;
+ }
+
+ // Calculate the number of extra bits for use in the boosted frame or frames.
+ return AOMMAX((int)(((int64_t)boost * total_group_bits) / allocation_chunks),
+ 0);
+}
+
+#define LEAF_REDUCTION_FACTOR 0.75
+static double lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = {
+ { 1.0, 0.0, 0.0 }, { 0.6, 0.4, 0 }, { 0.45, 0.35, 0.20 }
+};
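+// Rows are indexed by (pyramid_height - 2) and columns by an internal ARF's
+// distance from the top pyramid level; each row sums to 1.0 so the budget
+// reclaimed from leaf frames is fully redistributed.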
+static void allocate_gf_group_bits(
+ AV1_COMP *cpi, int64_t gf_group_bits, double group_error, int gf_arf_bits,
+ const EncodeFrameParams *const frame_params) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const int key_frame = (frame_params->frame_type == KEY_FRAME);
+ const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+ int64_t total_group_bits = gf_group_bits;
+
+ // Check if GF group has any internal arfs.
+ int has_internal_arfs = 0;
+ for (int i = 0; i < gf_group->size; ++i) {
+ if (gf_group->update_type[i] == INTNL_ARF_UPDATE) {
+ has_internal_arfs = 1;
+ break;
+ }
+ }
+
+ // For a key frame the frame target rate is already set, and the key frame
+ // also serves as the golden frame.
+ // === [frame_index == 0] ===
+ int frame_index = 0;
+ if (!key_frame) {
+ if (rc->source_alt_ref_active)
+ gf_group->bit_allocation[frame_index] = 0;
+ else
+ gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+ // Step over the golden frame / overlay frame
+ FIRSTPASS_STATS frame_stats;
+ if (EOF == input_stats(twopass, &frame_stats)) return;
+ }
+
+ // Deduct the boost bits for arf (or gf if it is not a key frame)
+ // from the group total.
+ if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
+
+ frame_index++;
+
+ // Store the bits to spend on the ARF if there is one.
+ // === [frame_index == 1] ===
+ if (rc->source_alt_ref_pending) {
+ gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+ ++frame_index;
+
+ // Skip all the internal ARFs right after the ARF at the start of the
+ // current GF group.
+ if (has_internal_arfs) {
+ while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) {
+ ++frame_index;
+ }
+ }
+ }
+
+ // Save.
+ const int tmp_frame_index = frame_index;
+ int budget_reduced_from_leaf_level = 0;
+
+ // Allocate bits to the frames other than the first frame, which is either
+ // a key frame, an overlay frame or a golden frame.
+ const int normal_frames = rc->baseline_gf_interval - 1;
+
+ for (int i = 0; i < normal_frames; ++i) {
+ FIRSTPASS_STATS frame_stats;
+ if (EOF == input_stats(twopass, &frame_stats)) break;
+
+ const double modified_err =
+ calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
+ const double err_fraction =
+ (group_error > 0) ? modified_err / DOUBLE_DIVIDE_CHECK(group_error)
+ : 0.0;
+ const int target_frame_size =
+ clamp((int)((double)total_group_bits * err_fraction), 0,
+ AOMMIN(max_bits, (int)total_group_bits));
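+ // The provisional share just computed is proportional to this frame's
+ // modified first-pass error, clamped to the per-frame maximum.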
+
+ if (gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
+ assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
+ "non-valid height for a pyramid structure");
+
+ const int arf_pos = gf_group->arf_pos_in_gf[frame_index];
+ gf_group->bit_allocation[frame_index] = 0;
+
+ gf_group->bit_allocation[arf_pos] = target_frame_size;
+ // Note: Boost, if needed, is added in the next loop.
+ } else {
+ assert(gf_group->update_type[frame_index] == LF_UPDATE);
+ gf_group->bit_allocation[frame_index] = target_frame_size;
+ if (has_internal_arfs) {
+ const int this_budget_reduction =
+ (int)(target_frame_size * LEAF_REDUCTION_FACTOR);
+ gf_group->bit_allocation[frame_index] -= this_budget_reduction;
+ budget_reduced_from_leaf_level += this_budget_reduction;
+ }
+ }
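+ // Note: when the group has internal ARFs, each leaf frame keeps only 25%
+ // of its provisional share; the pooled remainder is handed back to the
+ // internal ARFs in the redistribution loop below.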
+
+ ++frame_index;
+
+ // Skip all the internal ARFs.
+ if (has_internal_arfs) {
+ while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+ ++frame_index;
+ }
+ }
+
+ if (budget_reduced_from_leaf_level > 0) {
+ assert(has_internal_arfs);
+ // Restore.
+ frame_index = tmp_frame_index;
+
+ // Re-distribute this extra budget to overlay frames in the group.
+ for (int i = 0; i < normal_frames; ++i) {
+ if (gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
+ assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
+ "non-valid height for a pyramid structure");
+ const int arf_pos = gf_group->arf_pos_in_gf[frame_index];
+ const int this_lvl = gf_group->pyramid_level[arf_pos];
+ const int dist2top = gf_group->pyramid_height - 1 - this_lvl;
+ const double lvl_boost_factor =
+ lvl_budget_factor[gf_group->pyramid_height - 2][dist2top];
+ const int extra_size =
+ (int)(budget_reduced_from_leaf_level * lvl_boost_factor /
+ gf_group->pyramid_lvl_nodes[this_lvl]);
+ gf_group->bit_allocation[arf_pos] += extra_size;
+ }
+ ++frame_index;
+
+ // Skip all the internal ARFs.
+ if (has_internal_arfs) {
+ while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE) {
+ ++frame_index;
+ }
+ }
+ }
+ }
+}
+
+// Given the maximum allowed height of the pyramid structure, return the fixed
+// GF length to be used.
+static INLINE int get_fixed_gf_length(int max_pyr_height) {
+ (void)max_pyr_height;
+ return MAX_GF_INTERVAL;
+}
+
+// Returns true if both the KF group and the GF group are almost completely static.
+static INLINE int is_almost_static(double gf_zero_motion, int kf_zero_motion) {
+ return (gf_zero_motion >= 0.995) &&
+ (kf_zero_motion >= STATIC_KF_GROUP_THRESH);
+}
+
+#define ARF_ABS_ZOOM_THRESH 4.4
+#define GROUP_ADAPTIVE_MAXQ 1
+#if GROUP_ADAPTIVE_MAXQ
+#define RC_FACTOR_MIN 0.75
+#define RC_FACTOR_MAX 1.75
+#endif // GROUP_ADAPTIVE_MAXQ
+#define MIN_FWD_KF_INTERVAL 8
+
+// Analyse and define a gf/arf group.
+static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame,
+ const EncodeFrameParams *const frame_params) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ FIRSTPASS_STATS next_frame;
+ const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+ int i;
+
+ double boost_score = 0.0;
+ double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+ double gf_group_raw_error = 0.0;
+#endif
+ double gf_group_skip_pct = 0.0;
+ double gf_group_inactive_zone_rows = 0.0;
+ double gf_first_frame_err = 0.0;
+ double mod_frame_err = 0.0;
+
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double zero_motion_accumulator = 1.0;
+
+ double loop_decay_rate = 1.00;
+ double last_loop_decay_rate = 1.00;
+
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+
+ unsigned int allow_alt_ref = is_altref_enabled(cpi);
+
+ int f_boost = 0;
+ int b_boost = 0;
+ int flash_detected;
+ int64_t gf_group_bits;
+ double gf_group_error_left;
+ int gf_arf_bits;
+ const int is_intra_only = frame_params->frame_type == KEY_FRAME ||
+ frame_params->frame_type == INTRA_ONLY_FRAME;
+ const int arf_active_or_kf = is_intra_only || rc->source_alt_ref_active;
+
+ cpi->internal_altref_allowed = (oxcf->gf_max_pyr_height > 1);
+
+ // Reset the GF group data structures unless this is a key
+ // frame in which case it will already have been done.
+ if (!is_intra_only) {
+ av1_zero(twopass->gf_group);
+ }
+
+ aom_clear_system_state();
+ av1_zero(next_frame);
+
+ // Load stats for the current frame.
+ mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Note the error of the frame at the start of the group. This will be
+ // the GF frame error if we code a normal gf.
+ gf_first_frame_err = mod_frame_err;
+
+ // If this is a key frame or the overlay from a previous arf then
+ // the error score / cost of this frame has already been accounted for.
+ if (arf_active_or_kf) {
+ gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error -= this_frame->coded_error;
+#endif
+ gf_group_skip_pct -= this_frame->intra_skip_pct;
+ gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
+ }
+ // Motion breakout threshold for loop below depends on image size.
+ const double mv_ratio_accumulator_thresh =
+ (cpi->initial_height + cpi->initial_width) / 4.0;
+
+ // TODO(urvang): Try logic to vary min and max interval based on q.
+ const int active_min_gf_interval = rc->min_gf_interval;
+ const int active_max_gf_interval =
+ AOMMIN(rc->max_gf_interval, get_fixed_gf_length(oxcf->gf_max_pyr_height));
+
+ double avg_sr_coded_error = 0;
+ double avg_raw_err_stdev = 0;
+ int non_zero_stdev_count = 0;
+
+ i = 0;
+ while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
+ ++i;
+
+ // Accumulate error score of frames in this gf group.
+ mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error += this_frame->coded_error;
+#endif
+ gf_group_skip_pct += this_frame->intra_skip_pct;
+ gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
+
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ // Test for the case where there is a brief flash but prediction quality,
+ // relative to an earlier frame, is then restored.
+ flash_detected = detect_flash(twopass, 0);
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(
+ &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ // Sum up the metric values of the current gf group.
+ avg_sr_coded_error += next_frame.sr_coded_error;
+ if (fabs(next_frame.raw_error_stdev) > 0.000001) {
+ non_zero_stdev_count++;
+ avg_raw_err_stdev += next_frame.raw_error_stdev;
+ }
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ last_loop_decay_rate = loop_decay_rate;
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+
+ // Monitor for static sections.
+ if ((rc->frames_since_key + i - 1) > 1) {
+ zero_motion_accumulator = AOMMIN(
+ zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+ }
+
+ // Break clause to detect very still sections after motion. For example,
+ // a static image after a fade or other transition.
+ if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+ last_loop_decay_rate)) {
+ allow_alt_ref = 0;
+ break;
+ }
+ }
+
+ // Calculate a boost number for this frame.
+ boost_score +=
+ decay_accumulator *
+ calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+ // If the section is almost totally static, the max GF length will not be
+ // used later, so we can continue scanning for more frames.
+ if ((i >= active_max_gf_interval + 1) &&
+ !is_almost_static(zero_motion_accumulator,
+ twopass->kf_zeromotion_pct)) {
+ break;
+ }
+
+ // Some conditions for breaking out after the min interval.
+ if (i >= active_min_gf_interval &&
+ // If possible don't break very close to a kf
+ (rc->frames_to_key - i >= rc->min_gf_interval) && (i & 0x01) &&
+ !flash_detected &&
+ (mv_ratio_accumulator > mv_ratio_accumulator_thresh ||
+ abs_mv_in_out_accumulator > ARF_ABS_ZOOM_THRESH)) {
+ break;
+ }
+ *this_frame = next_frame;
+ }
+
+ // Was the group length constrained by the requirement for a new KF?
+ rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+ assert(num_mbs > 0);
+ if (i) avg_sr_coded_error /= i;
+
+ if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
+
+ // Disable internal ARFs for "still" gf groups.
+ // zero_motion_accumulator: minimum percentage of (0,0) motion;
+ // avg_sr_coded_error: average of the SSE per pixel of each frame;
+ // avg_raw_err_stdev: average of the standard deviation of (0,0)
+ // motion error per block of each frame.
+ if (zero_motion_accumulator > MIN_ZERO_MOTION &&
+ avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+ avg_raw_err_stdev < MAX_RAW_ERR_VAR) {
+ cpi->internal_altref_allowed = 0;
+ }
+
+ const int use_alt_ref =
+ !is_almost_static(zero_motion_accumulator, twopass->kf_zeromotion_pct) &&
+ allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
+ (i >= rc->min_gf_interval) &&
+ (cpi->oxcf.gf_max_pyr_height > MIN_PYRAMID_LVL);
+
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+ int alt_offset = 0;
+ // The length reduction strategy below is only applied where it is known to
+ // help: for AOM_Q at moderate quality (cq_level <= 128) or when internal
+ // ARFs are disallowed, and never for lossless coding.
+ const int allow_gf_length_reduction =
+ ((cpi->oxcf.rc_mode == AOM_Q && cpi->oxcf.cq_level <= 128) ||
+ !cpi->internal_altref_allowed) &&
+ !is_lossless_requested(&cpi->oxcf);
+
+ if (allow_gf_length_reduction && use_alt_ref) {
+ // Adjust the length of this gf group if one of the following conditions is
+ // met:
+ // 1: only one overlay frame is left and this gf group is too long;
+ // 2: the next gf group would be too short to have an arf, compared to the
+ // current gf group.
+
+ // Maximum length of the next gf group.
+ const int next_gf_len = rc->frames_to_key - i;
+ const int single_overlay_left =
+ next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+ // The next gf group will probably have an arf, but it will be shorter than
+ // this one.
+ const int unbalanced_gf =
+ i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 >= rc->min_gf_interval;
+
+ if (single_overlay_left || unbalanced_gf) {
+ const int roll_back = REDUCE_GF_LENGTH_BY;
+ // Reduce length only if active_min_gf_interval will be respected later.
+ if (i - roll_back >= active_min_gf_interval + 1) {
+ alt_offset = -roll_back;
+ i -= roll_back;
+ }
+ }
+ }
+
+ // Should we use the alternate reference frame?
+ if (use_alt_ref) {
+ // Calculate the boost for alt ref.
+ rc->gfu_boost =
+ calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost);
+ rc->source_alt_ref_pending = 1;
+
+ // Do not replace the ARF with an overlay frame; keep it as GOLDEN_REF.
+ cpi->preserve_arf_as_gld = 1;
+ } else {
+ rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
+ rc->source_alt_ref_pending = 0;
+ cpi->preserve_arf_as_gld = 0;
+ }
+
+ // Set the interval until the next gf.
+ // If forward keyframes are enabled, ensure the final gf group obeys the
+ // MIN_FWD_KF_INTERVAL.
+ if (cpi->oxcf.fwd_kf_enabled &&
+ ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) {
+ if (i == rc->frames_to_key) {
+ rc->baseline_gf_interval = i;
+ // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
+ } else if ((rc->frames_to_key - i <
+ AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
+ (rc->frames_to_key != i)) {
+ // if possible, merge the last two gf groups
+ if (rc->frames_to_key <= active_max_gf_interval) {
+ rc->baseline_gf_interval = rc->frames_to_key;
+ // if merging the last two gf groups creates a group that is too long,
+ // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
+ } else {
+ rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
+ }
+ } else {
+ rc->baseline_gf_interval = i - rc->source_alt_ref_pending;
+ }
+ } else {
+ rc->baseline_gf_interval = i - rc->source_alt_ref_pending;
+ }
+
+#define LAST_ALR_BOOST_FACTOR 0.2f
+ rc->arf_boost_factor = 1.0;
+ if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
+ // Reduce the boost of altref in the last gf group
+ if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
+ rc->frames_to_key - i == 0) {
+ rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+ }
+ }
+
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
+
+ // Calculate the bits to be allocated to the gf/arf group as a whole
+ gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+
+#if GROUP_ADAPTIVE_MAXQ
+ // Calculate an estimate of the maxq needed for the group.
+ // We are more aggressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
+ const int vbr_group_bits_per_frame =
+ (int)(gf_group_bits / rc->baseline_gf_interval);
+ const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
+ const double group_av_skip_pct =
+ gf_group_skip_pct / rc->baseline_gf_interval;
+ const double group_av_inactive_zone =
+ ((gf_group_inactive_zone_rows * 2) /
+ (rc->baseline_gf_interval * (double)cm->mb_rows));
+
+ int tmp_q;
+ // rc factor is a weight factor that corrects for local rate control drift.
+ double rc_factor = 1.0;
+ if (rc->rate_error_estimate > 0) {
+ rc_factor = AOMMAX(RC_FACTOR_MIN,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ } else {
+ rc_factor = AOMMIN(RC_FACTOR_MAX,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ }
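+ // rc_factor lies in [RC_FACTOR_MIN, RC_FACTOR_MAX]: accumulated undershoot
+ // (rate_error_estimate > 0) pulls it below 1.0 and accumulated overshoot
+ // pushes it above 1.0, so the group maxq estimate below is steered in the
+ // corrective direction.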
+ tmp_q = get_twopass_worst_quality(
+ cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+ vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor);
+ twopass->active_worst_quality =
+ AOMMAX(tmp_q, twopass->active_worst_quality >> 1);
+ }
+#endif
+
+ // Calculate the extra bits to be used for boosted frame(s)
+ gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost,
+ gf_group_bits);
+
+ // Adjust KF group bits and error remaining.
+ twopass->kf_group_error_left -= (int64_t)gf_group_err;
+
+ // If this is an arf update we want to remove the score for the overlay
+ // frame at the end which will usually be very cheap to code.
+ // The overlay frame has already, in effect, been coded so we want to spread
+ // the remaining bits among the other frames.
+ // For normal GFs remove the score for the GF itself unless this is
+ // also a key frame in which case it has already been accounted for.
+ if (rc->source_alt_ref_pending) {
+ gf_group_error_left = gf_group_err - mod_frame_err;
+ } else if (!is_intra_only) {
+ gf_group_error_left = gf_group_err - gf_first_frame_err;
+ } else {
+ gf_group_error_left = gf_group_err;
+ }
+
+ // Set up the structure of this Group-Of-Pictures (same as GF_GROUP)
+ av1_gop_setup_structure(cpi, frame_params);
+
+ // Allocate bits to each of the frames in the GF group.
+ allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits,
+ frame_params);
+
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ if (frame_params->frame_type != KEY_FRAME) {
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
+ }
+}
+
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letter box
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case, even if the frame is not a scene cut, coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra/inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+#define KF_II_MAX 128.0
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+// We adapt the threshold based on number of frames in this key-frame group so
+// far.
+static double get_second_ref_usage_thresh(int frame_count_so_far) {
+ const int adapt_upto = 32;
+ const double min_second_ref_usage_thresh = 0.085;
+ const double second_ref_usage_thresh_max_delta = 0.035;
+ if (frame_count_so_far >= adapt_upto) {
+ return min_second_ref_usage_thresh + second_ref_usage_thresh_max_delta;
+ }
+ return min_second_ref_usage_thresh +
+ ((double)frame_count_so_far / (adapt_upto - 1)) *
+ second_ref_usage_thresh_max_delta;
+}
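+// e.g. the threshold is 0.085 at frame 0, ramps linearly to 0.12 by frame
+// 31, and stays at 0.12 from frame 32 onwards.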
+
+static int test_candidate_kf(TWO_PASS *twopass,
+ const FIRSTPASS_STATS *last_frame,
+ const FIRSTPASS_STATS *this_frame,
+ const FIRSTPASS_STATS *next_frame,
+ int frame_count_so_far) {
+ int is_viable_kf = 0;
+ double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+ double modified_pcnt_inter =
+ this_frame->pcnt_inter - this_frame->pcnt_neutral;
+ const double second_ref_usage_thresh =
+ get_second_ref_usage_thresh(frame_count_so_far);
+
+ // Does the frame satisfy the primary criteria of a key frame?
+ // See above for an explanation of the test criteria.
+ // If so, then examine how well it predicts subsequent frames.
+ if ((this_frame->pcnt_second_ref < second_ref_usage_thresh) &&
+ (next_frame->pcnt_second_ref < second_ref_usage_thresh) &&
+ ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
+ ((pcnt_intra > MIN_INTRA_LEVEL) &&
+ (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
+ ((this_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+ KF_II_ERR_THRESHOLD) &&
+ ((fabs(last_frame->coded_error - this_frame->coded_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ (fabs(last_frame->intra_error - this_frame->intra_error) /
+ DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+ ERR_CHANGE_THRESHOLD) ||
+ ((next_frame->intra_error /
+ DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+ II_IMPROVEMENT_THRESHOLD))))) {
+ int i;
+ const FIRSTPASS_STATS *start_pos = twopass->stats_in;
+ FIRSTPASS_STATS local_next_frame = *next_frame;
+ double boost_score = 0.0;
+ double old_boost_score = 0.0;
+ double decay_accumulator = 1.0;
+
+ // Examine how well the key frame predicts subsequent frames.
+ for (i = 0; i < 16; ++i) {
+ double next_iiratio = (BOOST_FACTOR * local_next_frame.intra_error /
+ DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error));
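+ // next_iiratio reuses BOOST_FACTOR (12.5) to scale the intra/inter error
+ // ratio of the following frame; it is capped at KF_II_MAX below.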
+
+ if (next_iiratio > KF_II_MAX) next_iiratio = KF_II_MAX;
+
+ // Cumulative effect of decay in prediction quality.
+ if (local_next_frame.pcnt_inter > 0.85)
+ decay_accumulator *= local_next_frame.pcnt_inter;
+ else
+ decay_accumulator *= (0.85 + local_next_frame.pcnt_inter) / 2.0;
+
+ // Keep a running total.
+ boost_score += (decay_accumulator * next_iiratio);
+
+ // Test various breakout clauses.
+ if ((local_next_frame.pcnt_inter < 0.05) || (next_iiratio < 1.5) ||
+ (((local_next_frame.pcnt_inter - local_next_frame.pcnt_neutral) <
+ 0.20) &&
+ (next_iiratio < 3.0)) ||
+ ((boost_score - old_boost_score) < 3.0) ||
+ (local_next_frame.intra_error < 200)) {
+ break;
+ }
+
+ old_boost_score = boost_score;
+
+ // Get the next frame details
+ if (EOF == input_stats(twopass, &local_next_frame)) break;
+ }
+
+ // If there is tolerable prediction for at least the next 3 frames then
+ // accept this as a viable key frame, else discard it and move on.
+ if (boost_score > 30.0 && (i > 3)) {
+ is_viable_kf = 1;
+ } else {
+ // Reset the file position
+ reset_fpf_position(twopass, start_pos);
+
+ is_viable_kf = 0;
+ }
+ }
+
+ return is_viable_kf;
+}
+
+#define FRAMES_TO_CHECK_DECAY 8
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
+#define MIN_KF_BOOST 300 // Minimum boost for non-static KF interval
+#define MIN_STATIC_KF_BOOST 5400 // Minimum boost for static KF interval
+
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ int i, j;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const FIRSTPASS_STATS first_frame = *this_frame;
+ const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS last_frame;
+ int kf_bits = 0;
+ int loop_decay_counter = 0;
+ double decay_accumulator = 1.0;
+ double av_decay_accumulator = 0.0;
+ double zero_motion_accumulator = 1.0;
+ double boost_score = 0.0;
+ double kf_mod_err = 0.0;
+ double kf_group_err = 0.0;
+ double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+
+ av1_zero(next_frame);
+
+ rc->frames_since_key = 0;
+
+ // Reset the GF group data structures.
+ av1_zero(*gf_group);
+
+ // Is this a key frame forced by the interval setting?
+ rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+ // Clear the alt ref active flag and last group multi arf flags as they
+ // can never be set for a key frame.
+ rc->source_alt_ref_active = 0;
+
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+
+ rc->frames_to_key = 1;
+
+ twopass->kf_group_bits = 0; // Total bits available to kf group
+ twopass->kf_group_error_left = 0; // Group modified error score.
+
+ kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Initialize the decay rates for the recent frames to check
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+
+ // Find the next keyframe.
+ i = 0;
+ while (twopass->stats_in < twopass->stats_in_end &&
+ rc->frames_to_key < cpi->oxcf.key_freq) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Load the next frame's stats.
+ last_frame = *this_frame;
+ input_stats(twopass, this_frame);
+
+ // Provided that we are not at the end of the file...
+ if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
+ double loop_decay_rate;
+
+ // Check for a scene cut.
+ if (test_candidate_kf(twopass, &last_frame, this_frame, twopass->stats_in,
+ rc->frames_to_key))
+ break;
+
+ // How fast is the prediction quality decaying?
+ loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
+
+ // Here we are interested in decay over the recent past, rather than (as
+ // elsewhere) in decay in prediction quality since the last GF or KF.
+ recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+ decay_accumulator *= recent_loop_decay[j];
+
+ // Special check for transition or high motion followed by a
+ // static scene.
+ if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
+ loop_decay_rate, decay_accumulator))
+ break;
+
+ // Step on to the next frame.
+ ++rc->frames_to_key;
+
+ // If we don't have a real key frame within the next two
+ // key_freq intervals then break out of the loop.
+ if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+ } else {
+ ++rc->frames_to_key;
+ }
+ ++i;
+ }
+
+ // If there is a max kf interval set by the user we must obey it.
+ // We already break out of the loop above at 2x max.
+ // This code centers the extra kf if the actual natural interval
+ // is between 1x and 2x.
+ if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
+ FIRSTPASS_STATS tmp_frame = first_frame;
+
+ rc->frames_to_key /= 2;
+
+ // Reset to the start of the group.
+ reset_fpf_position(twopass, start_position);
+
+ kf_group_err = 0.0;
+
+ // Rescan to get the correct error data for the forced kf group.
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
+ input_stats(twopass, &tmp_frame);
+ }
+ rc->next_key_frame_forced = 1;
+ } else if (twopass->stats_in == twopass->stats_in_end ||
+ rc->frames_to_key >= cpi->oxcf.key_freq) {
+ rc->next_key_frame_forced = 1;
+ } else {
+ rc->next_key_frame_forced = 0;
+ }
+
+ // Special case for the last key frame of the file.
+ if (twopass->stats_in >= twopass->stats_in_end) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ }
+
+ // Calculate the number of bits that should be assigned to the kf group.
+ if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+ // Maximum number of bits for a single normal frame (not key frame).
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+
+ // Maximum number of bits allocated to the key frame group.
+ int64_t max_grp_bits;
+
+ // Default allocation based on bits left and relative
+ // complexity of the section.
+ twopass->kf_group_bits = (int64_t)(
+ twopass->bits_left * (kf_group_err / twopass->modified_error_left));
+
+ // Clip based on maximum per frame rate defined by the user.
+ max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+ if (twopass->kf_group_bits > max_grp_bits)
+ twopass->kf_group_bits = max_grp_bits;
+ } else {
+ twopass->kf_group_bits = 0;
+ }
+ twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
+
+ // Reset the first pass file position.
+ reset_fpf_position(twopass, start_position);
+
+ // Scan through the kf group collating various stats used to determine
+ // how many bits to spend on it.
+ decay_accumulator = 1.0;
+ boost_score = 0.0;
+ const double kf_max_boost =
+ cpi->oxcf.rc_mode == AOM_Q
+ ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+ KF_MAX_FRAME_BOOST)
+ : KF_MAX_FRAME_BOOST;
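+ // In AOM_Q mode the per-frame boost cap scales with the group length
+ // (2.0 * frames_to_key), clamped to [KF_MIN_FRAME_BOOST, KF_MAX_FRAME_BOOST];
+ // other rc modes always use KF_MAX_FRAME_BOOST.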
+ for (i = 0; i < (rc->frames_to_key - 1); ++i) {
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ // Monitor for static sections.
+ // For the first frame in the kf group, the second ref indicator is invalid.
+ if (i > 0) {
+ zero_motion_accumulator = AOMMIN(
+ zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+ } else {
+ zero_motion_accumulator = next_frame.pcnt_inter - next_frame.pcnt_motion;
+ }
+
+ // Not all frames in the group are necessarily used in calculating boost.
+ if ((i <= rc->max_gf_interval) ||
+ ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
+ const double frame_boost =
+ calc_frame_boost(cpi, this_frame, 0, kf_max_boost);
+
+ // How fast is prediction quality decaying?
+ if (!detect_flash(twopass, 0)) {
+ const double loop_decay_rate =
+ get_prediction_decay_rate(cpi, &next_frame);
+ decay_accumulator *= loop_decay_rate;
+ decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR);
+ av_decay_accumulator += decay_accumulator;
+ ++loop_decay_counter;
+ }
+ boost_score += (decay_accumulator * frame_boost);
+ }
+ }
+ if (loop_decay_counter > 0)
+ av_decay_accumulator /= (double)loop_decay_counter;
+
+ reset_fpf_position(twopass, start_position);
+
+ // Store the zero motion percentage
+ twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_position, twopass->stats_in_end, rc->frames_to_key);
+
+ rc->kf_boost = (int)(av_decay_accumulator * boost_score);
+
+ // Special case for static / slide show content but don't apply
+ // if the kf group is very short.
+ if ((zero_motion_accumulator > STATIC_KF_GROUP_FLOAT_THRESH) &&
+ (rc->frames_to_key > 8)) {
+ rc->kf_boost = AOMMAX(rc->kf_boost, MIN_STATIC_KF_BOOST);
+ } else {
+ // Apply various clamps for min and max boost
+ rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
+ rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
+ }
+
+ // Work out how many bits to allocate for the key frame itself.
+ kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
+ twopass->kf_group_bits);
+ // printf("kf boost = %d kf_bits = %d kf_zeromotion_pct = %d\n", rc->kf_boost,
+ // kf_bits, twopass->kf_zeromotion_pct);
+
+ // Work out the fraction of the kf group bits reserved for the inter frames
+ // within the group after discounting the bits for the kf itself.
+ if (twopass->kf_group_bits) {
+ twopass->kfgroup_inter_fraction =
+ (double)(twopass->kf_group_bits - kf_bits) /
+ (double)twopass->kf_group_bits;
+ } else {
+ twopass->kfgroup_inter_fraction = 1.0;
+ }
+
+ twopass->kf_group_bits -= kf_bits;
+
+ // Save the bits to spend on the key frame.
+ gf_group->bit_allocation[0] = kf_bits;
+ gf_group->update_type[0] = KF_UPDATE;
+
+ // Note the total error score of the kf group minus the key frame itself.
+ twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+ // Adjust the count of total modified error left.
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
+ twopass->modified_error_left -= kf_group_err;
+}
+
+static int is_skippable_frame(const AV1_COMP *cpi) {
+ // If no non-zero motion vector was detected for the current frame in the
+ // first pass, and the same holds for its previous and next frames, then
+ // partition search can be skipped for this frame and the partition size
+ // assigned according to the variance.
+ const TWO_PASS *const twopass = &cpi->twopass;
+
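+ // pcnt_inter - pcnt_motion == 1.0 below means every block in the frame was
+ // inter coded with a zero motion vector.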
+ return (!frame_is_intra_only(&cpi->common) &&
+ twopass->stats_in - 2 > twopass->stats_in_start &&
+ twopass->stats_in < twopass->stats_in_end &&
+ (twopass->stats_in - 1)->pcnt_inter -
+ (twopass->stats_in - 1)->pcnt_motion ==
+ 1 &&
+ (twopass->stats_in - 2)->pcnt_inter -
+ (twopass->stats_in - 2)->pcnt_motion ==
+ 1 &&
+ twopass->stats_in->pcnt_inter - twopass->stats_in->pcnt_motion == 1);
+}
+
+#define ARF_STATS_OUTPUT 0
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+#define DEFAULT_GRP_WEIGHT 1.0
+
+void av1_get_second_pass_params(AV1_COMP *cpi,
+ EncodeFrameParams *const frame_params,
+ unsigned int frame_flags) {
+ AV1_COMMON *const cm = &cpi->common;
+ CurrentFrame *const current_frame = &cm->current_frame;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ int frames_left;
+ FIRSTPASS_STATS this_frame;
+
+ int target_rate;
+
+ frames_left = (int)(twopass->total_stats.count - current_frame->frame_number);
+
+ if (!twopass->stats_in) return;
+
+ // If this is an arf frame then we don't want to read the stats file or
+ // advance the input pointer as we already have what we need.
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+ gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+ target_rate = gf_group->bit_allocation[gf_group->index];
+ target_rate = av1_rc_clamp_pframe_target_size(
+ cpi, target_rate, gf_group->update_type[gf_group->index]);
+ rc->base_frame_target = target_rate;
+
+ if (cpi->no_show_kf) {
+ assert(gf_group->update_type[gf_group->index] == ARF_UPDATE);
+ frame_params->frame_type = KEY_FRAME;
+ } else {
+ frame_params->frame_type = INTER_FRAME;
+ }
+
+ // Do the firstpass stats indicate that this frame is skippable for the
+ // partition search?
+ if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+ cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+ }
+
+ return;
+ }
+
+ aom_clear_system_state();
+
+ if (cpi->oxcf.rc_mode == AOM_Q) {
+ twopass->active_worst_quality = cpi->oxcf.cq_level;
+ } else if (current_frame->frame_number == 0) {
+ // Special case code for first frame.
+ const int section_target_bandwidth =
+ (int)(twopass->bits_left / frames_left);
+ const double section_length = twopass->total_left_stats.count;
+ const double section_error =
+ twopass->total_left_stats.coded_error / section_length;
+ const double section_intra_skip =
+ twopass->total_left_stats.intra_skip_pct / section_length;
+ const double section_inactive_zone =
+ (twopass->total_left_stats.inactive_zone_rows * 2) /
+ ((double)cm->mb_rows * section_length);
+ const int tmp_q = get_twopass_worst_quality(
+ cpi, section_error, section_intra_skip + section_inactive_zone,
+ section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
+ twopass->active_worst_quality = tmp_q;
+ twopass->baseline_active_worst_quality = tmp_q;
+ rc->ni_av_qi = tmp_q;
+ rc->last_q[INTER_FRAME] = tmp_q;
+ rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
+ rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+ rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+ rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+ }
+
+ av1_zero(this_frame);
+ if (EOF == input_stats(twopass, &this_frame)) return;
+
+ // Set the frame content type flag.
+ if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
+ twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+ else
+ twopass->fr_content_type = FC_NORMAL;
+
+ // Keyframe and section processing.
+ if (rc->frames_to_key == 0 || (frame_flags & FRAMEFLAGS_KEY)) {
+ FIRSTPASS_STATS this_frame_copy;
+ this_frame_copy = this_frame;
+ frame_params->frame_type = KEY_FRAME;
+ // Define next KF group and assign bits to it.
+ find_next_key_frame(cpi, &this_frame);
+ this_frame = this_frame_copy;
+ } else {
+ frame_params->frame_type = INTER_FRAME;
+ }
+
+ // Define a new GF/ARF group. (Should always enter here for key frames).
+ if (rc->frames_till_gf_update_due == 0) {
+ define_gf_group(cpi, &this_frame, frame_params);
+
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+#if ARF_STATS_OUTPUT
+ {
+ FILE *fpfile;
+ fpfile = fopen("arf.stt", "a");
+ ++arf_count;
+ fprintf(fpfile, "%10d %10d %10d %10d %10d\n", current_frame->frame_number,
+ rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
+ rc->gfu_boost);
+
+ fclose(fpfile);
+ }
+#endif
+ }
+
+ // Do the firstpass stats indicate that this frame is skippable for the
+ // partition search?
+ if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+ cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+ }
+
+ target_rate = gf_group->bit_allocation[gf_group->index];
+
+ if (frame_params->frame_type == KEY_FRAME) {
+ target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate);
+ } else {
+ target_rate = av1_rc_clamp_pframe_target_size(
+ cpi, target_rate, gf_group->update_type[gf_group->index]);
+ }
+
+ rc->base_frame_target = target_rate;
+
+ {
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+ ? cpi->initial_mbs
+ : cpi->common.MBs;
+ // The multiplication by 256 reverses a scaling factor of (>> 8)
+ // applied when combining MB error values for the frame.
+ twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0);
+ twopass->frame_avg_haar_energy =
+ log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0);
+ }
+
+ // Update the total stats remaining structure.
+ subtract_stats(&twopass->total_left_stats, &this_frame);
+}
+
+void av1_init_second_pass(AV1_COMP *cpi) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ double frame_rate;
+ FIRSTPASS_STATS *stats;
+
+ av1_twopass_zero_stats(&twopass->total_stats);
+ av1_twopass_zero_stats(&twopass->total_left_stats);
+
+ if (!twopass->stats_in_end) return;
+
+ stats = &twopass->total_stats;
+
+ *stats = *twopass->stats_in_end;
+ twopass->total_left_stats = *stats;
+
+ frame_rate = 10000000.0 * stats->count / stats->duration;
+ // Each frame can have a different duration, as the frame rate in the source
+ // isn't guaranteed to be constant. The frame rate prior to the first frame
+ // encoded in the second pass is a guess. However, the sum duration is not.
+ // It is calculated based on the actual durations of all frames from the
+ // first pass.
+ av1_new_framerate(cpi, frame_rate);
+ twopass->bits_left =
+ (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
+
+ // This variable monitors how far behind the second ref update is lagging.
+ twopass->sr_update_lag = 1;
+
+ // Scan the first pass file and calculate a modified total error based upon
+ // the bias/power function used to allocate bits.
+ {
+ const double avg_error =
+ stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
+ const FIRSTPASS_STATS *s = twopass->stats_in;
+ double modified_error_total = 0.0;
+ twopass->modified_error_min =
+ (avg_error * oxcf->two_pass_vbrmin_section) / 100;
+ twopass->modified_error_max =
+ (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+ while (s < twopass->stats_in_end) {
+ modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
+ ++s;
+ }
+ twopass->modified_error_left = modified_error_total;
+ }
+
+ // Reset the vbr bits off target counters
+ cpi->rc.vbr_bits_off_target = 0;
+ cpi->rc.vbr_bits_off_target_fast = 0;
+
+ cpi->rc.rate_error_estimate = 0;
+
+ // Static sequence monitor variables.
+ twopass->kf_zeromotion_pct = 100;
+ twopass->last_kfgroup_zeromotion_pct = 100;
+}
+
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+#define HIGH_UNDERSHOOT_RATIO 2
+void av1_twopass_postencode_update(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+ const int bits_used = rc->base_frame_target;
+
+ // VBR correction is done through rc->vbr_bits_off_target. Based on the
+ // sign of this value, a limited % adjustment is made to the target rate
+ // of subsequent frames, to try and push it back towards 0. This method
+ // is designed to prevent extreme behaviour at the end of a clip
+ // or group of frames.
+ rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+ twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0);
+
+ // Calculate the pct rc error.
+ if (rc->total_actual_bits) {
+ rc->rate_error_estimate =
+ (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
+ rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+ } else {
+ rc->rate_error_estimate = 0;
+ }
+
+ if (cpi->common.current_frame.frame_type != KEY_FRAME) {
+ twopass->kf_group_bits -= bits_used;
+ twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+ }
+ twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
+
+ // If the rate control is drifting consider adjustment to min or maxq.
+ if ((cpi->oxcf.rc_mode != AOM_Q) && !cpi->rc.is_src_frame_alt_ref) {
+ const int maxq_adj_limit =
+ rc->worst_quality - twopass->active_worst_quality;
+ const int minq_adj_limit =
+ (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+
+ // Undershoot.
+ if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+ --twopass->extend_maxq;
+ if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+ ++twopass->extend_minq;
+ // Overshoot.
+ } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+ --twopass->extend_minq;
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ ++twopass->extend_maxq;
+ } else {
+ // Adjustment for extreme local overshoot.
+ if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+ rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+ ++twopass->extend_maxq;
+
+ // Unwind undershoot or overshoot adjustment.
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ --twopass->extend_minq;
+ else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+ --twopass->extend_maxq;
+ }
+
+ twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
+ twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+ // If there is a big and unexpected undershoot then feed the extra
+ // bits back in quickly. One situation where this may happen is if a
+ // frame is unexpectedly almost perfectly predicted by the ARF or GF
+ // but not very well predicted by the previous frame.
+ if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+ int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+ if (rc->projected_frame_size < fast_extra_thresh) {
+ rc->vbr_bits_off_target_fast +=
+ fast_extra_thresh - rc->projected_frame_size;
+ rc->vbr_bits_off_target_fast =
+ AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+ // Fast adaptation of minQ if necessary to use up the extra bits.
+ if (rc->avg_frame_bandwidth) {
+ twopass->extend_minq_fast =
+ (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+ }
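+ // e.g. a surplus equal to one average frame's bits maps to 8 steps of
+ // fast minq extension, before the clamp below.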
+ twopass->extend_minq_fast = AOMMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else if (rc->vbr_bits_off_target_fast) {
+ twopass->extend_minq_fast = AOMMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else {
+ twopass->extend_minq_fast = 0;
+ }
+ }
+ }
+}
diff --git a/libaom/av1/encoder/pass2_strategy.h b/libaom/av1/encoder/pass2_strategy.h
new file mode 100644
index 0000000..bf37746
--- /dev/null
+++ b/libaom/av1/encoder/pass2_strategy.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PASS2_STRATEGY_H_
+#define AOM_AV1_ENCODER_PASS2_STRATEGY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct EncodeFrameParams;
+
+void av1_init_second_pass(struct AV1_COMP *cpi);
+
+void av1_get_second_pass_params(struct AV1_COMP *cpi,
+ struct EncodeFrameParams *const frame_params,
+ unsigned int frame_flags);
+
+void av1_twopass_postencode_update(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PASS2_STRATEGY_H_
diff --git a/libaom/av1/encoder/picklpf.c b/libaom/av1/encoder/picklpf.c
index b6b84c8..aca089c 100644
--- a/libaom/av1/encoder/picklpf.c
+++ b/libaom/av1/encoder/picklpf.c
@@ -70,24 +70,24 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
// TODO(any): please enable multi-thread and remove the flag when loop
// filter mask is compatible with multi-thread.
if (cpi->num_workers > 1)
- av1_loop_filter_frame_mt(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+ av1_loop_filter_frame_mt(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd, plane,
plane + 1, partial_frame,
#if LOOP_FILTER_BITMASK
0,
#endif
cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
else
- av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd,
+ av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd,
#if LOOP_FILTER_BITMASK
0,
#endif
plane, plane + 1, partial_frame);
- filt_err = aom_get_sse_plane(sd, cm->frame_to_show, plane,
+ filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane,
cm->seq_params.use_highbitdepth);
// Re-instate the unfiltered frame
- yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane);
+ yv12_copy_plane(&cpi->last_frame_uf, &cm->cur_frame->buf, plane);
return filt_err;
}
@@ -108,7 +108,17 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
// range.
int lvl;
switch (plane) {
- case 0: lvl = last_frame_filter_level[dir]; break;
+ case 0:
+ switch (dir) {
+ case 2:
+ lvl = (last_frame_filter_level[0] + last_frame_filter_level[1] + 1) >>
+ 1;
+ break;
+ case 0:
+ case 1: lvl = last_frame_filter_level[dir]; break;
+ default: assert(dir >= 0 && dir <= 2); return 0;
+ }
+ break;
case 1: lvl = last_frame_filter_level[2]; break;
case 2: lvl = last_frame_filter_level[3]; break;
default: assert(plane >= 0 && plane <= 2); return 0;
@@ -120,7 +130,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
// Set each entry to -1
memset(ss_err, 0xFF, sizeof(ss_err));
- yv12_copy_plane(cm->frame_to_show, &cpi->last_frame_uf, plane);
+ yv12_copy_plane(&cm->cur_frame->buf, &cpi->last_frame_uf, plane);
best_err = try_filter_frame(sd, cpi, filt_mid, partial_frame, plane, dir);
filt_best = filt_mid;
ss_err[filt_mid] = best_err;
@@ -203,19 +213,25 @@ void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
const int min_filter_level = 0;
const int max_filter_level = av1_get_max_filter_level(cpi);
const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth);
+ // Based on test results for the RTC test set:
+ // 0.04590 boosted or 0.02295 non-boosted, in 18-bit fixed point.
+ const int strength_boost_q_threshold = 700;
+ const int inter_frame_multiplier =
+ q > strength_boost_q_threshold ? 12034 : 6017;
// These values were determined by linear fitting the result of the
// searched level for 8 bit depth:
// Keyframes: filt_guess = q * 0.06699 - 1.60817
- // Other frames: filt_guess = q * 0.02295 + 2.48225
+ // Other frames: filt_guess = q * inter_frame_multiplier + 2.48225
//
// And high bit depth separately:
// filt_guess = q * 0.316206 + 3.87252
int filt_guess;
switch (cm->seq_params.bit_depth) {
case AOM_BITS_8:
- filt_guess = (cm->current_frame.frame_type == KEY_FRAME)
- ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
- : ROUND_POWER_OF_TWO(q * 6017 + 650707, 18);
+ filt_guess =
+ (cm->current_frame.frame_type == KEY_FRAME)
+ ? ROUND_POWER_OF_TWO(q * 17563 - 421574, 18)
+ : ROUND_POWER_OF_TWO(q * inter_frame_multiplier + 650707, 18);
break;
case AOM_BITS_10:
filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
diff --git a/libaom/av1/encoder/pickrst.c b/libaom/av1/encoder/pickrst.c
index a7fab16..1b4f26c 100644
--- a/libaom/av1/encoder/pickrst.c
+++ b/libaom/av1/encoder/pickrst.c
@@ -140,7 +140,7 @@ static void init_rsc(const YV12_BUFFER_CONFIG *src, const AV1_COMMON *cm,
rsc->rusi = rusi;
rsc->sf = sf;
- const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+ const YV12_BUFFER_CONFIG *dgd = &cm->cur_frame->buf;
const int is_uv = plane != AOM_PLANE_Y;
rsc->plane_width = src->crop_widths[is_uv];
rsc->plane_height = src->crop_heights[is_uv];
@@ -165,7 +165,7 @@ static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
const int bit_depth = cm->seq_params.bit_depth;
const int highbd = cm->seq_params.use_highbitdepth;
- const YV12_BUFFER_CONFIG *fts = cm->frame_to_show;
+ const YV12_BUFFER_CONFIG *fts = &cm->cur_frame->buf;
// TODO(yunqing): For now, only use optimized LR filter in decoder. Can be
// also used in encoder.
const int optimized_lr = 0;
@@ -200,7 +200,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
v += xq[0] * (flt0[j] - u) + xq[1] * (flt1[j] - u);
const int32_t e =
ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -216,7 +216,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
v += xq[0] * (flt0[j] - u);
const int32_t e =
ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -231,7 +231,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
v += xq[1] * (flt1[j] - u);
const int32_t e =
ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) - src[j];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -241,7 +241,7 @@ int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
const int32_t e = (int32_t)(dat[j]) - src[j];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -276,7 +276,7 @@ int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width,
v += xq0 * v0;
v += xq1 * v1;
const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
flt0 += flt0_stride;
@@ -304,7 +304,7 @@ int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width,
int32_t v = half;
v += exq * (flt[j] - u);
const int32_t e = (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
flt += flt_stride;
@@ -316,7 +316,7 @@ int64_t av1_highbd_pixel_proj_error_c(const uint8_t *src8, int width,
const int32_t d = dat[j];
const int32_t s = src[j];
const int32_t e = d - s;
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -1281,7 +1281,7 @@ static void search_norestore(const RestorationTileLimits *limits,
const int highbd = rsc->cm->seq_params.use_highbitdepth;
rusi->sse[RESTORE_NONE] = sse_restoration_unit(
- limits, rsc->src, rsc->cm->frame_to_show, rsc->plane, highbd);
+ limits, rsc->src, &rsc->cm->cur_frame->buf, rsc->plane, highbd);
rsc->sse += rusi->sse[RESTORE_NONE];
}
@@ -1413,20 +1413,22 @@ void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
RestorationType best_rtype = RESTORE_NONE;
const int highbd = rsc.cm->seq_params.use_highbitdepth;
- extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
- rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
- highbd);
+ if (!cpi->sf.disable_loop_restoration_chroma || !plane) {
+ extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
+ rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
+ highbd);
- for (RestorationType r = 0; r < num_rtypes; ++r) {
- if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) &&
- (r != force_restore_type))
- continue;
+ for (RestorationType r = 0; r < num_rtypes; ++r) {
+ if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) &&
+ (r != force_restore_type))
+ continue;
- double cost = search_rest_type(&rsc, r);
+ double cost = search_rest_type(&rsc, r);
- if (r == 0 || cost < best_cost) {
- best_cost = cost;
- best_rtype = r;
+ if (r == 0 || cost < best_cost) {
+ best_cost = cost;
+ best_rtype = r;
+ }
}
}
diff --git a/libaom/av1/encoder/ratectrl.c b/libaom/av1/encoder/ratectrl.c
index 21632c0..861c737 100644
--- a/libaom/av1/encoder/ratectrl.c
+++ b/libaom/av1/encoder/ratectrl.c
@@ -29,6 +29,8 @@
#include "av1/common/seg_common.h"
#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encode_strategy.h"
+#include "av1/encoder/gop_structure.h"
#include "av1/encoder/random.h"
#include "av1/encoder/ratectrl.h"
@@ -96,18 +98,13 @@ static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) {
// fit to the original data (after plotting real maxq vs minq (not q index))
static int get_minq_index(double maxq, double x3, double x2, double x1,
aom_bit_depth_t bit_depth) {
- int i;
const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
  // Special case handling to deal with the step from q 2.0
// down to lossless mode represented by q 1.0.
if (minqtarget <= 2.0) return 0;
- for (i = 0; i < QINDEX_RANGE; i++) {
- if (minqtarget <= av1_convert_qindex_to_q(i, bit_depth)) return i;
- }
-
- return QINDEX_RANGE - 1;
+ return av1_find_qindex(minqtarget, bit_depth, 0, QINDEX_RANGE - 1);
}
static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
@@ -174,13 +171,15 @@ int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
(int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
}
-int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target) {
+int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target,
+ FRAME_UPDATE_TYPE frame_update_type) {
const RATE_CONTROL *rc = &cpi->rc;
const AV1EncoderConfig *oxcf = &cpi->oxcf;
const int min_frame_target =
AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
// Clip the frame target to the minimum setup value.
- if (cpi->rc.is_src_frame_alt_ref) {
+ if (frame_update_type == OVERLAY_UPDATE ||
+ frame_update_type == INTNL_OVERLAY_UPDATE) {
// If there is an active ARF at this location use the minimum
// bits on this frame even if it is a constructed arf.
    // The active maximum quantizer ensures that an appropriate
@@ -219,9 +218,7 @@ static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
RATE_CONTROL *const rc = &cpi->rc;
// Non-viewable frames are a special case and are treated as pure overhead.
- // TODO(zoeliu): To further explore whether we should treat BWDREF_FRAME
- // differently, since it is a no-show frame.
- if (!cm->show_frame && !rc->is_bwd_ref_frame)
+ if (!cm->show_frame)
rc->bits_off_target -= encoded_frame_size;
else
rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
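
With the BWDREF special case removed above, update_buffer_level() treats
every no-show frame as pure overhead in its leaky-bucket model: shown frames
earn the average per-frame budget back, no-show frames only spend. A small
illustrative sketch with made-up frame sizes (not encoder output):

#include <stdio.h>

int main(void) {
  long bits_off_target = 0;
  const int avg_frame_bandwidth = 40000;               /* per-frame budget */
  const int size[4] = { 90000, 30000, 30000, 30000 };  /* ARF then 3 shown */
  const int show[4] = { 0, 1, 1, 1 };
  for (int i = 0; i < 4; ++i) {
    if (!show[i])
      bits_off_target -= size[i];  /* no-show frame: pure overhead */
    else
      bits_off_target += avg_frame_bandwidth - size[i];
    printf("frame %d: bits_off_target = %ld\n", i, bits_off_target);
  }
  return 0;
}
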
@@ -253,9 +250,7 @@ int av1_rc_get_default_min_gf_interval(int width, int height,
int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
int interval = AOMMIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
interval += (interval & 0x01); // Round to even value
-#if CONFIG_FIX_GF_LENGTH
- interval = AOMMAX(FIXED_GF_LENGTH, interval);
-#endif
+ interval = AOMMAX(MAX_GF_INTERVAL, interval);
return AOMMAX(interval, min_gf_interval);
}
@@ -352,6 +347,22 @@ int av1_rc_drop_frame(AV1_COMP *cpi) {
}
}
+static const RATE_FACTOR_LEVEL rate_factor_levels[FRAME_UPDATE_TYPES] = {
+ KF_STD, // KF_UPDATE
+ INTER_NORMAL, // LF_UPDATE
+ GF_ARF_STD, // GF_UPDATE
+ GF_ARF_STD, // ARF_UPDATE
+ INTER_NORMAL, // OVERLAY_UPDATE
+ INTER_NORMAL, // INTNL_OVERLAY_UPDATE
+ GF_ARF_LOW, // INTNL_ARF_UPDATE
+};
+
+static RATE_FACTOR_LEVEL get_rate_factor_level(const GF_GROUP *const gf_group) {
+ const FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+ assert(update_type < FRAME_UPDATE_TYPES);
+ return rate_factor_levels[update_type];
+}
+
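
The table and helper added above replace the per-frame gf_group.rf_level[]
array with a constant mapping from update type to rate-factor bucket. A
self-contained sketch of the same table-driven pattern, with mock enums that
mirror the shape (not necessarily the exact values) of the encoder's:

#include <assert.h>
#include <stdio.h>

typedef enum { KF_UPDATE, LF_UPDATE, GF_UPDATE, ARF_UPDATE, OVERLAY_UPDATE,
               INTNL_OVERLAY_UPDATE, INTNL_ARF_UPDATE,
               FRAME_UPDATE_TYPES } FRAME_UPDATE_TYPE;
typedef enum { INTER_NORMAL, GF_ARF_LOW, GF_ARF_STD, KF_STD,
               RATE_FACTOR_LEVELS } RATE_FACTOR_LEVEL;

static const RATE_FACTOR_LEVEL level_of[FRAME_UPDATE_TYPES] = {
  KF_STD, INTER_NORMAL, GF_ARF_STD, GF_ARF_STD,
  INTER_NORMAL, INTER_NORMAL, GF_ARF_LOW
};

int main(void) {
  for (int t = KF_UPDATE; t < FRAME_UPDATE_TYPES; ++t) {
    assert(level_of[t] < RATE_FACTOR_LEVELS);  /* mirrors the patch's assert */
    printf("update type %d -> rate factor level %d\n", t, (int)level_of[t]);
  }
  return 0;
}
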
static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
int height) {
const RATE_CONTROL *const rc = &cpi->rc;
@@ -360,8 +371,8 @@ static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
if (cpi->common.current_frame.frame_type == KEY_FRAME) {
rcf = rc->rate_correction_factors[KF_STD];
} else if (cpi->oxcf.pass == 2) {
- RATE_FACTOR_LEVEL rf_lvl =
- cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+ const RATE_FACTOR_LEVEL rf_lvl =
+ get_rate_factor_level(&cpi->twopass.gf_group);
rcf = rc->rate_correction_factors[rf_lvl];
} else {
if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
@@ -387,8 +398,8 @@ static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width,
if (cpi->common.current_frame.frame_type == KEY_FRAME) {
rc->rate_correction_factors[KF_STD] = factor;
} else if (cpi->oxcf.pass == 2) {
- RATE_FACTOR_LEVEL rf_lvl =
- cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+ const RATE_FACTOR_LEVEL rf_lvl =
+ get_rate_factor_level(&cpi->twopass.gf_group);
rc->rate_correction_factors[rf_lvl] = factor;
} else {
if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
@@ -474,45 +485,82 @@ void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width,
set_rate_correction_factor(cpi, rate_correction_factor, width, height);
}
+// Calculate rate for the given 'q'.
+static int get_bits_per_mb(const AV1_COMP *cpi, int use_cyclic_refresh,
+ double correction_factor, int q) {
+ const AV1_COMMON *const cm = &cpi->common;
+ return use_cyclic_refresh
+ ? av1_cyclic_refresh_rc_bits_per_mb(cpi, q, correction_factor)
+ : av1_rc_bits_per_mb(cm->current_frame.frame_type, q,
+ correction_factor, cm->seq_params.bit_depth);
+}
+
+// Similar to the find_qindex_by_rate() function defined below, but returns the q
+// index with rate just above or below the desired rate, depending on which of
+// the two rates is closer to the desired rate.
+// Also, respects the selected aq_mode when computing the rate.
+static int find_closest_qindex_by_rate(int desired_bits_per_mb,
+ const AV1_COMP *cpi,
+ double correction_factor,
+ int best_qindex, int worst_qindex) {
+ const int use_cyclic_refresh =
+ cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled;
+
+ // Find 'qindex' based on 'desired_bits_per_mb'.
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const int mid_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, mid);
+ if (mid_bits_per_mb > desired_bits_per_mb) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ assert(low == high);
+
+ // Calculate rate difference of this q index from the desired rate.
+ const int curr_q = low;
+ const int curr_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, curr_q);
+ const int curr_bit_diff = (curr_bits_per_mb <= desired_bits_per_mb)
+ ? desired_bits_per_mb - curr_bits_per_mb
+ : INT_MAX;
+ assert((curr_bit_diff != INT_MAX && curr_bit_diff >= 0) ||
+ curr_q == worst_qindex);
+
+ // Calculate rate difference for previous q index too.
+ const int prev_q = curr_q - 1;
+ int prev_bit_diff;
+ if (curr_bit_diff == INT_MAX || curr_q == best_qindex) {
+ prev_bit_diff = INT_MAX;
+ } else {
+ const int prev_bits_per_mb =
+ get_bits_per_mb(cpi, use_cyclic_refresh, correction_factor, prev_q);
+ assert(prev_bits_per_mb > desired_bits_per_mb);
+ prev_bit_diff = prev_bits_per_mb - desired_bits_per_mb;
+ }
+
+ // Pick one of the two q indices, depending on which one has rate closer to
+ // the desired rate.
+ return (curr_bit_diff <= prev_bit_diff) ? curr_q : prev_q;
+}
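
find_closest_qindex_by_rate() depends on bits-per-mb being non-increasing in
q: the bisection finds the smallest q index whose rate drops to or below the
target, then compares that index with its predecessor and keeps whichever is
closer. A standalone sketch of the same logic, with a mock monotone rate
curve standing in for get_bits_per_mb():

#include <limits.h>
#include <stdio.h>

static int rate(int q) { return 100000 / (q + 8); }  /* mock, decreasing */

static int closest_q_by_rate(int desired, int best_q, int worst_q) {
  int low = best_q, high = worst_q;
  while (low < high) {  /* smallest q with rate(q) <= desired */
    const int mid = (low + high) >> 1;
    if (rate(mid) > desired) low = mid + 1; else high = mid;
  }
  const int curr = low;
  const int curr_diff =
      (rate(curr) <= desired) ? desired - rate(curr) : INT_MAX;
  int prev_diff = INT_MAX;
  if (curr_diff != INT_MAX && curr != best_q)
    prev_diff = rate(curr - 1) - desired;  /* > 0 by construction */
  return (curr_diff <= prev_diff) ? curr : curr - 1;
}

int main(void) {
  /* rate(76) = 1190 (10 under), rate(75) = 1204 (4 over): 75 wins. */
  printf("closest q = %d\n", closest_q_by_rate(1200, 0, 255));
  return 0;
}
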
+
int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
int active_best_quality, int active_worst_quality,
int width, int height) {
- const AV1_COMMON *const cm = &cpi->common;
- int q = active_worst_quality;
- int last_error = INT_MAX;
- int i, target_bits_per_mb, bits_per_mb_at_this_q;
const int MBs = av1_get_MBs(width, height);
const double correction_factor =
get_rate_correction_factor(cpi, width, height);
-
- // Calculate required scaling factor based on target frame size and size of
- // frame produced using previous Q.
- target_bits_per_mb =
+ const int target_bits_per_mb =
(int)((uint64_t)(target_bits_per_frame) << BPER_MB_NORMBITS) / MBs;
- i = active_best_quality;
-
- do {
- if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
- bits_per_mb_at_this_q =
- (int)av1_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
- } else {
- bits_per_mb_at_this_q =
- (int)av1_rc_bits_per_mb(cm->current_frame.frame_type, i,
- correction_factor, cm->seq_params.bit_depth);
- }
-
- if (bits_per_mb_at_this_q <= target_bits_per_mb) {
- if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
- q = i;
- else
- q = i - 1;
-
- break;
- } else {
- last_error = bits_per_mb_at_this_q - target_bits_per_mb;
- }
- } while (++i <= active_worst_quality);
+ int q =
+ find_closest_qindex_by_rate(target_bits_per_mb, cpi, correction_factor,
+ active_best_quality, active_worst_quality);
// In CBR mode, this makes sure q is between oscillating Qs to prevent
// resonance.
@@ -560,13 +608,11 @@ static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
arfgf_low_motion_minq, arfgf_high_motion_minq);
}
-#if REDUCE_LAST_ALT_BOOST
static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
int *arfgf_high_motion_minq;
ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
return arfgf_high_motion_minq[q];
}
-#endif
static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
const RATE_CONTROL *const rc = &cpi->rc;
@@ -758,10 +804,28 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
return q;
}
+static int gf_group_pyramid_level(const AV1_COMP *cpi) {
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ int this_height = gf_group->pyramid_level[gf_group->index];
+ return this_height;
+}
+
static int get_active_cq_level(const RATE_CONTROL *rc,
- const AV1EncoderConfig *const oxcf) {
+ const AV1EncoderConfig *const oxcf,
+ int intra_only, int superres_denom) {
static const double cq_adjust_threshold = 0.1;
int active_cq_level = oxcf->cq_level;
+ (void)intra_only;
+ if (oxcf->rc_mode == AOM_CQ || oxcf->rc_mode == AOM_Q) {
+ if (oxcf->superres_mode == SUPERRES_QTHRESH &&
+ superres_denom != SCALE_NUMERATOR &&
+ !(intra_only && rc->frames_to_key <= 1)) {
+ active_cq_level =
+ AOMMAX(active_cq_level - ((superres_denom - SCALE_NUMERATOR) * 4), 0);
+ }
+ }
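
A worked example of the relaxation added above, assuming SCALE_NUMERATOR is 8
(its value in av1's scaling code) and illustrative encoder settings:

#include <stdio.h>

#define SCALE_NUMERATOR 8  /* assumption for illustration */
#define AOMMAX(a, b) (((a) > (b)) ? (a) : (b))

int main(void) {
  const int cq_level = 40;
  const int superres_denom = 16;  /* encoding at half resolution */
  const int active_cq_level =
      AOMMAX(cq_level - (superres_denom - SCALE_NUMERATOR) * 4, 0);
  printf("active_cq_level = %d\n", active_cq_level);  /* 40 - 32 = 8 */
  return 0;
}
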
if (oxcf->rc_mode == AOM_CQ && rc->total_target_bits > 0) {
const double x = (double)rc->total_actual_bits / rc->total_target_bits;
if (x < cq_adjust_threshold) {
@@ -778,7 +842,8 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
const RATE_CONTROL *const rc = &cpi->rc;
const CurrentFrame *const current_frame = &cm->current_frame;
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
- const int cq_level = get_active_cq_level(rc, oxcf);
+ const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm),
+ cm->superres_scale_denominator);
int active_best_quality;
int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
int q;
@@ -920,15 +985,20 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
return q;
}
-int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
- static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = {
- INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME
- };
- const AV1_COMMON *const cm = &cpi->common;
- int qdelta = av1_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
- rate_factor_deltas[rf_level],
- cm->seq_params.bit_depth);
- return qdelta;
+static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+ 1.00, // INTER_NORMAL
+ 1.25, // GF_ARF_LOW
+ 2.00, // GF_ARF_STD
+ 2.00, // KF_STD
+};
+
+int av1_frame_type_qdelta(const AV1_COMP *cpi, int q) {
+ const RATE_FACTOR_LEVEL rf_lvl =
+ get_rate_factor_level(&cpi->twopass.gf_group);
+ const FRAME_TYPE frame_type = (rf_lvl == KF_STD) ? KEY_FRAME : INTER_FRAME;
+ return av1_compute_qdelta_by_rate(&cpi->rc, frame_type, q,
+ rate_factor_deltas[rf_lvl],
+ cpi->common.seq_params.bit_depth);
}
#define STATIC_MOTION_THRESH 95
@@ -939,7 +1009,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
const RATE_CONTROL *const rc = &cpi->rc;
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
- const int cq_level = get_active_cq_level(rc, oxcf);
+ const int cq_level = get_active_cq_level(rc, oxcf, frame_is_intra_only(cm),
+ cm->superres_scale_denominator);
int active_best_quality;
int active_worst_quality = cpi->twopass.active_worst_quality;
int q;
@@ -947,12 +1018,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
const int bit_depth = cm->seq_params.bit_depth;
ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
-#if CUSTOMIZED_GF
const int is_intrl_arf_boost =
gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
-#else
- const int is_intrl_arf_boost = cpi->refresh_alt2_ref_frame;
-#endif // CUSTOMIZED_GF
if (frame_is_intra_only(cm)) {
if (rc->frames_to_key == 1 && oxcf->rc_mode == AOM_Q) {
@@ -961,6 +1028,18 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
// as q.
active_best_quality = cq_level;
active_worst_quality = cq_level;
+ } else if (cm->current_frame.frame_type == KEY_FRAME &&
+ cm->show_frame == 0) {
+ // Handle the special case for forward reference key frames.
+ // Increase the boost because this keyframe is used as a forward and
+ // backward reference.
+ const int qindex = rc->last_boosted_qindex;
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.25, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ // Update the arf_q since the forward keyframe is replacing the ALTREF
+ *arf_q = active_best_quality;
} else if (rc->this_key_frame_forced) {
// Handle the special case for key frames forced when we have reached
// the maximum key frame interval. Here force the Q to a range
@@ -978,13 +1057,10 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
active_worst_quality =
AOMMIN(qindex + delta_qindex, active_worst_quality);
} else {
- // Increase the boost if the forced keyframe is a forward reference.
- // These numbers were derived empirically.
- const double boost_factor = cpi->oxcf.fwd_kf_enabled ? 0.25 : 0.50;
qindex = rc->last_boosted_qindex;
last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
- delta_qindex = av1_compute_qdelta(
- rc, last_boosted_q, last_boosted_q * boost_factor, bit_depth);
+ delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ last_boosted_q * 0.50, bit_depth);
active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
}
} else {
@@ -1035,80 +1111,57 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
// Constrained quality use slightly lower active best.
active_best_quality = active_best_quality * 15 / 16;
-#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
- if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
- (is_intrl_arf_boost && !cpi->new_bwdref_update_rule)) {
-#if REDUCE_LAST_ALT_BOOST
- if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
- const int min_boost = get_gf_high_motion_quality(q, bit_depth);
- const int boost = min_boost - active_best_quality;
+ if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+ const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+ const int boost = min_boost - active_best_quality;
- active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
- }
-#endif // REDUCE_LAST_ALT_BOOST
+ active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
*arf_q = active_best_quality;
- } else if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
+ } else if (is_intrl_arf_boost) {
assert(rc->arf_q >= 0); // Ensure it is set to a valid value.
active_best_quality = rc->arf_q;
- int this_height = gf_group->pyramid_level[gf_group->index];
+ int this_height = gf_group_pyramid_level(cpi);
while (this_height < gf_group->pyramid_height) {
active_best_quality = (active_best_quality + cq_level + 1) / 2;
++this_height;
}
}
-#endif // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
} else if (oxcf->rc_mode == AOM_Q) {
if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
active_best_quality = cq_level;
} else {
if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
active_best_quality = get_gf_active_quality(rc, q, bit_depth);
- *arf_q = active_best_quality;
-#if REDUCE_LAST_ALT_BOOST
const int min_boost = get_gf_high_motion_quality(q, bit_depth);
const int boost = min_boost - active_best_quality;
active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
-#endif
+ *arf_q = active_best_quality;
} else {
assert(rc->arf_q >= 0); // Ensure it is set to a valid value.
+ assert(is_intrl_arf_boost);
active_best_quality = rc->arf_q;
- }
-#if USE_SYMM_MULTI_LAYER
- if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
- int this_height = gf_group->pyramid_level[gf_group->index];
+ int this_height = gf_group_pyramid_level(cpi);
while (this_height < gf_group->pyramid_height) {
active_best_quality = (active_best_quality + cq_level + 1) / 2;
++this_height;
}
- } else {
-#endif
- // Modify best quality for second level arfs. For mode AOM_Q this
- // becomes the baseline frame q.
- if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
- active_best_quality = (active_best_quality + cq_level + 1) / 2;
-#if USE_SYMM_MULTI_LAYER
}
-#endif
}
} else {
active_best_quality = get_gf_active_quality(rc, q, bit_depth);
-#if REDUCE_LAST_ALT_BOOST
const int min_boost = get_gf_high_motion_quality(q, bit_depth);
const int boost = min_boost - active_best_quality;
active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
-#endif
-#if USE_SYMM_MULTI_LAYER
- if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
- int this_height = gf_group->pyramid_level[gf_group->index];
+ if (is_intrl_arf_boost) {
+ int this_height = gf_group_pyramid_level(cpi);
while (this_height < gf_group->pyramid_height) {
active_best_quality =
(active_best_quality + active_worst_quality + 1) / 2;
++this_height;
}
}
-#endif
}
} else {
if (oxcf->rc_mode == AOM_Q) {
@@ -1126,8 +1179,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
// Extension to max or min Q if undershoot or overshoot is outside
// the permitted range.
- if ((cpi->oxcf.rc_mode != AOM_Q) &&
- (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+ if (cpi->oxcf.rc_mode != AOM_Q) {
if (frame_is_intra_only(cm) ||
(!rc->is_src_frame_alt_ref &&
(cpi->refresh_golden_frame || is_intrl_arf_boost ||
@@ -1146,8 +1198,7 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
// Static forced key frames Q restrictions dealt with elsewhere.
if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
(cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
- int qdelta = av1_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
- active_worst_quality);
+ const int qdelta = av1_frame_type_qdelta(cpi, active_worst_quality);
active_worst_quality =
AOMMAX(active_worst_quality + qdelta, active_best_quality);
}
@@ -1167,7 +1218,8 @@ static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
if (oxcf->rc_mode == AOM_Q ||
(frame_is_intra_only(cm) && !rc->this_key_frame_forced &&
- cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH)) {
+ cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH &&
+ rc->frames_to_key > 1)) {
q = active_best_quality;
// Special case code to try and match quality with forced key frames.
} else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
@@ -1275,16 +1327,12 @@ static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
static void update_golden_frame_stats(AV1_COMP *cpi) {
RATE_CONTROL *const rc = &cpi->rc;
-#if CUSTOMIZED_GF
const TWO_PASS *const twopass = &cpi->twopass;
const GF_GROUP *const gf_group = &twopass->gf_group;
const int is_intrnl_arf =
cpi->oxcf.pass == 2
? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
: cpi->refresh_alt2_ref_frame;
-#else
- const int is_intnl_arf = cpi->refresh_alt2_ref_frame;
-#endif
// Update the Golden frame usage counts.
// NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
@@ -1292,9 +1340,10 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
// updated and cpi->refresh_golden_frame will still be zero.
if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
// We will not use internal overlay frames to replace the golden frame
- if (!rc->is_src_frame_ext_arf)
+ if (!rc->is_src_frame_internal_arf) {
      // A refresh of this frame means later frames don't refresh unless the
      // user explicitly requests it.
rc->frames_since_golden = 0;
+ }
  // If we are not using alt ref in the upcoming group, clear the arf
// active flag. In multi arf group case, if the index is not 0 then
@@ -1310,165 +1359,16 @@ static void update_golden_frame_stats(AV1_COMP *cpi) {
}
}
-// Define the reference buffers that will be updated post encode.
-void av1_configure_buffer_updates(AV1_COMP *cpi) {
- TWO_PASS *const twopass = &cpi->twopass;
-
- // NOTE(weitinglin): Should we define another function to take care of
- // cpi->rc.is_$Source_Type to make this function as it is in the comment?
-
- cpi->rc.is_src_frame_alt_ref = 0;
- cpi->rc.is_bwd_ref_frame = 0;
- cpi->rc.is_last_bipred_frame = 0;
- cpi->rc.is_bipred_frame = 0;
- cpi->rc.is_src_frame_ext_arf = 0;
-
- switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
- case KF_UPDATE:
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 1;
- cpi->refresh_bwd_ref_frame = 1;
- cpi->refresh_alt2_ref_frame = 1;
- cpi->refresh_alt_ref_frame = 1;
- break;
-
- case LF_UPDATE:
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
- break;
-
- case GF_UPDATE:
- // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is
- // needed.
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 1;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
- break;
-
- case OVERLAY_UPDATE:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 1;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
-
- cpi->rc.is_src_frame_alt_ref = 1;
- break;
-
- case ARF_UPDATE:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 1;
- break;
-
- case BRF_UPDATE:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 1;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
-
- cpi->rc.is_bwd_ref_frame = 1;
- break;
-
- case LAST_BIPRED_UPDATE:
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
-
- cpi->rc.is_last_bipred_frame = 1;
- break;
-
- case BIPRED_UPDATE:
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
-
- cpi->rc.is_bipred_frame = 1;
- break;
-
- case INTNL_OVERLAY_UPDATE:
- cpi->refresh_last_frame = 1;
- cpi->refresh_golden_frame = 0;
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 0;
- cpi->refresh_alt_ref_frame = 0;
-
- cpi->rc.is_src_frame_alt_ref = 1;
- cpi->rc.is_src_frame_ext_arf = 1;
- break;
-
- case INTNL_ARF_UPDATE:
- cpi->refresh_last_frame = 0;
- cpi->refresh_golden_frame = 0;
-#if USE_SYMM_MULTI_LAYER
- if (cpi->new_bwdref_update_rule == 1) {
- cpi->refresh_bwd_ref_frame = 1;
- cpi->refresh_alt2_ref_frame = 0;
- } else {
-#endif
- cpi->refresh_bwd_ref_frame = 0;
- cpi->refresh_alt2_ref_frame = 1;
-#if USE_SYMM_MULTI_LAYER
- }
-#endif
- cpi->refresh_alt_ref_frame = 0;
- break;
-
- default: assert(0); break;
- }
-}
-
-void av1_estimate_qp_gop(AV1_COMP *cpi) {
- AV1_COMMON *const cm = &cpi->common;
- int gop_length = cpi->rc.baseline_gf_interval;
- int bottom_index, top_index;
- int idx;
- const int gf_index = cpi->twopass.gf_group.index;
-
- for (idx = 1; idx <= gop_length + 1 && idx < MAX_LAG_BUFFERS; ++idx) {
- TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
- int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
- int arf_q = 0;
-
- cpi->twopass.gf_group.index = idx;
- rc_set_frame_target(cpi, target_rate, cm->width, cm->height);
- av1_configure_buffer_updates(cpi);
- tpl_frame->base_qindex = rc_pick_q_and_bounds_two_pass(
- cpi, cm->width, cm->height, &bottom_index, &top_index, &arf_q);
- tpl_frame->base_qindex = AOMMAX(tpl_frame->base_qindex, 1);
- }
- // Reset the actual index and frame update
- cpi->twopass.gf_group.index = gf_index;
- av1_configure_buffer_updates(cpi);
-}
-
void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
const AV1_COMMON *const cm = &cpi->common;
const CurrentFrame *const current_frame = &cm->current_frame;
RATE_CONTROL *const rc = &cpi->rc;
-#if CUSTOMIZED_GF
const TWO_PASS *const twopass = &cpi->twopass;
const GF_GROUP *const gf_group = &twopass->gf_group;
const int is_intrnl_arf =
cpi->oxcf.pass == 2
? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
: cpi->refresh_alt2_ref_frame;
-#else
- const int is_intrnl_arf = cpi->refresh_alt2_ref_frame;
-#endif
const int qindex = cm->base_qindex;
@@ -1539,10 +1439,7 @@ void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
// Actual bits spent
rc->total_actual_bits += rc->projected_frame_size;
- // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
- // differently here for rc->avg_frame_bandwidth.
- rc->total_target_bits +=
- (cm->show_frame || rc->is_bwd_ref_frame) ? rc->avg_frame_bandwidth : 0;
+ rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
@@ -1575,22 +1472,24 @@ void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
// Use this macro to turn on/off use of alt-refs in one-pass mode.
#define USE_ALTREF_FOR_ONE_PASS 1
-static int calc_pframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+static int calc_pframe_target_size_one_pass_vbr(
+ const AV1_COMP *const cpi, FRAME_UPDATE_TYPE frame_update_type) {
static const int af_ratio = 10;
const RATE_CONTROL *const rc = &cpi->rc;
int target;
#if USE_ALTREF_FOR_ONE_PASS
- target =
- (!rc->is_src_frame_alt_ref &&
- (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))
- ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) /
- (rc->baseline_gf_interval + af_ratio - 1)
- : (rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
- (rc->baseline_gf_interval + af_ratio - 1);
+ if (frame_update_type == KF_UPDATE || frame_update_type == GF_UPDATE ||
+ frame_update_type == ARF_UPDATE) {
+ target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) /
+ (rc->baseline_gf_interval + af_ratio - 1);
+ } else {
+ target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
+ (rc->baseline_gf_interval + af_ratio - 1);
+ }
#else
target = rc->avg_frame_bandwidth;
#endif
- return av1_rc_clamp_pframe_target_size(cpi, target);
+ return av1_rc_clamp_pframe_target_size(cpi, target, frame_update_type);
}
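
The af_ratio split above is budget-neutral over a golden-frame group: one
boosted frame plus (interval - 1) normal frames still averages out to
avg_frame_bandwidth. A worked example with illustrative numbers:

#include <stdio.h>

int main(void) {
  const int avg = 40000, interval = 16, af_ratio = 10;
  const int boosted = avg * interval * af_ratio / (interval + af_ratio - 1);
  const int normal = avg * interval / (interval + af_ratio - 1);
  /* boosted = 256000, normal = 25600; the group still averages 40000. */
  printf("boosted = %d, normal = %d, group average = %d\n", boosted, normal,
         (boosted + (interval - 1) * normal) / interval);
  return 0;
}
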
static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
@@ -1600,7 +1499,10 @@ static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
return av1_rc_clamp_iframe_target_size(cpi, target);
}
-void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
+void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi,
+ FRAME_UPDATE_TYPE *const frame_update_type,
+ EncodeFrameParams *const frame_params,
+ unsigned int frame_flags) {
AV1_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
CurrentFrame *const current_frame = &cm->current_frame;
@@ -1610,48 +1512,45 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
int sframe_mode = cpi->oxcf.sframe_mode;
int sframe_enabled = cpi->oxcf.sframe_enabled;
// TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
- if (!cpi->refresh_alt_ref_frame &&
- (current_frame->frame_number == 0 ||
- (cpi->frame_flags & FRAMEFLAGS_KEY) || rc->frames_to_key == 0 ||
- (cpi->oxcf.auto_key && 0))) {
- current_frame->frame_type = KEY_FRAME;
+ if (*frame_update_type != ARF_UPDATE &&
+ (current_frame->frame_number == 0 || (frame_flags & FRAMEFLAGS_KEY) ||
+ rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+ frame_params->frame_type = KEY_FRAME;
rc->this_key_frame_forced =
current_frame->frame_number != 0 && rc->frames_to_key == 0;
rc->frames_to_key = cpi->oxcf.key_freq;
rc->kf_boost = DEFAULT_KF_BOOST;
rc->source_alt_ref_active = 0;
} else {
- current_frame->frame_type = INTER_FRAME;
+ frame_params->frame_type = INTER_FRAME;
if (sframe_enabled) {
if (altref_enabled) {
if (sframe_mode == 1) {
// sframe_mode == 1: insert sframe if it matches altref frame.
if (current_frame->frame_number % sframe_dist == 0 &&
- current_frame->frame_type != KEY_FRAME &&
- current_frame->frame_number != 0 && cpi->refresh_alt_ref_frame) {
- current_frame->frame_type = S_FRAME;
+ current_frame->frame_number != 0 &&
+ *frame_update_type == ARF_UPDATE) {
+ frame_params->frame_type = S_FRAME;
}
} else {
// sframe_mode != 1: if sframe will be inserted at the next available
// altref frame
if (current_frame->frame_number % sframe_dist == 0 &&
- current_frame->frame_type != KEY_FRAME &&
current_frame->frame_number != 0) {
rc->sframe_due = 1;
}
- if (rc->sframe_due && cpi->refresh_alt_ref_frame) {
- current_frame->frame_type = S_FRAME;
+ if (rc->sframe_due && *frame_update_type == ARF_UPDATE) {
+ frame_params->frame_type = S_FRAME;
rc->sframe_due = 0;
}
}
} else {
if (current_frame->frame_number % sframe_dist == 0 &&
- current_frame->frame_type != KEY_FRAME &&
current_frame->frame_number != 0) {
- current_frame->frame_type = S_FRAME;
+ frame_params->frame_type = S_FRAME;
}
}
}
@@ -1666,7 +1565,7 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
} else {
rc->constrained_gf_group = 0;
}
- cpi->refresh_golden_frame = 1;
+ if (*frame_update_type == LF_UPDATE) *frame_update_type = GF_UPDATE;
rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
rc->gfu_boost = DEFAULT_GF_BOOST;
}
@@ -1674,14 +1573,15 @@ void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
av1_cyclic_refresh_update_parameters(cpi);
- if (current_frame->frame_type == KEY_FRAME)
+ if (frame_params->frame_type == KEY_FRAME)
target = calc_iframe_target_size_one_pass_vbr(cpi);
else
- target = calc_pframe_target_size_one_pass_vbr(cpi);
+ target = calc_pframe_target_size_one_pass_vbr(cpi, *frame_update_type);
rc_set_frame_target(cpi, target, cm->width, cm->height);
}
-static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+static int calc_pframe_target_size_one_pass_cbr(
+ const AV1_COMP *cpi, FRAME_UPDATE_TYPE frame_update_type) {
const AV1EncoderConfig *oxcf = &cpi->oxcf;
const RATE_CONTROL *rc = &cpi->rc;
const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
@@ -1692,12 +1592,14 @@ static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
if (oxcf->gf_cbr_boost_pct) {
const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
- target = cpi->refresh_golden_frame
- ? (rc->avg_frame_bandwidth * rc->baseline_gf_interval *
- af_ratio_pct) /
- (rc->baseline_gf_interval * 100 + af_ratio_pct - 100)
- : (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
- (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ if (frame_update_type == GF_UPDATE || frame_update_type == OVERLAY_UPDATE) {
+ target =
+ (rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio_pct) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ } else {
+ target = (rc->avg_frame_bandwidth * rc->baseline_gf_interval * 100) /
+ (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+ }
} else {
target = rc->avg_frame_bandwidth;
}
@@ -1740,23 +1642,25 @@ static int calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
return av1_rc_clamp_iframe_target_size(cpi, target);
}
-void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
+void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi,
+ FRAME_UPDATE_TYPE *const frame_update_type,
+ EncodeFrameParams *const frame_params,
+ unsigned int frame_flags) {
AV1_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
CurrentFrame *const current_frame = &cm->current_frame;
int target;
// TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
- if ((current_frame->frame_number == 0 ||
- (cpi->frame_flags & FRAMEFLAGS_KEY) || rc->frames_to_key == 0 ||
- (cpi->oxcf.auto_key && 0))) {
- current_frame->frame_type = KEY_FRAME;
+ if ((current_frame->frame_number == 0 || (frame_flags & FRAMEFLAGS_KEY) ||
+ rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+ frame_params->frame_type = KEY_FRAME;
rc->this_key_frame_forced =
current_frame->frame_number != 0 && rc->frames_to_key == 0;
rc->frames_to_key = cpi->oxcf.key_freq;
rc->kf_boost = DEFAULT_KF_BOOST;
rc->source_alt_ref_active = 0;
} else {
- current_frame->frame_type = INTER_FRAME;
+ frame_params->frame_type = INTER_FRAME;
}
if (rc->frames_till_gf_update_due == 0) {
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
@@ -1768,7 +1672,7 @@ void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
// NOTE: frames_till_gf_update_due must be <= frames_to_key.
if (rc->frames_till_gf_update_due > rc->frames_to_key)
rc->frames_till_gf_update_due = rc->frames_to_key;
- cpi->refresh_golden_frame = 1;
+ if (*frame_update_type == LF_UPDATE) *frame_update_type = GF_UPDATE;
rc->gfu_boost = DEFAULT_GF_BOOST;
}
@@ -1777,42 +1681,75 @@ void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
av1_cyclic_refresh_update_parameters(cpi);
- if (current_frame->frame_type == KEY_FRAME)
+ if (frame_params->frame_type == KEY_FRAME)
target = calc_iframe_target_size_one_pass_cbr(cpi);
else
- target = calc_pframe_target_size_one_pass_cbr(cpi);
+ target = calc_pframe_target_size_one_pass_cbr(cpi, *frame_update_type);
rc_set_frame_target(cpi, target, cm->width, cm->height);
// TODO(afergs): Decide whether to scale up, down, or not at all
}
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+ int best_qindex, int worst_qindex) {
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const double mid_q = av1_convert_qindex_to_q(mid, bit_depth);
+ if (mid_q < desired_q) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
+ }
+ assert(low == high);
+ assert(av1_convert_qindex_to_q(low, bit_depth) >= desired_q ||
+ low == worst_qindex);
+ return low;
+}
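
av1_find_qindex() returns the smallest q index whose real q value meets or
exceeds the target, which is what lets get_minq_index() earlier and
av1_compute_qdelta() below drop their linear scans. A standalone sketch of
the contract, with a mock monotone mapping in place of
av1_convert_qindex_to_q():

#include <assert.h>
#include <stdio.h>

static double q_of(int qindex) { return 0.25 * qindex + 1.0; }  /* mock */

static int find_qindex(double desired_q, int best, int worst) {
  int low = best, high = worst;
  while (low < high) {
    const int mid = (low + high) >> 1;
    if (q_of(mid) < desired_q) low = mid + 1; else high = mid;
  }
  assert(q_of(low) >= desired_q || low == worst);
  return low;  /* smallest qindex with q_of(qindex) >= desired_q */
}

int main(void) {
  printf("qindex = %d\n", find_qindex(11.0, 0, 255));  /* 0.25*40+1 = 11 */
  return 0;
}
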
+
int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
aom_bit_depth_t bit_depth) {
- int start_index = rc->worst_quality;
- int target_index = rc->worst_quality;
- int i;
-
- // Convert the average q value to an index.
- for (i = rc->best_quality; i < rc->worst_quality; ++i) {
- start_index = i;
- if (av1_convert_qindex_to_q(i, bit_depth) >= qstart) break;
- }
+ const int start_index =
+ av1_find_qindex(qstart, bit_depth, rc->best_quality, rc->worst_quality);
+ const int target_index =
+ av1_find_qindex(qtarget, bit_depth, rc->best_quality, rc->worst_quality);
+ return target_index - start_index;
+}
- // Convert the q target to an index
- for (i = rc->best_quality; i < rc->worst_quality; ++i) {
- target_index = i;
- if (av1_convert_qindex_to_q(i, bit_depth) >= qtarget) break;
+// Find q_index for the desired_bits_per_mb, within [best_qindex, worst_qindex],
+// assuming 'correction_factor' is 1.0.
+// To be precise, 'q_index' is the smallest integer for which the corresponding
+// bits per mb <= desired_bits_per_mb.
+// If no such q index is found, returns 'worst_qindex'.
+static int find_qindex_by_rate(int desired_bits_per_mb,
+ aom_bit_depth_t bit_depth, FRAME_TYPE frame_type,
+ int best_qindex, int worst_qindex) {
+ assert(best_qindex <= worst_qindex);
+ int low = best_qindex;
+ int high = worst_qindex;
+ while (low < high) {
+ const int mid = (low + high) >> 1;
+ const int mid_bits_per_mb =
+ av1_rc_bits_per_mb(frame_type, mid, 1.0, bit_depth);
+ if (mid_bits_per_mb > desired_bits_per_mb) {
+ low = mid + 1;
+ } else {
+ high = mid;
+ }
}
-
- return target_index - start_index;
+ assert(low == high);
+ assert(av1_rc_bits_per_mb(frame_type, low, 1.0, bit_depth) <=
+ desired_bits_per_mb ||
+ low == worst_qindex);
+ return low;
}
int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
int qindex, double rate_target_ratio,
aom_bit_depth_t bit_depth) {
- int target_index = rc->worst_quality;
- int i;
-
// Look up the current projected bits per block for the base index
const int base_bits_per_mb =
av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth);
@@ -1820,14 +1757,9 @@ int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
// Find the target bits per mb based on the base value and given ratio.
const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
- // Convert the q target to an index
- for (i = rc->best_quality; i < rc->worst_quality; ++i) {
- if (av1_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <=
- target_bits_per_mb) {
- target_index = i;
- break;
- }
- }
+ const int target_index =
+ find_qindex_by_rate(target_bits_per_mb, bit_depth, frame_type,
+ rc->best_quality, rc->worst_quality);
return target_index - qindex;
}
diff --git a/libaom/av1/encoder/ratectrl.h b/libaom/av1/encoder/ratectrl.h
index ea8975d..1cd5994 100644
--- a/libaom/av1/encoder/ratectrl.h
+++ b/libaom/av1/encoder/ratectrl.h
@@ -15,6 +15,8 @@
#include "aom/aom_codec.h"
#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
#include "av1/common/blockd.h"
#include "av1/common/onyxc_int.h"
@@ -34,54 +36,29 @@ extern "C" {
// The maximum duration of a GF group that is static (e.g. a slide show).
#define MAX_STATIC_GF_GROUP_LENGTH 250
-#define CUSTOMIZED_GF 1
-
-#if CONFIG_FIX_GF_LENGTH
-#define FIXED_GF_LENGTH 16
+// Minimum and maximum height for the new pyramid structure.
+// (Old structure supports height = 1, but does NOT support height = 4).
+#define MIN_PYRAMID_LVL 0
#define MAX_PYRAMID_LVL 4
-// We allow a frame to have at most two left/right descendants before changing
-// them into to a subtree, i.e., we allow the following structure:
-/* OUT_OF_ORDER_FRAME
- / / \ \
-(two left children) F F F F (two right children) */
-// Therefore the max gf size supported by 4 layer structure is
-// 1 (KEY/OVERLAY) + 1 + 2 + 4 + 16 (two children on both side of their parent)
-#define MAX_PYRAMID_SIZE 24
-#define USE_SYMM_MULTI_LAYER 1
-#define REDUCE_LAST_ALT_BOOST 1
-#define REDUCE_LAST_GF_LENGTH 1
-#define MULTI_LVL_BOOST_VBR_CQ 1
-#else
-#define MAX_PYRAMID_SIZE 16
-#define USE_SYMM_MULTI_LAYER 0
-#define REDUCE_LAST_ALT_BOOST 0
-#define REDUCE_LAST_GF_LENGTH 0
-#define MULTI_LVL_BOOST_VBR_CQ 0
-#endif
-
-#if USE_SYMM_MULTI_LAYER
-#define USE_MANUAL_GF4_STRUCT 0
-#endif
#define MIN_GF_INTERVAL 4
#define MAX_GF_INTERVAL 16
#define FIXED_GF_INTERVAL 8 // Used in some testing modes only
-static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
- 1.00, // INTER_NORMAL
- 0.80, // INTER_LOW
- 1.50, // INTER_HIGH
- 1.25, // GF_ARF_LOW
- 2.00, // GF_ARF_STD
- 2.00, // KF_STD
-};
-
typedef struct {
int resize_width;
int resize_height;
uint8_t superres_denom;
} size_params_type;
+enum {
+ INTER_NORMAL,
+ GF_ARF_LOW,
+ GF_ARF_STD,
+ KF_STD,
+ RATE_FACTOR_LEVELS
+} UENUM1BYTE(RATE_FACTOR_LEVEL);
+
typedef struct {
  // Rate targeting variables
int base_frame_target; // A baseline frame target before adjustment
@@ -94,7 +71,6 @@ typedef struct {
int last_kf_qindex; // Q index of the last key frame coded.
int gfu_boost;
- int last_boost;
int kf_boost;
double rate_correction_factors[RATE_FACTOR_LEVELS];
@@ -113,18 +89,9 @@ typedef struct {
int source_alt_ref_pending;
int source_alt_ref_active;
int is_src_frame_alt_ref;
+ int is_src_frame_internal_arf;
int sframe_due;
- // Length of the bi-predictive frame group interval
- int bipred_group_interval;
-
- // NOTE: Different types of frames may have different bits allocated
- // accordingly, aiming to achieve the overall optimal RD performance.
- int is_bwd_ref_frame;
- int is_last_bipred_frame;
- int is_bipred_frame;
- int is_src_frame_ext_arf;
-
int avg_frame_bandwidth; // Average frame size target for clip
int min_frame_bandwidth; // Minimum allocation used for any frame
int max_frame_bandwidth; // Maximum burst rate allowed for a frame.
@@ -172,8 +139,6 @@ typedef struct {
int q_1_frame;
int q_2_frame;
- // Auto frame-scaling variables.
- int rf_level_maxq[RATE_FACTOR_LEVELS];
float_t arf_boost_factor;
// Q index used for ALT frame
int arf_q;
@@ -196,7 +161,7 @@ int av1_rc_get_default_min_gf_interval(int width, int height, double framerate);
// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
// be passed in to ensure that the max_gf_interval returned is at least as big
// as that.
-int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
// Generally at the high level, the following flow is expected
// to be enforced for rate control:
@@ -221,8 +186,13 @@ int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
// Functions to set parameters for encoding before the actual
// encode_frame_to_data_rate() function.
-void av1_rc_get_one_pass_vbr_params(struct AV1_COMP *cpi);
-void av1_rc_get_one_pass_cbr_params(struct AV1_COMP *cpi);
+struct EncodeFrameParams;
+void av1_rc_get_one_pass_vbr_params(
+ struct AV1_COMP *cpi, uint8_t *const frame_update_type,
+ struct EncodeFrameParams *const frame_params, unsigned int frame_flags);
+void av1_rc_get_one_pass_cbr_params(
+ struct AV1_COMP *cpi, uint8_t *const frame_update_type,
+ struct EncodeFrameParams *const frame_params, unsigned int frame_flags);
// Post encode update of the rate control parameters based
// on bytes used
@@ -262,7 +232,14 @@ int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
int target);
int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
- int target);
+ int target, uint8_t frame_update_type);
+
+// Find q_index corresponding to desired_q, within [best_qindex, worst_qindex].
+// To be precise, 'q_index' is the smallest integer for which the corresponding
+// q >= desired_q.
+// If no such q index is found, returns 'worst_qindex'.
+int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth,
+ int best_qindex, int worst_qindex);
// Computes a q delta (in "q index" terms) to get from a starting q value
// to a target q value
@@ -275,7 +252,7 @@ int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
int qindex, double rate_target_ratio,
aom_bit_depth_t bit_depth);
-int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int rf_level, int q);
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int q);
void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height);
@@ -286,10 +263,6 @@ void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);
int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
-void av1_configure_buffer_updates(struct AV1_COMP *cpi);
-
-void av1_estimate_qp_gop(struct AV1_COMP *cpi);
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/libaom/av1/encoder/rd.c b/libaom/av1/encoder/rd.c
index 510bb3b..d78e269 100644
--- a/libaom/av1/encoder/rd.c
+++ b/libaom/av1/encoder/rd.c
@@ -344,13 +344,7 @@ void av1_init_me_luts(void) {
static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
8, 8, 4, 4, 2, 2, 1, 0 };
static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
- 128, 144, 128, 128, 144,
- // TODO(zoeliu): To adjust further following factor values.
- 128, 128, 128,
- // TODO(weitinglin): We should investigate if the values should be the same
- // as the value used by OVERLAY frame
- 144, // INTNL_OVERLAY_UPDATE
- 128 // INTNL_ARF_UPDATE
+ 128, 144, 128, 128, 144, 144, 128
};
int av1_compute_rd_mult_based_on_qindex(const AV1_COMP *cpi, int qindex) {
@@ -508,6 +502,17 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
av1_cost_tokens_from_cdf(pcost->base_cost[ctx],
fc->coeff_base_cdf[tx_size][plane][ctx], NULL);
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx) {
+ pcost->base_cost[ctx][4] = 0;
+ pcost->base_cost[ctx][5] = pcost->base_cost[ctx][1] +
+ av1_cost_literal(1) -
+ pcost->base_cost[ctx][0];
+ pcost->base_cost[ctx][6] =
+ pcost->base_cost[ctx][2] - pcost->base_cost[ctx][1];
+ pcost->base_cost[ctx][7] =
+ pcost->base_cost[ctx][3] - pcost->base_cost[ctx][2];
+ }
+
for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx],
fc->eob_extra_cdf[tx_size][plane][ctx], NULL);
@@ -538,6 +543,14 @@ void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
// printf("%5d ", pcost->lps_cost[ctx][i]);
// printf("\n");
}
+ for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ pcost->lps_cost[ctx][0 + COEFF_BASE_RANGE + 1] =
+ pcost->lps_cost[ctx][0];
+ for (int i = 1; i <= COEFF_BASE_RANGE; ++i) {
+ pcost->lps_cost[ctx][i + COEFF_BASE_RANGE + 1] =
+ pcost->lps_cost[ctx][i] - pcost->lps_cost[ctx][i - 1];
+ }
+ }
}
}
}
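
Both loops added above append difference entries to the coefficient cost
tables, so later rate estimation can move between adjacent levels by adding
one delta instead of recomputing a full cost. A self-contained sketch of the
pattern with illustrative costs (the real tables are filled from CDFs):

#include <stdio.h>

#define RANGE 3  /* stands in for COEFF_BASE_RANGE */

int main(void) {
  /* Base costs for levels 0..RANGE, followed by space for the deltas. */
  int cost[2 * (RANGE + 1)] = { 10, 18, 24, 29 };
  cost[RANGE + 1] = cost[0];
  for (int i = 1; i <= RANGE; ++i)
    cost[i + RANGE + 1] = cost[i] - cost[i - 1];
  /* Raising a coefficient from level 1 to 2 now costs one table lookup. */
  int running = cost[1];
  running += cost[2 + RANGE + 1];
  printf("running = %d (equals cost[2] = %d)\n", running, cost[2]);
  return 0;
}
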
@@ -684,6 +697,7 @@ static double interp_cubic(const double *p, double x) {
x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
}
+/*
static double interp_bicubic(const double *p, int p_stride, double x,
double y) {
double q[4];
@@ -693,441 +707,224 @@ static double interp_bicubic(const double *p, int p_stride, double x,
q[3] = interp_cubic(p + 3 * p_stride, x);
return interp_cubic(q, y);
}
+*/
-static const double interp_rgrid_surf[65 * 18] = {
- 0.104019, 0.245714, 0.293686, 0.358635, 0.382167, 0.412446,
- 0.419955, 0.421388, 0.426672, 0.427990, 0.428531, 0.456868,
- 0.569880, 0.638822, 1.016319, 2.143453, 3.565229, 4.720880,
- 0.124618, 0.294211, 0.352023, 0.429991, 0.458206, 0.494510,
- 0.503513, 0.505232, 0.511566, 0.513234, 0.519365, 0.570225,
- 0.697373, 0.840624, 1.462198, 3.289054, 6.256517, 6.852788,
- 0.118630, 0.269669, 0.346620, 0.430999, 0.459385, 0.495783,
- 0.504808, 0.506532, 0.512884, 0.514988, 0.543437, 0.662772,
- 0.795876, 1.313596, 2.403841, 4.163098, 7.440589, 8.616275,
- 0.093329, 0.168205, 0.321320, 0.430607, 0.459385, 0.495783,
- 0.504813, 0.506548, 0.512975, 0.520662, 0.571659, 0.701841,
- 1.010727, 2.138851, 3.460626, 6.317955, 10.098127, 14.418553,
- 0.087021, 0.142905, 0.315011, 0.430509, 0.459385, 0.495787,
- 0.505075, 0.507599, 0.513584, 0.543182, 0.669941, 0.825620,
- 1.362800, 2.572187, 4.205047, 7.498399, 12.303118, 16.641735,
- 0.086923, 0.142513, 0.314913, 0.430508, 0.459385, 0.495803,
- 0.506126, 0.511816, 0.514810, 0.549705, 0.725350, 1.127334,
- 2.168597, 3.463686, 6.318605, 10.162284, 18.556041, 19.847042,
- 0.086923, 0.142513, 0.314913, 0.430506, 0.459376, 0.495805,
- 0.506388, 0.512954, 0.520772, 0.580215, 0.810474, 1.391548,
- 2.579442, 4.205160, 7.498399, 12.381597, 21.703618, 24.015457,
- 0.086923, 0.142513, 0.314911, 0.430353, 0.458765, 0.495652,
- 0.506391, 0.513406, 0.544098, 0.702950, 1.121860, 2.168961,
- 3.463798, 6.318607, 10.162284, 18.685361, 28.188192, 37.638872,
- 0.086923, 0.142513, 0.314901, 0.429742, 0.456313, 0.495045,
- 0.506484, 0.519195, 0.580104, 0.810126, 1.391462, 2.579441,
- 4.205160, 7.498399, 12.381597, 21.848607, 33.367199, 42.623190,
- 0.086923, 0.142513, 0.314899, 0.429589, 0.455706, 0.495155,
- 0.507882, 0.542426, 0.702360, 1.119921, 2.168478, 3.463791,
- 6.318607, 10.162284, 18.685361, 28.345760, 47.802028, 49.163533,
- 0.086924, 0.142548, 0.315086, 0.429842, 0.455870, 0.496336,
- 0.512412, 0.556953, 0.773373, 1.266396, 2.548277, 4.204676,
- 7.498399, 12.381597, 21.848607, 33.548250, 54.301011, 56.262859,
- 0.087067, 0.144957, 0.327436, 0.446616, 0.466362, 0.505706,
- 0.522077, 0.610747, 0.972543, 1.666916, 3.338812, 6.316669,
- 10.162284, 18.685361, 28.345760, 48.065311, 66.145302, 78.396020,
- 0.094295, 0.164235, 0.393722, 0.534219, 0.530922, 0.579308,
- 0.603889, 0.760870, 1.229961, 2.423214, 4.173513, 7.497916,
- 12.381597, 21.848607, 33.548250, 54.589585, 74.875848, 86.468182,
- 0.124096, 0.213005, 0.497188, 0.665176, 0.685973, 0.800200,
- 0.911394, 1.077971, 1.677290, 3.332129, 6.314960, 10.162257,
- 18.685361, 28.345760, 48.065311, 66.453506, 98.275189, 96.862588,
- 0.140999, 0.270140, 0.658212, 0.867661, 0.970183, 1.149516,
- 1.480599, 1.664833, 2.421893, 3.857981, 7.418830, 12.380371,
- 21.848607, 33.548250, 54.589585, 75.188867, 106.657971, 99.762997,
- 0.178353, 0.398001, 0.988462, 1.241473, 1.340967, 1.713568,
- 2.335030, 2.701432, 3.348532, 5.077158, 9.829903, 18.676528,
- 28.345700, 48.065311, 66.453506, 98.588283, 117.057193, 101.130722,
- 0.281079, 0.548300, 1.395825, 1.780770, 2.000508, 2.702964,
- 3.638454, 4.573843, 5.051641, 7.079129, 11.293332, 21.594861,
- 33.544335, 54.589585, 75.188867, 106.971065, 119.957601, 101.466632,
- 0.476762, 0.842189, 2.019678, 2.723895, 3.188467, 4.011610,
- 5.545111, 7.508984, 8.176339, 9.774504, 14.720782, 27.334416,
- 48.049609, 66.453506, 98.588283, 117.370357, 121.329855, 101.509242,
- 0.993999, 1.520111, 3.013605, 4.203530, 4.982992, 6.074944,
- 8.583581, 11.818375, 14.192544, 14.937517, 21.258160, 33.305953,
- 54.585735, 75.188867, 106.971135, 120.279824, 121.976055, 102.690130,
- 1.776487, 2.613655, 4.356487, 6.161726, 7.622196, 9.464193,
- 13.077233, 18.051656, 23.221051, 24.080068, 30.085038, 48.345269,
- 66.457698, 98.588353, 117.379415, 121.976128, 124.356210, 107.713202,
- 3.191085, 4.495201, 5.686033, 8.365566, 11.275339, 14.706437,
- 20.300969, 28.152237, 35.688355, 39.341382, 41.030743, 55.752262,
- 75.211764, 106.980285, 120.608403, 124.680746, 130.222528, 112.260098,
- 6.136611, 7.305215, 7.272532, 10.646713, 15.630815, 22.383168,
- 31.349131, 42.419822, 52.301680, 58.983454, 58.915405, 69.161305,
- 98.992460, 117.713855, 124.344836, 130.623638, 138.442401, 127.846670,
- 11.707980, 13.490761, 11.640845, 14.176132, 22.131124, 33.776462,
- 47.365711, 61.603834, 75.281056, 83.463985, 85.510533, 86.026513,
- 108.787480, 123.031136, 130.607284, 138.954406, 160.867784, 158.958882,
- 27.062874, 32.195139, 24.147297, 22.114632, 35.580506, 52.551674,
- 71.652956, 88.606776, 102.107193, 110.703186, 114.398733, 111.118539,
- 121.503578, 132.455924, 139.490806, 161.412674, 193.563210, 172.203945,
- 35.625692, 47.953028, 42.639820, 42.276254, 58.815664, 84.977282,
- 110.656412, 126.168446, 134.658126, 140.604482, 144.006012, 141.702382,
- 140.125323, 153.122630, 164.748041, 194.156197, 206.854650, 174.013079,
- 49.516447, 65.335381, 71.738306, 81.872819, 98.400740, 136.840488,
- 163.775802, 169.440078, 172.747876, 171.222919, 171.679604, 172.173550,
- 168.200129, 187.617133, 199.683394, 207.768200, 210.062520, 175.478356,
- 60.341673, 92.487135, 119.907299, 136.068010, 144.778950, 189.443534,
- 220.120077, 219.641635, 214.616503, 205.894657, 198.453924, 200.013069,
- 195.938103, 206.118661, 210.447375, 212.061379, 216.078218, 181.162805,
- 78.422159, 112.242899, 158.416312, 181.404320, 193.188690, 229.296967,
- 270.461799, 275.168977, 256.511701, 244.706786, 231.344608, 226.065087,
- 222.248618, 218.662324, 217.966722, 218.248574, 218.818588, 182.740573,
- 88.713664, 123.594164, 172.928179, 213.781414, 245.800351, 252.063414,
- 313.283141, 331.703831, 305.866639, 285.177142, 269.759635, 251.988739,
- 245.998388, 232.688076, 230.588702, 230.882657, 230.319053, 192.120741,
- 102.540561, 152.905927, 189.137131, 241.806756, 273.868497, 284.258017,
- 339.689853, 373.561104, 362.657463, 326.291984, 311.922687, 290.460189,
- 276.774381, 273.012072, 277.751792, 279.123748, 278.820447, 233.813798,
- 132.983118, 176.307242, 197.415684, 243.307787, 280.893995, 332.922370,
- 340.329043, 404.530166, 419.475405, 375.775209, 351.300889, 340.042759,
- 315.683832, 306.123530, 306.359319, 306.733063, 307.609556, 261.647847,
- 149.579109, 185.925581, 207.937033, 245.159084, 301.890957, 350.040480,
- 352.250771, 418.742329, 458.112686, 430.125208, 386.460441, 380.346839,
- 354.679150, 337.305620, 334.504124, 335.889932, 341.060725, 286.898578,
- 153.576812, 202.105624, 219.366967, 248.524506, 314.255692, 350.607526,
- 390.567688, 408.629209, 488.000213, 480.563823, 432.461799, 410.412624,
- 398.607371, 400.188740, 402.780916, 408.853470, 430.449735, 363.777088,
- 161.353129, 214.848904, 231.549852, 258.536466, 313.163177, 368.140577,
- 412.136393, 413.409032, 499.838438, 519.571063, 485.833867, 444.562715,
- 435.738129, 442.358549, 450.166531, 453.208524, 458.424358, 385.823139,
- 175.109034, 227.608058, 250.069563, 286.101747, 312.256740, 378.421485,
- 413.344147, 435.058646, 476.960941, 542.448886, 530.189154, 495.408402,
- 475.326752, 465.017144, 464.694045, 465.144689, 466.905382, 398.669138,
- 184.750180, 240.766694, 283.240772, 305.480150, 322.409001, 374.526162,
- 427.141326, 452.840323, 472.604139, 545.366105, 567.676694, 541.666203,
- 509.591873, 492.044219, 492.778569, 493.765684, 493.235693, 413.684325,
- 194.728357, 254.928927, 289.991157, 300.193195, 324.194589, 371.563147,
- 439.226438, 468.295088, 495.654854, 533.506353, 587.476353, 578.298989,
- 548.041942, 527.393885, 538.965146, 545.070442, 544.295454, 454.012211,
- 205.195287, 283.135677, 297.921431, 319.295927, 355.621830, 392.466463,
- 446.696167, 485.053519, 516.426615, 532.264584, 588.481600, 615.906737,
- 589.319634, 555.754316, 558.389367, 569.094521, 569.779764, 475.384946,
- 218.552054, 298.511016, 319.188338, 351.781666, 372.789510, 412.827434,
- 464.569387, 506.270203, 533.049810, 553.347364, 580.644599, 632.759854,
- 622.235843, 569.960552, 580.799340, 586.553714, 579.488366, 491.826482,
- 244.803348, 299.790203, 324.187975, 363.280782, 403.710443, 441.724083,
- 492.732682, 534.722691, 552.193622, 575.112647, 586.097705, 635.224970,
- 644.642944, 606.017786, 640.321218, 642.316989, 616.397020, 548.300111,
- 256.957358, 318.638991, 355.063346, 389.889307, 433.607315, 468.209001,
- 515.178157, 573.556591, 578.113115, 587.246475, 601.762801, 638.454644,
- 656.574853, 641.184609, 676.908189, 684.198162, 678.387412, 574.805864,
- 251.211502, 323.448532, 364.227424, 411.792704, 462.226488, 503.572288,
- 549.299249, 599.124071, 601.227977, 597.118176, 613.247552, 633.278532,
- 658.074755, 664.930719, 685.731531, 693.632845, 693.076350, 578.326477,
- 267.695377, 354.273736, 389.976833, 438.518178, 493.332686, 544.343027,
- 588.895829, 620.206193, 628.327410, 606.067827, 620.998532, 657.985256,
- 683.936059, 691.345257, 693.894723, 695.175306, 693.618786, 578.517148,
- 274.290725, 363.465288, 411.808596, 463.369805, 515.310226, 581.009306,
- 613.070738, 636.638714, 647.333929, 629.867603, 644.646319, 687.796202,
- 702.859596, 713.495479, 704.068069, 704.991807, 704.188594, 587.283658,
- 302.538449, 389.174737, 438.518422, 493.398902, 547.662399, 601.981814,
- 624.773046, 641.629484, 644.699451, 645.848784, 668.033340, 703.643523,
- 707.422408, 717.329600, 726.298973, 744.127507, 745.365167, 617.954068,
- 310.328188, 410.984766, 463.369805, 515.315010, 581.309832, 613.787792,
- 634.988538, 654.145284, 662.632978, 668.413496, 706.494057, 750.545471,
- 730.724808, 730.002100, 743.625262, 750.801609, 745.308457, 606.505800,
- 329.948756, 437.600191, 493.398902, 547.661910, 601.917884, 622.557745,
- 633.244395, 644.055898, 648.224221, 665.062911, 763.555733, 812.391078,
- 769.063582, 744.865168, 727.579796, 724.950408, 722.179707, 598.564510,
- 350.848328, 462.437458, 515.315010, 581.309823, 613.779123, 634.465309,
- 652.056257, 662.179143, 671.466297, 726.881256, 819.824030, 880.232789,
- 810.371672, 754.246481, 725.053473, 724.253390, 723.503395, 603.394909,
- 373.704088, 492.408266, 547.661910, 601.917884, 622.557620, 633.236320,
- 644.023513, 648.232514, 666.381639, 785.498283, 929.441612, 999.772800,
- 890.339033, 775.852504, 731.840181, 726.905100, 725.251844, 604.899901,
- 394.473422, 514.261306, 581.309823, 613.779123, 634.465309, 652.056257,
- 662.179143, 671.466557, 727.134512, 835.764144, 981.747089, 1018.462934,
- 939.686967, 811.276731, 739.398459, 727.365647, 725.285425, 604.923525,
- 419.976505, 546.538939, 601.917884, 622.557620, 633.236320, 644.023513,
- 648.232514, 666.381639, 785.545191, 932.841398, 1036.609617, 1026.945092,
- 963.822765, 840.827315, 755.532423, 730.241865, 725.366847, 604.924155,
- 437.281359, 580.116337, 613.779123, 634.465309, 652.056257, 662.179143,
- 671.466557, 727.134512, 835.764859, 981.996194, 1031.896881, 1002.544732,
- 881.157178, 828.151494, 799.340975, 751.314325, 728.316587, 605.005504,
- 464.713920, 600.649281, 622.557620, 633.236320, 644.023513, 648.232514,
- 666.381639, 785.545191, 932.841398, 1036.735329, 1035.037004, 995.478339,
- 858.093733, 823.471976, 819.881754, 798.749289, 749.440463, 607.955244,
- 495.880237, 612.473139, 634.465309, 652.056257, 662.179143, 671.466557,
- 727.134512, 835.764859, 981.996194, 1032.339788, 1031.105117, 995.303259,
- 857.733663, 823.435877, 822.822791, 819.873050, 796.882480, 629.038445,
- 510.391280, 621.158273, 633.236320, 644.023513, 648.232514, 666.381639,
- 785.545191, 932.841398, 1036.735329, 1035.566013, 1029.599350, 994.926093,
- 857.645648, 823.435143, 822.904139, 822.822791, 817.965681, 673.856962,
- 514.588176, 632.947715, 652.056257, 662.179143, 671.466557, 727.134512,
- 835.764859, 981.996194, 1032.339788, 1031.547475, 1023.835377, 972.158629,
- 851.968626, 823.347128, 822.904770, 822.904139, 820.752301, 684.418900,
- 520.013294, 631.668183, 644.023513, 648.232514, 666.381639, 785.545191,
- 932.841398, 1036.735329, 1035.567378, 1029.776746, 1001.044108, 880.853721,
- 829.201546, 822.994150, 822.904770, 822.904770, 820.792975, 684.582020,
- 531.253628, 650.479606, 662.179143, 671.466557, 727.134512, 835.764859,
- 981.996194, 1032.339788, 1031.636855, 1029.601779, 995.366703, 858.086641,
- 823.524524, 822.906135, 822.904770, 822.904770, 820.792975, 684.582020,
- 528.531744, 642.424501, 648.232514, 666.381639, 785.545191, 932.841398,
- 1036.735329, 1035.567378, 1030.219103, 1029.576226, 995.278687, 857.733663,
- 823.436508, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
- 545.401164, 660.550678, 671.508859, 727.304161, 835.807162, 981.996850,
- 1032.339788, 1031.636855, 1030.130788, 1029.487827, 994.925709, 857.645648,
- 823.435143, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
- 537.684760, 646.650947, 669.110131, 796.487512, 935.569890, 1036.777631,
- 1035.567378, 1030.219103, 1030.018584, 1023.810805, 972.158629, 851.968626,
- 823.347128, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
- 552.408370, 670.001885, 738.246482, 879.690154, 992.939171, 1032.509436,
- 1031.636855, 1030.132153, 1029.665223, 1001.043724, 880.853721, 829.201546,
- 822.994150, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
- 539.835902, 667.496388, 799.216004, 946.512211, 1039.506123, 1035.609680,
- 1030.219103, 1030.107964, 1029.577207, 995.366703, 858.086641, 823.524524,
- 822.906135, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
- 558.362529, 734.277451, 877.197218, 990.478243, 1029.908393, 1028.993978,
- 1027.488620, 1027.464048, 1026.933674, 992.724534, 855.532488, 821.323349,
- 820.792975, 820.792975, 820.792975, 820.792975, 818.686600, 682.825198,
- 453.127195, 649.075095, 780.278390, 867.165890, 862.469711, 857.067460,
- 856.956321, 856.955937, 856.513579, 827.981461, 713.556496, 685.024378,
- 684.582020, 684.582020, 684.582020, 684.582020, 682.825198, 569.510056,
+static const uint8_t bsize_curvfit_model_cat_lookup[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
};
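+// Illustrative note: this table maps each of the 22 BLOCK_SIZES_ALL entries
+// to one of four model categories that index the rate curves in
+// interp_rgrid_curv[4][65] below; assuming the standard AV1 block-size
+// ordering, the smallest blocks share category 0 and the largest category 3.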
-static const double interp_dgrid_surf[65 * 18] = {
- 10.650434, 12.204694, 12.040917, 11.843008, 11.845578, 12.051535, 12.103583,
- 12.136780, 12.266709, 12.299107, 12.299673, 12.303120, 12.316337, 12.293431,
- 12.092165, 11.602421, 11.141559, 8.864495, 12.770003, 14.634889, 14.437149,
- 14.199413, 14.202487, 14.449423, 14.511827, 14.551629, 14.707410, 14.746265,
- 14.747610, 14.753705, 14.762194, 14.699395, 14.390525, 13.690970, 12.874168,
- 10.367121, 12.832328, 14.790730, 14.503765, 14.236403, 14.239028, 14.486600,
- 14.549164, 14.589069, 14.745250, 14.784258, 14.788320, 14.801930, 14.762798,
- 14.499088, 14.021544, 13.469684, 12.661560, 10.108384, 12.950520, 15.264726,
- 14.621957, 14.238236, 14.239028, 14.486601, 14.549264, 14.589469, 14.745361,
- 14.784949, 14.791572, 14.798652, 14.660251, 14.119394, 13.651131, 12.935657,
- 12.176082, 9.228999, 12.979992, 15.382918, 14.651428, 14.238693, 14.239028,
- 14.486701, 14.555710, 14.615321, 14.751849, 14.787700, 14.797104, 14.743189,
- 14.475057, 13.944406, 13.450468, 12.687876, 11.824993, 8.906683, 12.980449,
- 15.384750, 14.651885, 14.238700, 14.239028, 14.487102, 14.581562, 14.718998,
- 14.777721, 14.788445, 14.778661, 14.582790, 14.099785, 13.649637, 12.935359,
- 12.201859, 10.891931, 8.482221, 12.980449, 15.384750, 14.651886, 14.238801,
- 14.239434, 14.487303, 14.588010, 14.744860, 14.784773, 14.786094, 14.735647,
- 14.455704, 13.939591, 13.450393, 12.687876, 11.849334, 10.476658, 8.043672,
- 12.980449, 15.384750, 14.651987, 14.245320, 14.265579, 14.493824, 14.588211,
- 14.745312, 14.787263, 14.775934, 14.582036, 14.099475, 13.649563, 12.935358,
- 12.201859, 10.911285, 9.730570, 6.696921, 12.980449, 15.384750, 14.652393,
- 14.271466, 14.370434, 14.520069, 14.589027, 14.746028, 14.785482, 14.735605,
- 14.455693, 13.939590, 13.450393, 12.687876, 11.849334, 10.494514, 9.195398,
- 6.215460, 12.980449, 15.384750, 14.652494, 14.277985, 14.396679, 14.533035,
- 14.615021, 14.754825, 14.775610, 14.582796, 14.099664, 13.649565, 12.935358,
- 12.201859, 10.911285, 9.747361, 7.779960, 5.617541, 12.980448, 15.384731,
- 14.652415, 14.278078, 14.397578, 14.559053, 14.718657, 14.776398, 14.747044,
- 14.504690, 13.951810, 13.450583, 12.687876, 11.849334, 10.494514, 9.210817,
- 7.210003, 5.164575, 12.980446, 15.383448, 14.647073, 14.277541, 14.403813,
- 14.569546, 14.744956, 14.765103, 14.629073, 14.296161, 13.698573, 12.936118,
- 12.201859, 10.911285, 9.747361, 7.790897, 6.322998, 3.931551, 12.981550,
- 15.376916, 14.615597, 14.274820, 14.437479, 14.575942, 14.707492, 14.734111,
- 14.515975, 14.000806, 13.462803, 12.688066, 11.849334, 10.494514, 9.210817,
- 7.219566, 5.781392, 3.486081, 12.991899, 15.376201, 14.579444, 14.296898,
- 14.473361, 14.522910, 14.491600, 14.543267, 14.288580, 13.700311, 12.936579,
- 12.201867, 10.911285, 9.747361, 7.790897, 6.331506, 4.480348, 2.923138,
- 13.019848, 15.383477, 14.582260, 14.385262, 14.452673, 14.436019, 14.238174,
- 14.255993, 13.977481, 13.532342, 12.705591, 11.849605, 10.494514, 9.210817,
- 7.219566, 5.789642, 4.018194, 2.766222, 13.028558, 15.315782, 14.439141,
- 14.326286, 14.452429, 14.311731, 14.033235, 13.922587, 13.665868, 13.207897,
- 12.274375, 10.912967, 9.747371, 7.790897, 6.331506, 4.488594, 3.454993,
- 2.692682, 12.992752, 15.321471, 14.409573, 14.236340, 14.322969, 14.049072,
- 13.764823, 13.479242, 13.250105, 12.759133, 12.019174, 10.532951, 9.211409,
- 7.219566, 5.789642, 4.026440, 3.298077, 2.674624, 12.945493, 15.276596,
- 14.315745, 14.026198, 14.085774, 13.844563, 13.447576, 12.964935, 12.735525,
- 12.288592, 11.511693, 9.900227, 7.793270, 6.331506, 4.488594, 3.463236,
- 3.224318, 2.672433, 12.757570, 15.056661, 14.095011, 13.722362, 13.812624,
- 13.608480, 13.021206, 12.367627, 11.937931, 11.581049, 10.599552, 9.247860,
- 7.220151, 5.789642, 4.026437, 3.305882, 3.191260, 2.615317, 12.581293,
- 14.824658, 13.909074, 13.496158, 13.491402, 13.221550, 12.514140, 11.677229,
- 10.936895, 10.619912, 9.634779, 7.763570, 6.331082, 4.488590, 3.462798,
- 3.216460, 3.076315, 2.373499, 12.283499, 14.455760, 13.890593, 13.427587,
- 13.183783, 12.763833, 11.861006, 10.740618, 9.820756, 9.354945, 8.669862,
- 7.123268, 5.787860, 4.025994, 3.290000, 3.084410, 2.810905, 2.222916,
- 12.010893, 14.300919, 13.986624, 13.484026, 13.025385, 12.224281, 11.064265,
- 9.631040, 8.594396, 8.003736, 7.561587, 6.274418, 4.466637, 3.446574,
- 3.102467, 2.816989, 2.598688, 1.951541, 11.581477, 13.831132, 13.632027,
- 13.380414, 12.807880, 11.665651, 10.218236, 8.562237, 7.222614, 6.611808,
- 6.261676, 5.402793, 3.938544, 3.174375, 2.818166, 2.602758, 2.213911,
- 1.434763, 11.050735, 12.893449, 12.363152, 12.712829, 12.012961, 10.887854,
- 9.109699, 7.421701, 5.965603, 5.272129, 4.991435, 4.423000, 3.369988,
- 2.800371, 2.593901, 2.217431, 1.670917, 1.215265, 10.641194, 11.766277,
- 10.777082, 10.972917, 10.689298, 9.701545, 7.719947, 6.145654, 4.872442,
- 4.099600, 3.880934, 3.514159, 2.786474, 2.368963, 2.162376, 1.673670,
- 1.450770, 1.185424, 10.071964, 11.107701, 9.172361, 8.551313, 8.412080,
- 7.641397, 6.174246, 4.853916, 3.904549, 3.246810, 2.959903, 2.785066,
- 2.240001, 1.793166, 1.585520, 1.449824, 1.405368, 1.168856, 9.213182,
- 9.173278, 7.219231, 6.242951, 5.626013, 5.768007, 4.908666, 3.809589,
- 3.115109, 2.617899, 2.274793, 2.172960, 1.838597, 1.505915, 1.414333,
- 1.392666, 1.338173, 1.105611, 7.365015, 7.471370, 5.622346, 4.520127,
- 3.936272, 4.208822, 3.623024, 2.977794, 2.450003, 2.097261, 1.824090,
- 1.643270, 1.473525, 1.351388, 1.327504, 1.323865, 1.307894, 1.088234,
- 6.198210, 6.580712, 4.682511, 3.416952, 2.941929, 2.766637, 2.650686,
- 2.315439, 1.925838, 1.659784, 1.464419, 1.252806, 1.162722, 1.197518,
- 1.199875, 1.197365, 1.194040, 0.995797, 5.402507, 5.055466, 3.728724,
- 2.624359, 2.165810, 1.943189, 1.918190, 1.738078, 1.516328, 1.290520,
- 1.155793, 1.015962, 0.881900, 0.807203, 0.754242, 0.743378, 0.740288,
- 0.614158, 3.937867, 3.862507, 2.884664, 2.088147, 1.648496, 1.473584,
- 1.340123, 1.291769, 1.165381, 1.000224, 0.893316, 0.821333, 0.691363,
- 0.610501, 0.586766, 0.583762, 0.577840, 0.468733, 3.104660, 3.181078,
- 2.420208, 1.747442, 1.297956, 1.109835, 0.970385, 0.943229, 0.876923,
- 0.777584, 0.678183, 0.628623, 0.553745, 0.523430, 0.519490, 0.514394,
- 0.492259, 0.403172, 2.593833, 2.533720, 2.010452, 1.480944, 1.060302,
- 0.846383, 0.738703, 0.673144, 0.658010, 0.592449, 0.518236, 0.470335,
- 0.425088, 0.393168, 0.378116, 0.355846, 0.275469, 0.213128, 2.176988,
- 2.089575, 1.671284, 1.225008, 0.895382, 0.672008, 0.566241, 0.496746,
- 0.488005, 0.449874, 0.400899, 0.354002, 0.318150, 0.281533, 0.238545,
- 0.224159, 0.202399, 0.160681, 1.874679, 1.769165, 1.430124, 1.068727,
- 0.780272, 0.557801, 0.441643, 0.377256, 0.352957, 0.338452, 0.304965,
- 0.273172, 0.240052, 0.208724, 0.193431, 0.190845, 0.185025, 0.138166,
- 1.590226, 1.502830, 1.193127, 0.917885, 0.670432, 0.474546, 0.355420,
- 0.292305, 0.259035, 0.249937, 0.232079, 0.208943, 0.181936, 0.160038,
- 0.152257, 0.151235, 0.149583, 0.120747, 1.331730, 1.255907, 1.012871,
- 0.778422, 0.578977, 0.412432, 0.293155, 0.231824, 0.197187, 0.183921,
- 0.174876, 0.157252, 0.140263, 0.127050, 0.110244, 0.105041, 0.104323,
- 0.086944, 1.153994, 1.118771, 0.822355, 0.612321, 0.478249, 0.348222,
- 0.247408, 0.186141, 0.152714, 0.135445, 0.129810, 0.119994, 0.115619,
- 0.131626, 0.095612, 0.079343, 0.077502, 0.064550, 0.946317, 0.925894,
- 0.677969, 0.499906, 0.397101, 0.297931, 0.214467, 0.152333, 0.120731,
- 0.102686, 0.095062, 0.090361, 0.122319, 0.240194, 0.112687, 0.070690,
- 0.070461, 0.054194, 0.824155, 0.787241, 0.581856, 0.419228, 0.313167,
- 0.245582, 0.183500, 0.128101, 0.096577, 0.080267, 0.071022, 0.066851,
- 0.085754, 0.154163, 0.075884, 0.052401, 0.054270, 0.026656, 0.716310,
- 0.671378, 0.489580, 0.349569, 0.256155, 0.206343, 0.157853, 0.111950,
- 0.079271, 0.062518, 0.053441, 0.049660, 0.051400, 0.063778, 0.039993,
- 0.029133, 0.023382, 0.013725, 0.614125, 0.579096, 0.417126, 0.299465,
- 0.217849, 0.165515, 0.129040, 0.093127, 0.065612, 0.049543, 0.041429,
- 0.036850, 0.034416, 0.033989, 0.024216, 0.017377, 0.014833, 0.011987,
- 0.520407, 0.487239, 0.349473, 0.251741, 0.184897, 0.135813, 0.107098,
- 0.073607, 0.053938, 0.040531, 0.032931, 0.028876, 0.025759, 0.022168,
- 0.016739, 0.014638, 0.014333, 0.011947, 0.449954, 0.415124, 0.299452,
- 0.216942, 0.158874, 0.115334, 0.088821, 0.060105, 0.042610, 0.032566,
- 0.026903, 0.023123, 0.019913, 0.016835, 0.014306, 0.013625, 0.013535,
- 0.011284, 0.377618, 0.347773, 0.251741, 0.184839, 0.132857, 0.095439,
- 0.070462, 0.052244, 0.036078, 0.026025, 0.021518, 0.018487, 0.015361,
- 0.012905, 0.011470, 0.010569, 0.010283, 0.008297, 0.319953, 0.297976,
- 0.216942, 0.158842, 0.113280, 0.080426, 0.057367, 0.041987, 0.030135,
- 0.022295, 0.017901, 0.015121, 0.012224, 0.010035, 0.009353, 0.009108,
- 0.008695, 0.006139, 0.267864, 0.250502, 0.184839, 0.132851, 0.095039,
- 0.068220, 0.049135, 0.035315, 0.025144, 0.018237, 0.013857, 0.012094,
- 0.009715, 0.007743, 0.006937, 0.006446, 0.006243, 0.004929, 0.230449,
- 0.215895, 0.158842, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959,
- 0.021866, 0.015673, 0.012133, 0.010083, 0.007801, 0.006053, 0.005401,
- 0.003834, 0.003429, 0.002851, 0.193984, 0.183963, 0.132851, 0.095039,
- 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013175, 0.010422,
- 0.008491, 0.006397, 0.004567, 0.003494, 0.002933, 0.002825, 0.002355,
- 0.167298, 0.158088, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959,
- 0.021866, 0.015669, 0.011955, 0.009257, 0.007051, 0.005543, 0.003905,
- 0.002984, 0.002825, 0.002814, 0.002347, 0.143228, 0.132220, 0.095039,
- 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
- 0.008403, 0.006661, 0.005378, 0.003545, 0.002876, 0.002818, 0.002814,
- 0.002347, 0.122934, 0.112735, 0.080417, 0.057174, 0.041304, 0.029959,
- 0.021866, 0.015669, 0.011955, 0.009258, 0.007182, 0.006012, 0.003762,
- 0.002866, 0.002739, 0.002788, 0.002810, 0.002347, 0.101934, 0.094569,
- 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
- 0.008405, 0.006797, 0.005845, 0.003333, 0.002703, 0.002695, 0.002723,
- 0.002781, 0.002343, 0.086702, 0.080014, 0.057174, 0.041304, 0.029959,
- 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006533, 0.005839,
- 0.003326, 0.002700, 0.002690, 0.002694, 0.002716, 0.002314, 0.073040,
- 0.067886, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
- 0.008405, 0.006807, 0.006468, 0.005831, 0.003325, 0.002700, 0.002690,
- 0.002690, 0.002687, 0.002253, 0.061685, 0.056890, 0.041304, 0.029959,
- 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006542, 0.006360,
- 0.005416, 0.003221, 0.002698, 0.002690, 0.002690, 0.002683, 0.002238,
- 0.052465, 0.048894, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
- 0.008405, 0.006807, 0.006472, 0.005943, 0.003748, 0.002805, 0.002692,
- 0.002690, 0.002690, 0.002683, 0.002238, 0.043838, 0.041101, 0.029959,
- 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006543, 0.006465,
- 0.005839, 0.003333, 0.002702, 0.002690, 0.002690, 0.002690, 0.002683,
- 0.002238, 0.037824, 0.035133, 0.025140, 0.018150, 0.013174, 0.010394,
- 0.008405, 0.006807, 0.006480, 0.006464, 0.005838, 0.003326, 0.002700,
- 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.031865, 0.029815,
- 0.021866, 0.015668, 0.011955, 0.009258, 0.007190, 0.006543, 0.006475,
- 0.006462, 0.005831, 0.003325, 0.002700, 0.002690, 0.002690, 0.002690,
- 0.002683, 0.002238, 0.027150, 0.025016, 0.018128, 0.013083, 0.010371,
- 0.008405, 0.006807, 0.006480, 0.006472, 0.006359, 0.005416, 0.003221,
- 0.002698, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.023094,
- 0.021760, 0.015577, 0.011590, 0.009167, 0.007188, 0.006543, 0.006475,
- 0.006466, 0.005943, 0.003748, 0.002805, 0.002692, 0.002690, 0.002690,
- 0.002690, 0.002683, 0.002238, 0.019269, 0.018038, 0.013060, 0.010280,
- 0.008382, 0.006806, 0.006480, 0.006474, 0.006464, 0.005839, 0.003333,
- 0.002702, 0.002690, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238,
- 0.016874, 0.015472, 0.011566, 0.009148, 0.007171, 0.006527, 0.006458,
- 0.006457, 0.006447, 0.005823, 0.003318, 0.002693, 0.002683, 0.002683,
- 0.002683, 0.002683, 0.002676, 0.002232, 0.011968, 0.011056, 0.008762,
- 0.007219, 0.005717, 0.005391, 0.005386, 0.005386, 0.005377, 0.004856,
- 0.002767, 0.002246, 0.002238, 0.002238, 0.002238, 0.002238, 0.002232,
- 0.001862,
+static int sse_norm_curvfit_model_cat_lookup(double sse_norm) {
+ return (sse_norm > 16.0);
+}
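+// Illustrative note: this selects between the two distortion curves in
+// interp_dgrid_curv[2][65] below -- sse_norm <= 16.0 picks category 0,
+// anything larger (e.g. sse_norm = 20.0) picks category 1.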
+
+// Models distortion by sse using a logistic function on
+// l = log2(sse / q^2) as:
+// dbysse = 16 / (1 + k exp(l + c))
+static double get_dbysse_logistic(double l, double c, double k) {
+ const double A = 16.0;
+ const double dbysse = A / (1 + k * exp(l + c));
+ return dbysse;
+}
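+// Worked example (illustrative values only): with k = 1 and c = 0, l = 0
+// gives dbysse = 16 / (1 + exp(0)) = 8.0; as l -> -inf the value saturates
+// at the asymptote A = 16, and as l -> +inf it decays to 0, so
+// distortion-per-sse shrinks as sse grows relative to q^2.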
+
+// Models rate using a clamped linear function on
+// l = log2(sse / q^2) as:
+// rate = max(0, a + b * l)
+static double get_rate_clamplinear(double l, double a, double b) {
+ const double rate = a + b * l;
+ return (rate < 0 ? 0 : rate);
+}
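+// Worked example (illustrative values only): with a = 100 and b = 50,
+// l = 2 gives rate = 100 + 50 * 2 = 200, while l = -3 gives
+// 100 - 150 = -50, which the clamp maps to 0 (rate cannot be negative).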
+
+static const uint8_t bsize_surffit_model_cat_lookup[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8, 0, 0, 2, 2, 4, 4
};
-void av1_model_rd_surffit(double xm, double yl, double *rate_f,
- double *dist_f) {
- const double x_start = -0.5;
- const double x_end = 16.5;
- const double x_step = 1;
- const double y_start = -15.5;
- const double y_end = 16.5;
- const double y_step = 0.5;
- const double epsilon = 1e-6;
- const int stride = (int)rint((x_end - x_start) / x_step) + 1;
- (void)y_end;
+static const double surffit_rate_params[9][4] = {
+ {
+ 638.390212,
+ 2.253108,
+ 166.585650,
+ -3.939401,
+ },
+ {
+ 5.256905,
+ 81.997240,
+ -1.321771,
+ 17.694216,
+ },
+ {
+ -74.193045,
+ 72.431868,
+ -19.033152,
+ 15.407276,
+ },
+ {
+ 416.770113,
+ 14.794188,
+ 167.686830,
+ -6.997756,
+ },
+ {
+ 378.511276,
+ 9.558376,
+ 154.658843,
+ -6.635663,
+ },
+ {
+ 277.818787,
+ 4.413180,
+ 150.317637,
+ -9.893038,
+ },
+ {
+ 142.212132,
+ 11.542038,
+ 94.393964,
+ -5.518517,
+ },
+ {
+ 219.100256,
+ 4.007421,
+ 108.932852,
+ -6.981310,
+ },
+ {
+ 222.261971,
+ 3.251049,
+ 95.972916,
+ -5.609789,
+ },
+};
+
+static const double surffit_dist_params[7] = {
+ 1.475844, 4.328362, -5.680233, -0.500994, 0.554585, 4.839478, -0.695837
+};
- xm = AOMMAX(xm, x_start + x_step + epsilon);
- xm = AOMMIN(xm, x_end - x_step - epsilon);
- yl = AOMMAX(yl, y_start + y_step + epsilon);
- yl = AOMMIN(yl, y_end - y_step - epsilon);
+static void rate_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm,
+ double *rpar) {
+ const int cat = bsize_surffit_model_cat_lookup[bsize];
+ rpar[0] = surffit_rate_params[cat][0] + surffit_rate_params[cat][1] * xm;
+ rpar[1] = surffit_rate_params[cat][2] + surffit_rate_params[cat][3] * xm;
+}
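+// Illustrative note: both rate parameters are affine in xm. For category 0,
+// the first row of surffit_rate_params above gives
+// rpar[0] = 638.390212 + 2.253108 * xm and
+// rpar[1] = 166.585650 - 3.939401 * xm.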
- const double y = (yl - y_start) / y_step;
- const double x = (xm - x_start) / x_step;
+static void dist_surffit_model_params_lookup(BLOCK_SIZE bsize, double xm,
+ double *dpar) {
+ (void)bsize;
+ const double *params = surffit_dist_params;
+ dpar[0] = params[0] + params[1] / (1 + exp((xm + params[2]) * params[3]));
+ dpar[1] = params[4] + params[5] * exp(params[6] * xm);
+}
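+// Illustrative note: dpar[0] is a sigmoid in xm; because params[3] is
+// negative, it rises from params[0] = 1.475844 at very negative xm toward
+// params[0] + params[1] = 5.804206 at large xm. dpar[1] decays
+// exponentially toward params[4] = 0.554585 as xm grows, since params[6]
+// is negative.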
- const int yi = (int)floor(y);
- const int xi = (int)floor(x);
- assert(xi > 0);
- assert(yi > 0);
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+ double yl, double *rate_f, double *distbysse_f) {
+ (void)sse_norm;
+ double rpar[2], dpar[2];
+ rate_surffit_model_params_lookup(bsize, xm, rpar);
+ dist_surffit_model_params_lookup(bsize, xm, dpar);
- const double yo = y - yi;
- const double xo = x - xi;
- const double *prate = &interp_rgrid_surf[(yi - 1) * stride + (xi - 1)];
- const double *pdist = &interp_dgrid_surf[(yi - 1) * stride + (xi - 1)];
- *rate_f = interp_bicubic(prate, stride, xo, yo);
- *dist_f = interp_bicubic(pdist, stride, xo, yo);
+ *rate_f = get_rate_clamplinear(yl, rpar[0], rpar[1]);
+ *distbysse_f = get_dbysse_logistic(yl, dpar[0], dpar[1]);
}
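+// Usage sketch (summarizing the two lookups above): for a given block size
+// and xm, the lookups produce (rpar[0], rpar[1]) and (dpar[0], dpar[1]);
+// the model then evaluates rate_f = max(0, rpar[0] + rpar[1] * yl) and
+// distbysse_f = 16 / (1 + dpar[1] * exp(yl + dpar[0])) at yl.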
-static const double interp_rgrid_curv[65] = {
- 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
- 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
- 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 4.759876,
- 8.132086, 13.651828, 21.908271, 33.522054, 48.782376, 71.530983,
- 106.728649, 151.942795, 199.893011, 242.850965, 283.933923, 322.154203,
- 360.684608, 394.801656, 426.879017, 460.234313, 484.103987, 508.261495,
- 536.486763, 558.196737, 586.285894, 614.764511, 634.166333, 647.706472,
- 658.211478, 681.360407, 701.052141, 727.007310, 768.663973, 804.407660,
- 884.627751, 1065.658131, 1238.875214, 1440.185176, 1678.377931, 1962.243390,
- 2300.571467, 2702.152072, 3175.775119, 3730.230519, 4374.308184, 5116.798028,
- 5966.489961, 6932.173897, 8022.639747, 9246.677424, 10613.076839,
+static const double interp_rgrid_curv[4][65] = {
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 23.801499, 28.387688, 33.388795, 42.298282,
+ 41.525408, 51.597692, 49.566271, 54.632979, 60.321507,
+ 67.730678, 75.766165, 85.324032, 96.600012, 120.839562,
+ 173.917577, 255.974908, 354.107573, 458.063476, 562.345966,
+ 668.568424, 772.072881, 878.598490, 982.202274, 1082.708946,
+ 1188.037853, 1287.702240, 1395.588773, 1490.825830, 1584.231230,
+ 1691.386090, 1766.822555, 1869.630904, 1926.743565, 2002.949495,
+ 2047.431137, 2138.486068, 2154.743767, 2209.242472, 2277.593051,
+ 2290.996432, 2307.452938, 2343.567091, 2397.654644, 2469.425868,
+ 2558.591037, 2664.860422, 2787.944296, 2927.552932, 3083.396602,
+ 3255.185579, 3442.630134, 3645.440541, 3863.327072, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 8.998436, 9.439592, 9.731837, 10.865931,
+ 11.561347, 12.578139, 14.205101, 16.770584, 19.094853,
+ 21.330863, 23.298907, 26.901921, 34.501017, 57.891733,
+ 112.234763, 194.853189, 288.302032, 380.499422, 472.625309,
+ 560.226809, 647.928463, 734.155122, 817.489721, 906.265783,
+ 999.260562, 1094.489206, 1197.062998, 1293.296825, 1378.926484,
+ 1472.760990, 1552.663779, 1635.196884, 1692.451951, 1759.741063,
+ 1822.162720, 1916.515921, 1966.686071, 2031.647506, 2033.700134,
+ 2087.847688, 2161.688858, 2242.536028, 2334.023491, 2436.337802,
+ 2549.665519, 2674.193198, 2810.107395, 2957.594666, 3116.841567,
+ 3288.034655, 3471.360486, 3667.005616, 3875.156602, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 2.377584, 2.557185, 2.732445, 2.851114,
+ 3.281800, 3.765589, 4.342578, 5.145582, 5.611038,
+ 6.642238, 7.945977, 11.800522, 17.346624, 37.501413,
+ 87.216800, 165.860942, 253.865564, 332.039345, 408.518863,
+ 478.120452, 547.268590, 616.067676, 680.022540, 753.863541,
+ 834.529973, 919.489191, 1008.264989, 1092.230318, 1173.971886,
+ 1249.514122, 1330.510941, 1399.523249, 1466.923387, 1530.533471,
+ 1586.515722, 1695.197774, 1746.648696, 1837.136959, 1909.075485,
+ 1975.074651, 2060.159200, 2155.335095, 2259.762505, 2373.710437,
+ 2497.447898, 2631.243895, 2775.367434, 2930.087523, 3095.673170,
+ 3272.393380, 3460.517161, 3660.313520, 3872.051464, 4096.000000,
+ },
+ {
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.296997, 0.342545, 0.403097, 0.472889,
+ 0.614483, 0.842937, 1.050824, 1.326663, 1.717750,
+ 2.530591, 3.582302, 6.995373, 9.973335, 24.042464,
+ 56.598240, 113.680735, 180.018689, 231.050567, 266.101082,
+ 294.957934, 323.326511, 349.434429, 380.443211, 408.171987,
+ 441.214916, 475.716772, 512.900000, 551.186939, 592.364455,
+ 624.527378, 661.940693, 679.185473, 724.800679, 764.781792,
+ 873.050019, 950.299001, 939.292954, 1052.406153, 1033.893184,
+ 1112.182406, 1219.174326, 1337.296681, 1471.648357, 1622.492809,
+ 1790.093491, 1974.713858, 2176.617364, 2396.067465, 2633.327614,
+ 2888.661266, 3162.331876, 3454.602899, 3765.737789, 4096.000000,
+ },
};
-static const double interp_dgrid_curv[65] = {
- 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855,
- 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.555776, 14.533692,
- 14.439920, 14.257791, 13.977230, 13.623229, 13.064884, 12.355411, 11.560773,
- 10.728960, 9.861975, 8.643612, 6.916021, 5.154769, 3.734940, 2.680051,
- 1.925506, 1.408410, 1.042223, 0.767641, 0.565392, 0.420116, 0.310427,
- 0.231711, 0.172999, 0.128293, 0.094992, 0.072171, 0.052972, 0.039354,
- 0.029555, 0.022857, 0.016832, 0.013297, 0.000000, 0.000000, 0.000000,
- 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
- 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
- 0.000000, 0.000000,
+static const double interp_dgrid_curv[2][65] = {
+ {
+ 16.000000, 15.962891, 15.925174, 15.886888, 15.848074, 15.808770,
+ 15.769015, 15.728850, 15.688313, 15.647445, 15.606284, 15.564870,
+ 15.525918, 15.483820, 15.373330, 15.126844, 14.637442, 14.184387,
+ 13.560070, 12.880717, 12.165995, 11.378144, 10.438769, 9.130790,
+ 7.487633, 5.688649, 4.267515, 3.196300, 2.434201, 1.834064,
+ 1.369920, 1.035921, 0.775279, 0.574895, 0.427232, 0.314123,
+ 0.233236, 0.171440, 0.128188, 0.092762, 0.067569, 0.049324,
+ 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733,
+ 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848,
+ 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550,
+ 0.000348, 0.000193, 0.000085, 0.000021, 0.000000,
+ },
+ {
+ 16.000000, 15.996116, 15.984769, 15.966413, 15.941505, 15.910501,
+ 15.873856, 15.832026, 15.785466, 15.734633, 15.679981, 15.621967,
+ 15.560961, 15.460157, 15.288367, 15.052462, 14.466922, 13.921212,
+ 13.073692, 12.222005, 11.237799, 9.985848, 8.898823, 7.423519,
+ 5.995325, 4.773152, 3.744032, 2.938217, 2.294526, 1.762412,
+ 1.327145, 1.020728, 0.765535, 0.570548, 0.425833, 0.313825,
+ 0.232959, 0.171324, 0.128174, 0.092750, 0.067558, 0.049319,
+ 0.036330, 0.027008, 0.019853, 0.015539, 0.011093, 0.008733,
+ 0.007624, 0.008105, 0.005427, 0.004065, 0.003427, 0.002848,
+ 0.002328, 0.001865, 0.001457, 0.001103, 0.000801, 0.000550,
+ 0.000348, 0.000193, 0.000085, 0.000021, -0.000000,
+ },
};
-void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) {
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
+ double *rate_f, double *distbysse_f) {
const double x_start = -15.5;
const double x_end = 16.5;
const double x_step = 0.5;
const double epsilon = 1e-6;
+ const int rcat = bsize_curvfit_model_cat_lookup[bsize];
+ const int dcat = sse_norm_curvfit_model_cat_lookup(sse_norm);
(void)x_end;
xqr = AOMMAX(xqr, x_start + x_step + epsilon);
@@ -1138,9 +935,9 @@ void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) {
assert(xi > 0);
- const double *prate = &interp_rgrid_curv[(xi - 1)];
- const double *pdist = &interp_dgrid_curv[(xi - 1)];
+ const double *prate = &interp_rgrid_curv[rcat][(xi - 1)];
*rate_f = interp_cubic(prate, xo);
+ const double *pdist = &interp_dgrid_curv[dcat][(xi - 1)];
*distbysse_f = interp_cubic(pdist, xo);
}
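+// Illustrative note: xqr is clamped into (x_start, x_end) and mapped onto a
+// 0.5-spaced grid of 65 samples, so xi selects a segment of the chosen rate
+// and distortion curves; interp_cubic() (defined elsewhere) presumably fits
+// a cubic through the neighboring samples starting at index xi - 1.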
@@ -1257,13 +1054,12 @@ int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
int ref_frame) {
- const AV1_COMMON *const cm = &cpi->common;
assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
- const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
- const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
- return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX)
- ? &cm->buffer_pool->frame_bufs[scaled_idx].buf
- : NULL;
+ RefCntBuffer *const scaled_buf = cpi->scaled_ref_buf[ref_frame - 1];
+ const RefCntBuffer *const ref_buf =
+ get_ref_frame_buf(&cpi->common, ref_frame);
+ return (scaled_buf != ref_buf && scaled_buf != NULL) ? &scaled_buf->buf
+ : NULL;
}
int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
@@ -1304,7 +1100,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
} else {
rd->thresh_mult[THR_NEARESTMV] = 0;
rd->thresh_mult[THR_NEARESTL2] = 0;
- rd->thresh_mult[THR_NEARESTL3] = 0;
+ rd->thresh_mult[THR_NEARESTL3] = 100;
rd->thresh_mult[THR_NEARESTB] = 0;
rd->thresh_mult[THR_NEARESTA2] = 0;
rd->thresh_mult[THR_NEARESTA] = 0;
@@ -1315,7 +1111,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_NEWL2] += 1000;
rd->thresh_mult[THR_NEWL3] += 1000;
rd->thresh_mult[THR_NEWB] += 1000;
- rd->thresh_mult[THR_NEWA2] = 1000;
+ rd->thresh_mult[THR_NEWA2] = 1100;
rd->thresh_mult[THR_NEWA] += 1000;
rd->thresh_mult[THR_NEWG] += 1000;
@@ -1327,18 +1123,18 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_NEARA] += 1000;
rd->thresh_mult[THR_NEARG] += 1000;
- rd->thresh_mult[THR_GLOBALMV] += 2000;
+ rd->thresh_mult[THR_GLOBALMV] += 2200;
rd->thresh_mult[THR_GLOBALL2] += 2000;
rd->thresh_mult[THR_GLOBALL3] += 2000;
- rd->thresh_mult[THR_GLOBALB] += 2000;
+ rd->thresh_mult[THR_GLOBALB] += 2400;
rd->thresh_mult[THR_GLOBALA2] = 2000;
rd->thresh_mult[THR_GLOBALG] += 2000;
- rd->thresh_mult[THR_GLOBALA] += 2000;
+ rd->thresh_mult[THR_GLOBALA] += 2400;
- rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1100;
rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
- rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
- rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 800;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 900;
rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
@@ -1356,17 +1152,17 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500;
- rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700;
- rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700;
- rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000;
- rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1530;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1870;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2750;
rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200;
rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500;
rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500;
- rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1870;
rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700;
- rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 1800;
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] += 2500;
rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200;
@@ -1375,23 +1171,23 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700;
rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700;
rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000;
- rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 2500;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 3000;
- rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1320;
rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500;
rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500;
- rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 2040;
rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700;
rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000;
- rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2500;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2250;
rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200;
rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
- rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1360;
rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
- rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000;
- rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2500;
+ rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2250;
rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200;
rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500;
@@ -1404,7 +1200,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200;
rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500;
rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500;
- rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1870;
rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700;
rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000;
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] += 2500;
@@ -1418,7 +1214,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] += 2500;
rd->thresh_mult[THR_COMP_NEAR_NEARLA2] += 1200;
- rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1500;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1800;
rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] += 1500;
rd->thresh_mult[THR_COMP_NEAR_NEWLA2] += 1700;
rd->thresh_mult[THR_COMP_NEW_NEARLA2] += 1700;
@@ -1433,7 +1229,7 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_COMP_NEW_NEWL2A2] += 2000;
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] += 2500;
- rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1200;
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1440;
rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] += 1500;
rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] += 1500;
rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] += 1700;
@@ -1447,29 +1243,29 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_COMP_NEAR_NEWGA2] += 1700;
rd->thresh_mult[THR_COMP_NEW_NEARGA2] += 1700;
rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000;
- rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2750;
rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600;
rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000;
rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000;
- rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2200;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2640;
rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 2200;
rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2400;
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] += 3200;
rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1600;
rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 2000;
- rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 1800;
rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 2200;
rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 2200;
rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2400;
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] += 3200;
- rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1600;
- rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1760;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2400;
rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 2000;
- rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 2200;
- rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2200;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 1760;
+ rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2640;
rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2400;
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] += 3200;
@@ -1477,34 +1273,25 @@ void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 2000;
rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 2000;
rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 2200;
- rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200;
- rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400;
+ rd->thresh_mult[THR_COMP_NEW_NEARBA] += 1980;
+ rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2640;
rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200;
rd->thresh_mult[THR_DC] += 1000;
rd->thresh_mult[THR_PAETH] += 1000;
- rd->thresh_mult[THR_SMOOTH] += 2000;
+ rd->thresh_mult[THR_SMOOTH] += 2200;
rd->thresh_mult[THR_SMOOTH_V] += 2000;
rd->thresh_mult[THR_SMOOTH_H] += 2000;
rd->thresh_mult[THR_H_PRED] += 2000;
- rd->thresh_mult[THR_V_PRED] += 2000;
+ rd->thresh_mult[THR_V_PRED] += 1800;
rd->thresh_mult[THR_D135_PRED] += 2500;
- rd->thresh_mult[THR_D203_PRED] += 2500;
+ rd->thresh_mult[THR_D203_PRED] += 2000;
rd->thresh_mult[THR_D157_PRED] += 2500;
- rd->thresh_mult[THR_D67_PRED] += 2500;
+ rd->thresh_mult[THR_D67_PRED] += 2000;
rd->thresh_mult[THR_D113_PRED] += 2500;
rd->thresh_mult[THR_D45_PRED] += 2500;
}
-void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
- static const int thresh_mult[MAX_REFS] = { 2500, 2500, 2500, 2500, 2500,
- 2500, 2500, 4500, 4500, 4500,
- 4500, 4500, 4500, 4500, 4500,
- 4500, 4500, 4500, 4500, 2500 };
- RD_OPT *const rd = &cpi->rd;
- memcpy(rd->thresh_mult_sub8x8, thresh_mult, sizeof(thresh_mult));
-}
-
void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
int (*factor_buf)[MAX_MODES], int rd_thresh,
int bsize, int best_mode_index) {
diff --git a/libaom/av1/encoder/rd.h b/libaom/av1/encoder/rd.h
index 2e2a30d..ff46083 100644
--- a/libaom/av1/encoder/rd.h
+++ b/libaom/av1/encoder/rd.h
@@ -48,7 +48,7 @@ extern "C" {
// This enumerator type needs to be kept aligned with the mode order in
// const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code.
-typedef enum {
+enum {
THR_NEARESTMV,
THR_NEARESTL2,
THR_NEARESTL3,
@@ -246,9 +246,9 @@ typedef enum {
MAX_SINGLE_REF_MODES = LAST_SINGLE_REF_MODES + 1,
LAST_COMP_REF_MODES = THR_COMP_GLOBAL_GLOBALBA,
MAX_COMP_REF_MODES = LAST_COMP_REF_MODES + 1
-} THR_MODES;
+} UENUM1BYTE(THR_MODES);
-typedef enum {
+enum {
THR_LAST,
THR_LAST2,
THR_LAST3,
@@ -275,7 +275,7 @@ typedef enum {
THR_INTRA,
MAX_REFS
-} THR_MODES_SUB8X8;
+} UENUM1BYTE(THR_MODES_SUB8X8);
typedef struct RD_OPT {
// Thresh_mult is used to set a threshold for the rd score. A higher value
@@ -283,7 +283,6 @@ typedef struct RD_OPT {
// is used in combination with the current block size, and thresh_freq_fact
// to pick a threshold.
int thresh_mult[MAX_MODES];
- int thresh_mult_sub8x8[MAX_REFS];
int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES];
@@ -319,25 +318,6 @@ static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
}
}
#endif
-#if CONFIG_ONE_PASS_SVM
- rd_stats->eob = 0;
- rd_stats->eob_0 = 0;
- rd_stats->eob_1 = 0;
- rd_stats->eob_2 = 0;
- rd_stats->eob_3 = 0;
-
- rd_stats->rd = 0;
- rd_stats->rd_0 = 0;
- rd_stats->rd_1 = 0;
- rd_stats->rd_2 = 0;
- rd_stats->rd_3 = 0;
-
- rd_stats->y_sse = 0;
- rd_stats->sse_0 = 0;
- rd_stats->sse_1 = 0;
- rd_stats->sse_2 = 0;
- rd_stats->sse_3 = 0;
-#endif
}
static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
@@ -365,30 +345,6 @@ static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
}
}
#endif
-#if CONFIG_ONE_PASS_SVM
- // TODO(chiyotsai@google.com): Change invalid values to INT_MAX and
- // INT64_MAX. Currently there are some code paths where rd_stats's properties
- // are set directly without calling av1_init_rd_stats, so changing it now will
- // break this speed feature. Need to hunt down all places where rd_stats is
- // used without being initialized.
- rd_stats->eob = 0;
- rd_stats->eob_0 = 0;
- rd_stats->eob_1 = 0;
- rd_stats->eob_2 = 0;
- rd_stats->eob_3 = 0;
-
- rd_stats->rd = 0;
- rd_stats->rd_0 = 0;
- rd_stats->rd_1 = 0;
- rd_stats->rd_2 = 0;
- rd_stats->rd_3 = 0;
-
- rd_stats->y_sse = 0;
- rd_stats->sse_0 = 0;
- rd_stats->sse_1 = 0;
- rd_stats->sse_2 = 0;
- rd_stats->sse_3 = 0;
-#endif
}
static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
@@ -422,222 +378,8 @@ static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
}
}
#endif
-#if CONFIG_ONE_PASS_SVM
- rd_stats_dst->eob += rd_stats_src->eob;
- rd_stats_dst->eob_0 += rd_stats_src->eob_0;
- rd_stats_dst->eob_1 += rd_stats_src->eob_1;
- rd_stats_dst->eob_2 += rd_stats_src->eob_2;
- rd_stats_dst->eob_3 += rd_stats_src->eob_3;
-
- rd_stats_dst->rd += rd_stats_src->rd;
- rd_stats_dst->rd_0 += rd_stats_src->rd_0;
- rd_stats_dst->rd_1 += rd_stats_src->rd_1;
- rd_stats_dst->rd_2 += rd_stats_src->rd_2;
- rd_stats_dst->rd_3 += rd_stats_src->rd_3;
-
- rd_stats_dst->y_sse += rd_stats_src->y_sse;
- rd_stats_dst->sse_0 += rd_stats_src->sse_0;
- rd_stats_dst->sse_1 += rd_stats_src->sse_1;
- rd_stats_dst->sse_2 += rd_stats_src->sse_2;
- rd_stats_dst->sse_3 += rd_stats_src->sse_3;
-#endif
-}
-
-#if CONFIG_ONE_PASS_SVM
-static INLINE void av1_add_reg_stat(RD_STATS *rd_stats, int eob, int64_t rd,
- int64_t sse, int blk_row, int blk_col,
- BLOCK_SIZE bsize, BLOCK_SIZE crop_bsize) {
- // NOTE: Currently the calculation of regional features works by assuming
- // bsize is square so that each transform block of size crop_bsize either
- // 1. locates completely within a quadrant or
- // 2. is exactly half of bsize or
- // 3. is the entire prediction block
- // Size of TX block and SB
- const int block_width_mi = mi_size_wide[bsize];
- const int block_height_mi = mi_size_high[bsize];
- const int crop_width_mi = mi_size_wide[crop_bsize];
- const int crop_height_mi = mi_size_high[crop_bsize];
-
- // Increment the eob proportionally to how much the tx_block overlaps with
- // each quadrant. We will scale it by MAX_MIB_SIZE * MAX_MIB_SIZE to avoid
- // being truncated.
- const int max_scaling_factor = MAX_MIB_SIZE * MAX_MIB_SIZE;
-
- // Update the stats
- rd_stats->eob = eob;
- rd_stats->rd = rd;
- rd_stats->y_sse = sse;
-
- if (crop_width_mi <= block_width_mi / 2 &&
- crop_height_mi <= block_width_mi / 2) {
- // The transform block lies completely in a quadrant.
- const int scaling_factor = max_scaling_factor;
- const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor,
- r_sse = sse * scaling_factor;
-
- if (blk_row < block_height_mi / 2 && blk_col < block_width_mi / 2) {
- rd_stats->eob_0 = r_eob;
- rd_stats->rd_0 = r_rd;
- rd_stats->sse_0 = r_sse;
- } else if (blk_row < block_height_mi / 2 && blk_col >= block_width_mi / 2) {
- rd_stats->eob_1 = r_eob;
- rd_stats->rd_1 = r_rd;
- rd_stats->sse_1 = r_sse;
- } else if (blk_row >= block_height_mi / 2 && blk_col < block_width_mi / 2) {
- rd_stats->eob_2 = r_eob;
- rd_stats->rd_2 = r_rd;
- rd_stats->sse_2 = r_sse;
- } else {
- rd_stats->eob_3 = r_eob;
- rd_stats->rd_3 = r_rd;
- rd_stats->sse_3 = r_sse;
- }
- } else if (crop_height_mi == block_height_mi &&
- crop_width_mi == block_width_mi) {
- // The transform block is the whole prediction block
- const int scaling_factor = max_scaling_factor;
- const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor,
- r_sse = sse * scaling_factor;
-
- rd_stats->eob_0 = r_eob;
- rd_stats->rd_0 = r_rd;
- rd_stats->sse_0 = r_sse;
-
- rd_stats->eob_1 = r_eob;
- rd_stats->rd_1 = r_rd;
- rd_stats->sse_1 = r_sse;
-
- rd_stats->eob_2 = r_eob;
- rd_stats->rd_2 = r_rd;
- rd_stats->sse_2 = r_sse;
-
- rd_stats->eob_3 = r_eob;
- rd_stats->rd_3 = r_rd;
- rd_stats->sse_3 = r_sse;
- } else if (crop_height_mi == block_height_mi) {
- // The transform block is a vertical block
- const int scaling_factor = max_scaling_factor / 2;
- const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor,
- r_sse = sse * scaling_factor;
-
- if (blk_col < block_width_mi / 2) {
- rd_stats->eob_0 = r_eob;
- rd_stats->rd_0 = r_rd;
- rd_stats->sse_0 = r_sse;
-
- rd_stats->eob_2 = r_eob;
- rd_stats->rd_2 = r_rd;
- rd_stats->sse_2 = r_sse;
- } else {
- rd_stats->eob_1 = r_eob;
- rd_stats->rd_1 = r_rd;
- rd_stats->sse_1 = r_sse;
-
- rd_stats->eob_3 = r_eob;
- rd_stats->rd_3 = r_rd;
- rd_stats->sse_3 = r_sse;
- }
- } else if (crop_width_mi == block_width_mi) {
- // The transform block is a horizontal block half the size of the prediction block
- const int scaling_factor = max_scaling_factor / 2;
- const int r_eob = eob * scaling_factor, r_rd = rd * scaling_factor,
- r_sse = sse * scaling_factor;
-
- if (blk_row < block_height_mi / 2) {
- rd_stats->eob_0 = r_eob;
- rd_stats->rd_0 = r_rd;
- rd_stats->sse_0 = r_sse;
-
- rd_stats->eob_1 = r_eob;
- rd_stats->rd_1 = r_rd;
- rd_stats->sse_1 = r_sse;
- } else {
- rd_stats->eob_2 = r_eob;
- rd_stats->rd_2 = r_rd;
- rd_stats->sse_2 = r_sse;
-
- rd_stats->eob_3 = r_eob;
- rd_stats->rd_3 = r_rd;
- rd_stats->sse_3 = r_sse;
- }
- } else {
- assert(0 && "Unexpected transform size");
- }
}
-static INLINE void av1_reg_stat_skipmode_update(RD_STATS *rd_stats,
- int rdmult) {
- // Update the stats
- rd_stats->eob = 0;
- rd_stats->eob_0 = 0;
- rd_stats->eob_1 = 0;
- rd_stats->eob_2 = 0;
- rd_stats->eob_3 = 0;
-
- rd_stats->rd = RDCOST(rdmult, 0, rd_stats->sse);
- rd_stats->rd_0 = RDCOST(rdmult, 0, rd_stats->sse_0);
- rd_stats->rd_1 = RDCOST(rdmult, 0, rd_stats->sse_1);
- rd_stats->rd_2 = RDCOST(rdmult, 0, rd_stats->sse_2);
- rd_stats->rd_3 = RDCOST(rdmult, 0, rd_stats->sse_3);
-}
-
-static INLINE void av1_copy_reg_stat(RD_STATS *rd_stats_dst,
- RD_STATS *rd_stats_src) {
- rd_stats_dst->eob = rd_stats_src->eob;
- rd_stats_dst->eob_0 = rd_stats_src->eob_0;
- rd_stats_dst->eob_1 = rd_stats_src->eob_1;
- rd_stats_dst->eob_2 = rd_stats_src->eob_2;
- rd_stats_dst->eob_3 = rd_stats_src->eob_3;
-
- rd_stats_dst->rd = rd_stats_src->rd;
- rd_stats_dst->rd_0 = rd_stats_src->rd_0;
- rd_stats_dst->rd_1 = rd_stats_src->rd_1;
- rd_stats_dst->rd_2 = rd_stats_src->rd_2;
- rd_stats_dst->rd_3 = rd_stats_src->rd_3;
-
- rd_stats_dst->y_sse = rd_stats_src->y_sse;
- rd_stats_dst->sse_0 = rd_stats_src->sse_0;
- rd_stats_dst->sse_1 = rd_stats_src->sse_1;
- rd_stats_dst->sse_2 = rd_stats_src->sse_2;
- rd_stats_dst->sse_3 = rd_stats_src->sse_3;
-}
-
-static INLINE void av1_unpack_reg_stat(RD_STATS *rd_stats, int *eob, int *eob_0,
- int *eob_1, int *eob_2, int *eob_3,
- int64_t *rd, int64_t *rd_0,
- int64_t *rd_1, int64_t *rd_2,
- int64_t *rd_3) {
- *rd = rd_stats->rd;
- *rd_0 = rd_stats->rd_0;
- *rd_1 = rd_stats->rd_1;
- *rd_2 = rd_stats->rd_2;
- *rd_3 = rd_stats->rd_3;
-
- *eob = rd_stats->eob;
- *eob_0 = rd_stats->eob_0;
- *eob_1 = rd_stats->eob_1;
- *eob_2 = rd_stats->eob_2;
- *eob_3 = rd_stats->eob_3;
-}
-
-static INLINE void av1_set_reg_stat(RD_STATS *rd_stats, int eob, int eob_0,
- int eob_1, int eob_2, int eob_3, int64_t rd,
- int64_t rd_0, int64_t rd_1, int64_t rd_2,
- int64_t rd_3) {
- rd_stats->rd = rd;
- rd_stats->rd_0 = rd_0;
- rd_stats->rd_1 = rd_1;
- rd_stats->rd_2 = rd_2;
- rd_stats->rd_3 = rd_3;
-
- rd_stats->eob = eob;
- rd_stats->eob_0 = eob_0;
- rd_stats->eob_1 = eob_1;
- rd_stats->eob_2 = eob_2;
- rd_stats->eob_3 = eob_3;
-}
-#endif
-
struct TileInfo;
struct TileDataEnc;
struct AV1_COMP;
@@ -657,9 +399,10 @@ void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
unsigned int qstep, int *rate, int64_t *dist);
-void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f);
-void av1_model_rd_surffit(double xm, double yl, double *rate_f,
- double *distbysse_f);
+void av1_model_rd_curvfit(BLOCK_SIZE bsize, double sse_norm, double xqr,
+ double *rate_f, double *distbysse_f);
+void av1_model_rd_surffit(BLOCK_SIZE bsize, double sse_norm, double xm,
+ double yl, double *rate_f, double *distbysse_f);
int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
const MACROBLOCKD *xd);
@@ -684,8 +427,6 @@ void av1_get_entropy_contexts(BLOCK_SIZE bsize,
void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
-void av1_set_rd_speed_thresholds_sub8x8(struct AV1_COMP *cpi);
-
void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
int (*fact)[MAX_MODES], int rd_thresh, int bsize,
int best_mode_index);
diff --git a/libaom/av1/encoder/rdopt.c b/libaom/av1/encoder/rdopt.c
index b393e6f..5e6054e 100644
--- a/libaom/av1/encoder/rdopt.c
+++ b/libaom/av1/encoder/rdopt.c
@@ -125,14 +125,14 @@ static void model_rd_with_surffit(const AV1_COMP *const cpi,
int64_t sse, int num_samples, int *rate,
int64_t *dist);
-typedef enum {
+enum {
MODELRD_LEGACY,
MODELRD_CURVFIT,
MODELRD_SUFFIT,
MODELRD_DNN,
MODELRD_FULLRDY,
MODELRD_TYPES
-} ModelRdType;
+} UENUM1BYTE(ModelRdType);
static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit,
@@ -150,11 +150,12 @@ static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
// 3: DNN regression model
// 4: Full rd model
#define MODELRD_TYPE_INTERP_FILTER 1
-#define MODELRD_TYPE_TX_SEARCH_PRUNE 2
+#define MODELRD_TYPE_TX_SEARCH_PRUNE 1
#define MODELRD_TYPE_MASKED_COMPOUND 1
#define MODELRD_TYPE_INTERINTRA 1
#define MODELRD_TYPE_INTRA 1
-#define MODELRD_TYPE_JNT_COMPOUND 1
+#define MODELRD_TYPE_DIST_WTD_COMPOUND 1
+#define MODELRD_TYPE_MOTION_MODE_RD 1
#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
@@ -163,10 +164,6 @@ static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
0x00000002, 0x00010002, 0x00020002, // y = 2
};
-#define SECOND_REF_FRAME_MASK \
- ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \
- (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01)
-
static const double ADST_FLIP_SVM[8] = {
/* vertical */
-6.6623, -2.8062, -3.2531, 3.1671,
@@ -179,26 +176,12 @@ typedef struct {
MV_REFERENCE_FRAME ref_frame[2];
} MODE_DEFINITION;
-typedef struct {
- MV_REFERENCE_FRAME ref_frame[2];
-} REF_DEFINITION;
-
-typedef enum {
+enum {
FTXS_NONE = 0,
FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0,
FTXS_DISABLE_TRELLIS_OPT = 1 << 1,
FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
-} FAST_TX_SEARCH_MODE;
-
-static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
- RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
- int mi_col, int64_t ref_best_rd);
-
-static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
- RD_STATS *rd_stats, BLOCK_SIZE bsize,
- int64_t non_skip_ref_best_rd,
- int64_t skip_ref_best_rd,
- FAST_TX_SEARCH_MODE ftxs_mode);
+} UENUM1BYTE(FAST_TX_SEARCH_MODE);
struct rdcost_block_args {
const AV1_COMP *cpi;
@@ -212,6 +195,7 @@ struct rdcost_block_args {
int incomplete_exit;
int use_fast_coef_costing;
FAST_TX_SEARCH_MODE ftxs_mode;
+ int skip_trellis;
};
#define LAST_NEW_MV_INDEX 6
@@ -749,12 +733,12 @@ typedef struct InterModeSearchState {
MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
} InterModeSearchState;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
static int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
- if (bsize == BLOCK_8X8) return 1;
- if (bsize == BLOCK_16X16) return 2;
- if (bsize == BLOCK_32X32) return 3;
- return -1;
+ if (bsize == BLOCK_4X4 || bsize == BLOCK_4X8 || bsize == BLOCK_8X4 ||
+ bsize == BLOCK_4X16 || bsize == BLOCK_16X4) {
+ return -1;
+ }
+ return 1;
}
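+// Illustrative note: blocks with a dimension of 4 (4x4, 4x8, 8x4, 4x16,
+// 16x4) are excluded from inter-mode RD data collection, while all other
+// sizes now share bucket index 1 (previously 8x8/16x16/32x32 had separate
+// buckets 1/2/3 and everything else was excluded).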
void av1_inter_mode_data_init(TileDataEnc *tile_data) {
@@ -770,37 +754,41 @@ void av1_inter_mode_data_init(TileDataEnc *tile_data) {
}
}
-static int get_est_rate_dist(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+static int get_est_rate_dist(const TileDataEnc *tile_data, BLOCK_SIZE bsize,
int64_t sse, int *est_residue_cost,
int64_t *est_dist) {
aom_clear_system_state();
const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
if (md->ready) {
- const double est_ld = md->a * sse + md->b;
if (sse < md->dist_mean) {
*est_residue_cost = 0;
*est_dist = sse;
} else {
- *est_residue_cost = (int)round((sse - md->dist_mean) / est_ld);
*est_dist = (int64_t)round(md->dist_mean);
+ const double est_ld = md->a * sse + md->b;
+ // Clamp the estimated rate cost to INT_MAX / 2.
+ // TODO(angiebird@google.com): find a better solution than clamping.
+ if (fabs(est_ld) < 1e-2) {
+ *est_residue_cost = INT_MAX / 2;
+ } else {
+ double est_residue_cost_dbl = ((sse - md->dist_mean) / est_ld);
+ if (est_residue_cost_dbl < 0) {
+ *est_residue_cost = 0;
+ } else {
+ *est_residue_cost =
+ (int)AOMMIN((int64_t)round(est_residue_cost_dbl), INT_MAX / 2);
+ }
+ }
+ if (*est_residue_cost <= 0) {
+ *est_residue_cost = 0;
+ *est_dist = sse;
+ }
}
return 1;
}
return 0;
}
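+// Worked example (illustrative values only): suppose md->a = 2e-4,
+// md->b = 0.1 and md->dist_mean = 1000. For sse = 6000 the linear model
+// gives est_ld = 2e-4 * 6000 + 0.1 = 1.3, so est_residue_cost =
+// round((6000 - 1000) / 1.3) = 3846 and est_dist = 1000; an est_ld close
+// to zero would instead trip the fabs() guard and clamp the cost to
+// INT_MAX / 2.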
-static int64_t get_est_rd(TileDataEnc *tile_data, BLOCK_SIZE bsize, int rdmult,
- int64_t sse, int curr_cost) {
- int est_residue_cost;
- int64_t est_dist;
- if (get_est_rate_dist(tile_data, bsize, sse, &est_residue_cost, &est_dist)) {
- int rate = est_residue_cost + curr_cost;
- int64_t est_rd = RDCOST(rdmult, rate, est_dist);
- return est_rd;
- }
- return 0;
-}
-
void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
aom_clear_system_state();
for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
@@ -865,20 +853,31 @@ static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize,
rd_model->dist_sum += dist;
rd_model->ld_sum += ld;
rd_model->sse_sum += sse;
- rd_model->sse_sse_sum += sse * sse;
+ rd_model->sse_sse_sum += (double)sse * (double)sse;
rd_model->sse_ld_sum += sse * ld;
}
}
static void inter_modes_info_push(InterModesInfo *inter_modes_info,
- int mode_rate, int64_t sse, int64_t est_rd,
+ int mode_rate, int64_t sse, int64_t rd,
+ bool true_rd, uint8_t *blk_skip,
+ RD_STATS *rd_cost, RD_STATS *rd_cost_y,
+ RD_STATS *rd_cost_uv,
const MB_MODE_INFO *mbmi) {
const int num = inter_modes_info->num;
assert(num < MAX_INTER_MODES);
inter_modes_info->mbmi_arr[num] = *mbmi;
inter_modes_info->mode_rate_arr[num] = mode_rate;
inter_modes_info->sse_arr[num] = sse;
- inter_modes_info->est_rd_arr[num] = est_rd;
+ inter_modes_info->est_rd_arr[num] = rd;
+ inter_modes_info->true_rd_arr[num] = true_rd;
+ if (blk_skip != NULL) {
+ memcpy(inter_modes_info->blk_skip_arr[num], blk_skip,
+ sizeof(blk_skip[0]) * MAX_MIB_SIZE * MAX_MIB_SIZE);
+ }
+ inter_modes_info->rd_cost_arr[num] = *rd_cost;
+ inter_modes_info->rd_cost_y_arr[num] = *rd_cost_y;
+ inter_modes_info->rd_cost_uv_arr[num] = *rd_cost_uv;
++inter_modes_info->num;
}
@@ -904,7 +903,6 @@ static void inter_modes_info_sort(const InterModesInfo *inter_modes_info,
qsort(rd_idx_pair_arr, inter_modes_info->num, sizeof(rd_idx_pair_arr[0]),
compare_rd_idx_pair);
}
-#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
static INLINE int write_uniform_cost(int n, int v) {
const int l = get_unsigned_bits(n);
@@ -961,7 +959,7 @@ static unsigned pixel_dist_visible_only(
}
const MACROBLOCKD *xd = &x->e_mbd;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
uint64_t sse64 = aom_highbd_sse_odd_size(src, src_stride, dst, dst_stride,
visible_cols, visible_rows);
return (unsigned int)ROUND_POWER_OF_TWO(sse64, (xd->bd - 8) * 2);
@@ -1217,7 +1215,7 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
x->tune_metric == AOM_TUNE_DAALA_DIST) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
for (j = 0; j < bsh; j++)
for (i = 0; i < bsw; i++)
orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
@@ -1281,8 +1279,7 @@ int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
bsw, coeff_shift);
}
}
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- d = ((uint64_t)d) >> 2 * coeff_shift;
+ if (is_cur_buf_hbd(xd)) d = ((uint64_t)d) >> 2 * coeff_shift;
} else {
// Otherwise, MSE by default
d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
@@ -1310,7 +1307,7 @@ static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
x->tune_metric == AOM_TUNE_DAALA_DIST) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
for (j = 0; j < bsh; j++)
for (i = 0; i < bsw; i++)
orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
@@ -1727,16 +1724,19 @@ void av1_get_horver_correlation_full_c(const int16_t *diff, int stride,
static void score_2D_transform_pow8(float *scores_2D, float shift) {
float sum = 0.0f;
int i;
-
for (i = 0; i < 16; i++) {
- float v, v2, v4;
- v = AOMMAX(scores_2D[i] + shift, 0.0f);
- v2 = v * v;
- v4 = v2 * v2;
+ const float v = AOMMIN(AOMMAX(scores_2D[i] + shift, 0.0f), 100.0f);
+ const float v2 = v * v;
+ const float v4 = v2 * v2;
scores_2D[i] = v4 * v4;
sum += scores_2D[i];
}
- for (i = 0; i < 16; i++) scores_2D[i] /= sum;
+ for (i = 0; i < 16; i++) {
+ if (scores_2D[i] < sum * 1e-4)
+ scores_2D[i] = 0.0f;
+ else
+ scores_2D[i] /= sum;
+ }
}
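+// Illustrative note: v^8 acts as a sharp softmax -- e.g. shifted scores of
+// 2.0 and 1.0 become 256 and 1 before normalization. The new 100.0f cap
+// keeps 100^8 = 1e16 safely inside float range, and the sum * 1e-4 floor
+// zeroes negligible tails instead of keeping tiny nonzero weights.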
// These thresholds were calibrated to provide a certain number of TX types
@@ -1909,7 +1909,13 @@ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
x->tx_search_prune[tx_set_type] = 0;
x->tx_split_prune_flag = 0;
const MB_MODE_INFO *mbmi = xd->mi[0];
- if (!is_inter_block(mbmi) || cpi->sf.tx_type_search.prune_mode == NO_PRUNE ||
+ const int is_inter = is_inter_block(mbmi);
+ if ((is_inter && cpi->oxcf.use_inter_dct_only) ||
+ (!is_inter && cpi->oxcf.use_intra_dct_only)) {
+ x->tx_search_prune[tx_set_type] = ~(1 << DCT_DCT);
+ return;
+ }
+ if (!is_inter || cpi->sf.tx_type_search.prune_mode == NO_PRUNE ||
x->use_default_inter_tx_type || xd->lossless[mbmi->segment_id] ||
x->cb_partition_scan)
return;
@@ -1948,8 +1954,7 @@ static void model_rd_from_sse(const AV1_COMP *const cpi,
(void)num_samples;
const MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int dequant_shift =
- (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
// Fast approximate the modelling function.
if (cpi->sf.simple_model_rd_from_var) {
@@ -1971,7 +1976,6 @@ static void model_rd_from_sse(const AV1_COMP *const cpi,
*dist <<= 4;
}
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
const AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
@@ -1994,7 +1998,6 @@ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
total_sse <<= 4;
return total_sse;
}
-#endif
static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
@@ -2028,7 +2031,7 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
if (x->skip_chroma_rd && plane) continue;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
pd->dst.stride, bw, bh);
} else {
@@ -2057,43 +2060,6 @@ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
*out_dist_sum = dist_sum;
}
-static void check_block_skip(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
- MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
- int plane_to, int *skip_txfm_sb) {
- *skip_txfm_sb = 1;
- for (int plane = plane_from; plane <= plane_to; ++plane) {
- struct macroblock_plane *const p = &x->plane[plane];
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const BLOCK_SIZE bs =
- get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
- unsigned int sse;
-
- if (x->skip_chroma_rd && plane) continue;
-
- // Since fast HBD variance functions scale down sse by 4 bits, we first use
- // the fast vf implementation to rule out blocks with non-zero scaled sse. Then,
- // only if the source is HBD and the scaled sse is 0, accurate sse
- // computation is applied to determine if the sse is really 0. This step is
- // necessary for HBD lossless coding.
- cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
- &sse);
- if (sse) {
- *skip_txfm_sb = 0;
- return;
- } else if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- uint64_t sse64 = aom_highbd_sse_odd_size(
- p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
- block_size_wide[bs], block_size_high[bs]);
-
- if (sse64) {
- *skip_txfm_sb = 0;
- return;
- }
- }
- }
- return;
-}
-
int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
intptr_t block_size, int64_t *ssz) {
int i;
@@ -2195,7 +2161,8 @@ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
int blk_row, int blk_col,
const BLOCK_SIZE plane_bsize,
- const BLOCK_SIZE tx_bsize) {
+ const BLOCK_SIZE tx_bsize,
+ unsigned int *block_mse_q8) {
int visible_rows, visible_cols;
const MACROBLOCKD *xd = &x->e_mbd;
get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
@@ -2218,7 +2185,11 @@ static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
}
#endif
diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]);
- return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
+ uint64_t sse =
+ aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
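+  // When requested, also report the per-pixel MSE in Q8 precision (x256).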
+ if (block_mse_q8 != NULL)
+ *block_mse_q8 = (unsigned int)((256 * sse) / (visible_cols * visible_rows));
+ return sse;
}
int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
@@ -2318,7 +2289,7 @@ static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ if (is_cur_buf_hbd(xd))
*out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
xd->bd);
else
@@ -2354,7 +2325,7 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
uint8_t *recon;
DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
recon = CONVERT_TO_BYTEPTR(recon16);
av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride,
CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw,
@@ -2376,11 +2347,29 @@ static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
blk_row, blk_col, plane_bsize, tx_bsize);
}
-static double get_mean(const int16_t *diff, int stride, int w, int h) {
+static double get_diff_mean(const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride, int w, int h) {
double sum = 0.0;
for (int j = 0; j < h; ++j) {
for (int i = 0; i < w; ++i) {
- sum += diff[j * stride + i];
+ const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
+ sum += diff;
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
+static double get_highbd_diff_mean(const uint8_t *src8, int src_stride,
+ const uint8_t *dst8, int dst_stride, int w,
+ int h) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ const int diff = src[j * src_stride + i] - dst[j * dst_stride + i];
+ sum += diff;
}
}
assert(w > 0 && h > 0);
@@ -2469,6 +2458,17 @@ static void get_2x2_normalized_sses_and_sads(
#if CONFIG_COLLECT_RD_STATS
#if CONFIG_COLLECT_RD_STATS == 1
+static double get_mean(const int16_t *diff, int stride, int w, int h) {
+ double sum = 0.0;
+ for (int j = 0; j < h; ++j) {
+ for (int i = 0; i < w; ++i) {
+ sum += diff[j * stride + i];
+ }
+ }
+ assert(w > 0 && h > 0);
+ return sum / (w * h);
+}
+
static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
const RD_STATS *const rd_stats, int blk_row,
int blk_col, BLOCK_SIZE plane_bsize,
@@ -2491,10 +2491,9 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
const struct macroblockd_plane *const pd = &xd->plane[plane];
const int txw = tx_size_wide[tx_size];
const int txh = tx_size_high[tx_size];
- const int dequant_shift =
- (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
const int q_step = pd->dequant_Q3[1] >> dequant_shift;
- const double num_samples = txw * txh;
+ const int num_samples = txw * txh;
const double rate_norm = (double)rd_stats->rate / num_samples;
const double dist_norm = (double)rd_stats->dist / num_samples;
@@ -2566,15 +2565,25 @@ static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
#endif // CONFIG_COLLECT_RD_STATS == 1
#if CONFIG_COLLECT_RD_STATS >= 2
-static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
+static void PrintPredictionUnitStats(const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data,
+ MACROBLOCK *x,
const RD_STATS *const rd_stats,
BLOCK_SIZE plane_bsize) {
if (rd_stats->invalid_rate) return;
if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+ if (cpi->sf.inter_mode_rd_model_estimation == 1 &&
+ (tile_data == NULL ||
+ !tile_data->inter_mode_rd_models[plane_bsize].ready))
+ return;
+ (void)tile_data;
// Generate small sample to restrict output size.
static unsigned int seed = 95014;
- if (lcg_rand16(&seed) % 256 > 0) return;
+
+ if ((lcg_rand16(&seed) % (1 << (14 - num_pels_log2_lookup[plane_bsize]))) !=
+ 1)
+ return;
const char output_file[] = "pu_stats.txt";
FILE *fout = fopen(output_file, "a");
@@ -2589,8 +2598,7 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
&bh);
const int num_samples = bw * bh;
- const int dequant_shift =
- (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
const int q_step = pd->dequant_Q3[1] >> dequant_shift;
const double rate_norm = (double)rd_stats->rate / num_samples;
@@ -2607,7 +2615,14 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
const int16_t *const src_diff = p->src_diff;
const int shift = (xd->bd - 8);
- int64_t sse = aom_sum_squares_2d_i16(src_diff, diff_stride, bw, bh);
+ int64_t sse;
+ if (is_cur_buf_hbd(xd)) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ bw, bh);
+ } else {
+ sse =
+ aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw, bh);
+ }
sse = ROUND_POWER_OF_TWO(sse, shift * 2);
const double sse_norm = (double)sse / num_samples;
@@ -2646,7 +2661,14 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
model_rdcost_norm);
- double mean = get_mean(src_diff, diff_stride, bw, bh);
+ double mean;
+ if (is_cur_buf_hbd(xd)) {
+ mean = get_highbd_diff_mean(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ mean = get_diff_mean(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ bw, bh);
+ }
mean /= (1 << shift);
float hor_corr, vert_corr;
av1_get_horver_correlation_full(src_diff, diff_stride, bw, bh, &hor_corr,
@@ -2659,6 +2681,21 @@ static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+ if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+ assert(tile_data->inter_mode_rd_models[plane_bsize].ready);
+ const int64_t overall_sse = get_sse(cpi, x);
+ int est_residue_cost = 0;
+ int64_t est_dist = 0;
+ get_est_rate_dist(tile_data, plane_bsize, overall_sse, &est_residue_cost,
+ &est_dist);
+ const double est_residue_cost_norm = (double)est_residue_cost / num_samples;
+ const double est_dist_norm = (double)est_dist / num_samples;
+ const double est_rdcost_norm =
+ (double)RDCOST(x->rdmult, est_residue_cost, est_dist) / num_samples;
+ fprintf(fout, " %g %g %g", est_residue_cost_norm, est_dist_norm,
+ est_rdcost_norm);
+ }
+
fprintf(fout, "\n");
fclose(fout);
}
@@ -2673,8 +2710,7 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi,
const struct macroblockd_plane *const pd = &xd->plane[plane];
const int log_numpels = num_pels_log2_lookup[plane_bsize];
- const int dequant_shift =
- (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
const struct macroblock_plane *const p = &x->plane[plane];
@@ -2711,7 +2747,12 @@ static void model_rd_with_dnn(const AV1_COMP *const cpi,
get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
dst_stride, src_diff, diff_stride,
sse_norm_arr, NULL);
- double mean = get_mean(src_diff, bw, bw, bh);
+ double mean;
+ if (is_cur_buf_hbd(xd)) {
+ mean = get_highbd_diff_mean(src, src_stride, dst, dst_stride, bw, bh);
+ } else {
+ mean = get_diff_mean(src, src_stride, dst, dst_stride, bw, bh);
+ }
if (shift) {
for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
mean /= (1 << shift);
@@ -2790,7 +2831,7 @@ static void model_rd_for_sb_with_dnn(
int bw, bh;
get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
&bw, &bh);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
pd->dst.stride, bw, bh);
} else {
@@ -2829,8 +2870,7 @@ static void model_rd_with_surffit(const AV1_COMP *const cpi,
(void)plane_bsize;
const MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int dequant_shift =
- (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
if (sse == 0) {
if (rate) *rate = 0;
@@ -2844,7 +2884,8 @@ static void model_rd_with_surffit(const AV1_COMP *const cpi,
const double yl = log(sse_norm / qstepsqr) / log(2.0);
double rate_f, dist_by_sse_norm_f;
- av1_model_rd_surffit(xm, yl, &rate_f, &dist_by_sse_norm_f);
+ av1_model_rd_surffit(plane_bsize, sse_norm, xm, yl, &rate_f,
+ &dist_by_sse_norm_f);
const double dist_f = dist_by_sse_norm_f * sse_norm;
int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
@@ -2894,7 +2935,7 @@ static void model_rd_for_sb_with_surffit(
const int shift = (xd->bd - 8);
get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
&bw, &bh);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
pd->dst.stride, bw, bh);
} else {
@@ -2934,8 +2975,7 @@ static void model_rd_with_curvfit(const AV1_COMP *const cpi,
(void)plane_bsize;
const MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int dequant_shift =
- (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int dequant_shift = (is_cur_buf_hbd(xd)) ? xd->bd - 5 : 3;
const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
if (sse == 0) {
@@ -2946,10 +2986,11 @@ static void model_rd_with_curvfit(const AV1_COMP *const cpi,
aom_clear_system_state();
const double sse_norm = (double)sse / num_samples;
const double qstepsqr = (double)qstep * qstep;
- const double xqr = log(sse_norm / qstepsqr) / log(2.0);
+ const double xqr = log2(sse_norm / qstepsqr);
double rate_f, dist_by_sse_norm_f;
- av1_model_rd_curvfit(xqr, &rate_f, &dist_by_sse_norm_f);
+ av1_model_rd_curvfit(plane_bsize, sse_norm, xqr, &rate_f,
+ &dist_by_sse_norm_f);
const double dist_f = dist_by_sse_norm_f * sse_norm;
int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
@@ -3000,7 +3041,7 @@ static void model_rd_for_sb_with_curvfit(
get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
&bw, &bh);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
pd->dst.stride, bw, bh);
} else {
@@ -3029,78 +3070,13 @@ static void model_rd_for_sb_with_curvfit(
*out_dist_sum = dist_sum;
}
-static void model_rd_for_sb_with_fullrdy(
- const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
- int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
- int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
- int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
- const int ref = xd->mi[0]->ref_frame[0];
-
- int64_t rate_sum = 0;
- int64_t dist_sum = 0;
- int64_t total_sse = 0;
-
- for (int plane = plane_from; plane <= plane_to; ++plane) {
- struct macroblock_plane *const p = &x->plane[plane];
- struct macroblockd_plane *const pd = &xd->plane[plane];
- const BLOCK_SIZE plane_bsize =
- get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
- const int bw = block_size_wide[plane_bsize];
- const int bh = block_size_high[plane_bsize];
- int64_t sse;
- int rate;
- int64_t dist;
-
- if (x->skip_chroma_rd && plane) continue;
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
- pd->dst.stride, bw, bh);
- } else {
- sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
- bh);
- }
- sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
-
- RD_STATS rd_stats;
- if (plane == 0) {
- select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
- if (rd_stats.invalid_rate) {
- rate = 0;
- dist = sse << 4;
- } else {
- rate = rd_stats.rate;
- dist = rd_stats.dist;
- }
- } else {
- model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
- &dist);
- }
-
- if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
-
- total_sse += sse;
- rate_sum += rate;
- dist_sum += dist;
-
- if (plane_rate) plane_rate[plane] = rate;
- if (plane_sse) plane_sse[plane] = sse;
- if (plane_dist) plane_dist[plane] = dist;
- }
-
- if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
- if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
- *out_rate_sum = (int)rate_sum;
- *out_dist_sum = dist_sum;
-}
-
static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
int block, int blk_row, int blk_col,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
const TXB_CTX *const txb_ctx,
FAST_TX_SEARCH_MODE ftxs_mode,
- int use_fast_coef_costing, int64_t ref_best_rd,
- RD_STATS *best_rd_stats) {
+ int use_fast_coef_costing, int skip_trellis,
+ int64_t ref_best_rd, RD_STATS *best_rd_stats) {
const AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
struct macroblockd_plane *const pd = &xd->plane[plane];
@@ -3118,6 +3094,7 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
tran_low_t *best_dqcoeff = this_dqcoeff;
const int txk_type_idx =
av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+ int perform_block_coeff_opt;
av1_invalid_rd_stats(best_rd_stats);
TXB_RD_INFO *intra_txb_rd_info = NULL;
@@ -3129,6 +3106,9 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
(mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
mi_col >= xd->tile.mi_col_start &&
(mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
+ skip_trellis |=
+ cpi->optimize_seg_arr[mbmi->segment_id] == NO_TRELLIS_OPT ||
+ cpi->optimize_seg_arr[mbmi->segment_id] == FINAL_PASS_TRELLIS_OPT;
if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
!is_inter && plane == 0 &&
tx_size_wide[tx_size] == tx_size_high[tx_size]) {
@@ -3168,7 +3148,8 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
TX_TYPE txk_end = TX_TYPES - 1;
if ((!is_inter && x->use_default_intra_tx_type) ||
(is_inter && x->use_default_inter_tx_type)) {
- txk_start = txk_end = get_default_tx_type(0, xd, tx_size);
+ txk_start = txk_end =
+ get_default_tx_type(0, xd, tx_size, cpi->is_screen_content_type);
} else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) {
if (plane == 0) txk_end = DCT_DCT;
}
@@ -3186,7 +3167,9 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
}
const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type];
if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
- ext_tx_used_flag == 0x0001) {
+ ext_tx_used_flag == 0x0001 ||
+ (is_inter && cpi->oxcf.use_inter_dct_only) ||
+ (!is_inter && cpi->oxcf.use_intra_dct_only)) {
txk_start = txk_end = DCT_DCT;
}
uint16_t allowed_tx_mask = 0; // 1: allow; 0: skip.
@@ -3212,14 +3195,35 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
}
}
}
+
+ if (cpi->oxcf.enable_flip_idtx == 0) {
+ for (TX_TYPE tx_type = FLIPADST_DCT; tx_type <= H_FLIPADST; ++tx_type) {
+ allowed_tx_mask &= ~(1 << tx_type);
+ }
+ }
+
// Need to have at least one transform type allowed.
if (allowed_tx_mask == 0) {
txk_start = txk_end = (plane ? uv_tx_type : DCT_DCT);
allowed_tx_mask = (1 << txk_start);
}
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ int64_t block_sse = 0;
+ unsigned int block_mse_q8 = UINT_MAX;
+ block_sse = pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize,
+ &block_mse_q8);
+ assert(block_mse_q8 != UINT_MAX);
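+  // For high-bitdepth sources, normalize back to an 8-bit scale; e.g., a
+  // 10-bit buffer is rounded down by (10 - 8) * 2 = 4 bits.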
+ if (is_cur_buf_hbd(xd)) {
+ block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+ block_mse_q8 = ROUND_POWER_OF_TWO(block_mse_q8, (xd->bd - 8) * 2);
+ }
+ block_sse *= 16;
+  // Transform domain distortion is accurate for larger residuals.
+ // TODO(any): Experiment with variance and mean based thresholds
int use_transform_domain_distortion =
(cpi->sf.use_transform_domain_distortion > 0) &&
+ (block_mse_q8 >= cpi->tx_domain_dist_threshold) &&
// Any 64-pt transforms only preserves half the coefficients.
// Therefore transform domain distortion is not valid for these
// transform sizes.
@@ -3237,20 +3241,18 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
const uint16_t *eobs_ptr = x->plane[plane].eobs;
- const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
- int64_t block_sse =
- pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
- block_sse *= 16;
+  // Use an mse-based threshold to decide whether to perform R-D optimization
+  // of coeffs. For smaller residuals, coeff optimization is helpful; for
+  // larger residuals, R-D optimization may not be effective.
+ // TODO(any): Experiment with variance and mean based thresholds
+ perform_block_coeff_opt = (block_mse_q8 <= cpi->coeff_opt_dist_threshold);
for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
if (!(allowed_tx_mask & (1 << tx_type))) continue;
if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
RD_STATS this_rd_stats;
av1_invalid_rd_stats(&this_rd_stats);
-
- if (!cpi->optimize_seg_arr[mbmi->segment_id]) {
+ if (skip_trellis || (!perform_block_coeff_opt)) {
av1_xform_quant(
cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
@@ -3270,8 +3272,8 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
}
- av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1,
- &rate_cost);
+ av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx,
+ cpi->sf.trellis_eob_fast, &rate_cost);
}
if (eobs_ptr[block] == 0) {
// When eob is 0, pixel domain distortion is more efficient and accurate.
@@ -3280,8 +3282,38 @@ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
&this_rd_stats.sse);
} else {
- this_rd_stats.dist = dist_block_px_domain(
- cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+ int64_t sse_diff = INT64_MAX;
+        // The high_energy threshold assumes that every pixel within a txfm
+        // block has a residue energy of at least 25% of the maximum, i.e.
+        // 128 * 128 for 8-bit; the threshold is then scaled with the input
+        // bit depth.
+ const int64_t high_energy_thresh =
+ ((int64_t)128 * 128 * tx_size_2d[tx_size]) << ((xd->bd - 8) * 2);
+ const int is_high_energy = (block_sse >= high_energy_thresh);
+ if (tx_size == TX_64X64 || is_high_energy) {
+          // Because 3 out of 4 quadrants of the transform coefficients are
+          // forced to zero, the inverse transform has a tendency to overflow.
+          // sse_diff is effectively the energy of those 3 quadrants; here we
+          // use it to decide whether to compute pixel domain distortion. If
+          // the energy is mostly in the first quadrant, it is unlikely that
+          // the inverse transform has an overflow issue.
+ dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+ &this_rd_stats.sse);
+ sse_diff = block_sse - this_rd_stats.sse;
+ }
+ if (tx_size != TX_64X64 || !is_high_energy ||
+ (sse_diff * 2) < this_rd_stats.sse) {
+ const int64_t tx_domain_dist = this_rd_stats.dist;
+ this_rd_stats.dist = dist_block_px_domain(
+ cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+        // For high-energy blocks the pixel domain distortion can occasionally
+        // be artificially low due to clamping at the reconstruction stage,
+        // even when the inverse transform output is hugely different from the
+        // actual residue.
+ if (is_high_energy && this_rd_stats.dist < tx_domain_dist)
+ this_rd_stats.dist = tx_domain_dist;
+ } else {
+ this_rd_stats.dist += sse_diff;
+ }
this_rd_stats.sse = block_sse;
}
@@ -3396,7 +3428,7 @@ RECON_INTRA:
// if the last search tx_type is the best tx_type, we don't need to
// do this again
if (best_tx_type != last_tx_type) {
- if (!cpi->optimize_seg_arr[mbmi->segment_id]) {
+ if (skip_trellis) {
av1_xform_quant(
cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
best_tx_type,
@@ -3404,8 +3436,8 @@ RECON_INTRA:
} else {
av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
tx_size, best_tx_type, AV1_XFORM_QUANT_FP);
- av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, 1,
- &rate_cost);
+ av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx,
+ cpi->sf.trellis_eob_fast, &rate_cost);
}
}
@@ -3432,12 +3464,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
- const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_inter = is_inter_block(xd->mi[0]);
const AV1_COMP *cpi = args->cpi;
ENTROPY_CONTEXT *a = args->t_above + blk_col;
ENTROPY_CONTEXT *l = args->t_left + blk_row;
const AV1_COMMON *cm = &cpi->common;
- int64_t rd1, rd2, rd;
RD_STATS this_rd_stats;
av1_init_rd_stats(&this_rd_stats);
@@ -3447,7 +3478,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
return;
}
- if (!is_inter_block(mbmi)) {
+ if (!is_inter) {
av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
}
@@ -3455,10 +3486,11 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
&txb_ctx, args->ftxs_mode, args->use_fast_coef_costing,
- args->best_rd - args->this_rd, &this_rd_stats);
+ args->skip_trellis, args->best_rd - args->this_rd,
+ &this_rd_stats);
if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
- assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8);
+ assert(!is_inter || plane_bsize < BLOCK_8X8);
cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
}
@@ -3477,37 +3509,26 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
else
set_blk_skip(x, plane, blk_idx, 0);
- rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
- rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse);
+ const int64_t rd1 = RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+ const int64_t rd2 = RDCOST(x->rdmult, 0, this_rd_stats.sse);
// TODO(jingning): temporarily enabled only for luma component
- rd = AOMMIN(rd1, rd2);
+ const int64_t rd = AOMMIN(rd1, rd2);
this_rd_stats.skip &= !x->plane[plane].eobs[block];
-#if CONFIG_ONE_PASS_SVM
- if (plane == AOM_PLANE_Y && plane_bsize >= BLOCK_8X8) {
- int eob = x->plane[plane].eobs[block];
- av1_add_reg_stat(&this_rd_stats, eob, rd, this_rd_stats.sse, blk_row,
- blk_col, plane_bsize, txsize_to_bsize[tx_size]);
- }
-#endif
-
av1_merge_rd_stats(&args->rd_stats, &this_rd_stats);
args->this_rd += rd;
- if (args->this_rd > args->best_rd) {
- args->exit_early = 1;
- return;
- }
+ if (args->this_rd > args->best_rd) args->exit_early = 1;
}
static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
RD_STATS *rd_stats, int64_t ref_best_rd,
int64_t this_rd, int plane, BLOCK_SIZE bsize,
TX_SIZE tx_size, int use_fast_coef_casting,
- FAST_TX_SEARCH_MODE ftxs_mode) {
+ FAST_TX_SEARCH_MODE ftxs_mode, int skip_trellis) {
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblockd_plane *const pd = &xd->plane[plane];
struct rdcost_block_args args;
@@ -3518,8 +3539,14 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
args.use_fast_coef_costing = use_fast_coef_casting;
args.ftxs_mode = ftxs_mode;
args.this_rd = this_rd;
+ args.skip_trellis = skip_trellis;
av1_init_rd_stats(&args.rd_stats);
+ if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[tx_size] == TX_64X64) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+
if (plane == 0) xd->mi[0]->tx_size = tx_size;
av1_get_entropy_contexts(bsize, pd, args.t_above, args.t_left);
@@ -3544,23 +3571,20 @@ static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
static int tx_size_cost(const AV1_COMMON *const cm, const MACROBLOCK *const x,
BLOCK_SIZE bsize, TX_SIZE tx_size) {
- const MACROBLOCKD *const xd = &x->e_mbd;
- const MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(bsize == x->e_mbd.mi[0]->sb_type);
+ if (cm->tx_mode != TX_MODE_SELECT || !block_signals_txsize(bsize)) return 0;
- if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type)) {
- const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
- const int depth = tx_size_to_depth(tx_size, bsize);
- const int tx_size_ctx = get_tx_size_context(xd);
- int r_tx_size = x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
- return r_tx_size;
- } else {
- return 0;
- }
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+ const int depth = tx_size_to_depth(tx_size, bsize);
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int tx_size_ctx = get_tx_size_context(xd);
+ return x->tx_size_cost[tx_size_cat][tx_size_ctx][depth];
}
static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
- TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode) {
+ TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode,
+ int skip_trellis) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
@@ -3594,49 +3618,60 @@ static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
mbmi->tx_size = tx_size;
txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOMMIN(this_rd, skip_rd),
AOM_PLANE_Y, bs, tx_size, cpi->sf.use_fast_coef_costing,
- ftxs_mode);
+ ftxs_mode, skip_trellis);
if (rd_stats->rate == INT_MAX) return INT64_MAX;
+  // rd_stats->rate should include all the rate except the skip/non-skip cost,
+  // as that is accounted for by the caller functions after rd evaluation of
+  // all planes. The decisions here, however, should be made after considering
+  // the skip/non-skip header cost.
if (rd_stats->skip) {
if (is_inter) {
rd = RDCOST(x->rdmult, s1, rd_stats->sse);
-#if CONFIG_ONE_PASS_SVM
- // TODO(chiyotsai@google.com): Investigate if these updates are really
- // needed.
- av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
} else {
rd = RDCOST(x->rdmult, s1 + r_tx_size * tx_select, rd_stats->sse);
-#if CONFIG_ONE_PASS_SVM
- av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
+ rd_stats->rate += r_tx_size * tx_select;
}
} else {
rd = RDCOST(x->rdmult, rd_stats->rate + s0 + r_tx_size * tx_select,
rd_stats->dist);
+ rd_stats->rate += r_tx_size * tx_select;
+ }
+ if (is_inter && !xd->lossless[xd->mi[0]->segment_id]) {
+ int64_t temp_skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ if (temp_skip_rd <= rd) {
+ rd = temp_skip_rd;
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ }
}
-
- if (tx_select) rd_stats->rate += r_tx_size;
-
- if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip))
- rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
return rd;
}
static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
- MACROBLOCK *x, int *r, int64_t *d, int *s,
- int64_t *sse, int64_t ref_best_rd) {
- RD_STATS rd_stats;
+ MACROBLOCK *x, int64_t ref_best_rd,
+ RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
av1_subtract_plane(x, bs, 0);
x->rd_model = LOW_TXFM_RD;
- int64_t rd = txfm_yrd(cpi, x, &rd_stats, ref_best_rd, bs,
- max_txsize_rect_lookup[bs], FTXS_NONE);
+ int skip_trellis = cpi->optimize_seg_arr[xd->mi[0]->segment_id] ==
+ NO_ESTIMATE_YRD_TRELLIS_OPT;
+ const int64_t rd =
+ txfm_yrd(cpi, x, rd_stats, ref_best_rd, bs, max_txsize_rect_lookup[bs],
+ FTXS_NONE, skip_trellis);
x->rd_model = FULL_TXFM_RD;
- *r = rd_stats.rate;
- *d = rd_stats.dist;
- *s = rd_stats.skip;
- *sse = rd_stats.sse;
+ if (rd != INT64_MAX) {
+ const int skip_ctx = av1_get_skip_context(xd);
+ if (rd_stats->skip) {
+ const int s1 = x->skip_cost[skip_ctx][1];
+ rd_stats->rate = s1;
+ } else {
+ const int s0 = x->skip_cost[skip_ctx][0];
+ rd_stats->rate += s0;
+ }
+ }
return rd;
}
@@ -3662,7 +3697,7 @@ static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOMMIN(this_rd, skip_rd),
AOM_PLANE_Y, bs, mbmi->tx_size,
- cpi->sf.use_fast_coef_costing, FTXS_NONE);
+ cpi->sf.use_fast_coef_costing, FTXS_NONE, 0);
// Reset the pruning flags.
av1_zero(x->tx_search_prune);
x->tx_split_prune_flag = 0;
@@ -3677,7 +3712,7 @@ static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
mbmi->tx_size = TX_4X4;
// TODO(any) : Pass this_rd based on skip/non-skip cost
txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, 0, 0, bs, mbmi->tx_size,
- cpi->sf.use_fast_coef_costing, FTXS_NONE);
+ cpi->sf.use_fast_coef_costing, FTXS_NONE, 0);
}
static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
@@ -3707,55 +3742,64 @@ static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
MACROBLOCK *x, RD_STATS *rd_stats,
int64_t ref_best_rd, BLOCK_SIZE bs) {
+ av1_invalid_rd_stats(rd_stats);
+
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
- int64_t rd = INT64_MAX;
- int n;
- int start_tx;
- int depth;
- int64_t best_rd = INT64_MAX;
const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
- TX_SIZE best_tx_size = max_rect_tx_size;
- TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
- uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
- const int n4 = bsize_to_num_blk(bs);
const int tx_select = cm->tx_mode == TX_MODE_SELECT;
-
- av1_invalid_rd_stats(rd_stats);
+ int start_tx;
+ int depth, init_depth;
if (tx_select) {
start_tx = max_rect_tx_size;
- depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
- is_inter_block(mbmi), &cpi->sf);
+ init_depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
+ is_inter_block(mbmi), &cpi->sf);
} else {
const TX_SIZE chosen_tx_size = tx_size_from_tx_mode(bs, cm->tx_mode);
start_tx = chosen_tx_size;
- depth = MAX_TX_DEPTH;
+ init_depth = MAX_TX_DEPTH;
}
prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16);
- for (n = start_tx; depth <= MAX_TX_DEPTH; depth++, n = sub_tx_size_map[n]) {
+ TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ TX_SIZE best_tx_size = max_rect_tx_size;
+ int64_t best_rd = INT64_MAX;
+ const int n4 = bsize_to_num_blk(bs);
+ x->rd_model = FULL_TXFM_RD;
+ depth = init_depth;
+ int64_t rd[MAX_TX_DEPTH + 1] = { INT64_MAX, INT64_MAX, INT64_MAX };
+ for (int n = start_tx; depth <= MAX_TX_DEPTH;
+ depth++, n = sub_tx_size_map[n]) {
#if CONFIG_DIST_8X8
if (x->using_dist_8x8) {
if (tx_size_wide[n] < 8 || tx_size_high[n] < 8) continue;
}
#endif
+ if (!cpi->oxcf.enable_tx64 && txsize_sqr_up_map[n] == TX_64X64) continue;
+
RD_STATS this_rd_stats;
- if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD;
- rd = txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE);
- x->rd_model = FULL_TXFM_RD;
+ rd[depth] =
+ txfm_yrd(cpi, x, &this_rd_stats, ref_best_rd, bs, n, FTXS_NONE, 0);
- if (rd < best_rd) {
+ if (rd[depth] < best_rd) {
memcpy(best_txk_type, mbmi->txk_type,
sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
best_tx_size = n;
- best_rd = rd;
+ best_rd = rd[depth];
*rd_stats = this_rd_stats;
}
if (n == TX_4X4) break;
+    // If we are searching three depths, prune the smallest size for
+    // low-contrast blocks, depending on the rd results of the first two
+    // depths.
+ if (depth > init_depth && depth != MAX_TX_DEPTH &&
+ x->source_variance < 256) {
+ if (rd[depth - 1] != INT64_MAX && rd[depth] > rd[depth - 1]) break;
+ }
}
if (rd_stats->rate != INT_MAX) {
@@ -3770,14 +3814,245 @@ static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
x->tx_split_prune_flag = 0;
}
+// origin_threshold * 128 / 100
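+// e.g., a stored value of 64 corresponds to origin_threshold == 50, since
+// 50 * 128 / 100 == 64.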
+static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
+ {
+ 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
+ },
+ {
+ 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
+ },
+ {
+ 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
+ },
+};
+
+// lookup table for predict_skip_flag
+// int max_tx_size = max_txsize_rect_lookup[bsize];
+// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
+// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
+static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
+ TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4,
+ TX_8X8, TX_8X8, TX_16X16, TX_16X16,
+};
+
+// Uses simple features on top of DCT coefficients to quickly predict
+// whether the optimal RD decision is to skip encoding the residual.
+// The sse value is stored in dist.
+static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
+ int reduced_tx_set) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
+
+ *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize, NULL);
+
+ const int64_t mse = *dist / bw / bh;
+ // Normalized quantizer takes the transform upscaling factor (8 for tx size
+ // smaller than 32) into account.
+ const int16_t normalized_dc_q = dc_q >> 3;
+ const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
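+  // e.g., dc_q = 64 gives normalized_dc_q = 8 and mse_thresh = 8 * 8 / 8 = 8.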
+  // Predict not to skip when the mse is larger than the threshold.
+ if (mse > mse_thresh) return 0;
+
+ const int max_tx_size = max_predict_sf_tx_size[bsize];
+ const int tx_h = tx_size_high[max_tx_size];
+ const int tx_w = tx_size_wide[max_tx_size];
+ DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
+ TxfmParam param;
+ param.tx_type = DCT_DCT;
+ param.tx_size = max_tx_size;
+ param.bd = xd->bd;
+ param.is_hbd = is_cur_buf_hbd(xd);
+ param.lossless = 0;
+ param.tx_set_type = av1_get_ext_tx_set_type(
+ param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+ const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
+ const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
+ const int16_t *src_diff = x->plane[0].src_diff;
+ const int n_coeff = tx_w * tx_h;
+ const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+ const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
+ const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
+ for (int row = 0; row < bh; row += tx_h) {
+ for (int col = 0; col < bw; col += tx_w) {
+ av1_fwd_txfm(src_diff + col, coefs, bw, &param);
+      // Operating in the TX domain, not on pixels; we want the QTX quantizers.
+ const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
+ if (dc_coef >= dc_thresh) return 0;
+ for (int i = 1; i < n_coeff; ++i) {
+ const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
+ if (ac_coef >= ac_thresh) return 0;
+ }
+ }
+ src_diff += tx_h * bw;
+ }
+ return 1;
+}
+
+// Used to set proper context for early termination with skip = 1.
+static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
+ int64_t dist) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int n4 = bsize_to_num_blk(bsize);
+ const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
+ memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
+ mbmi->tx_size = tx_size;
+ for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
+ rd_stats->skip = 1;
+ if (is_cur_buf_hbd(xd)) dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+ rd_stats->dist = rd_stats->sse = (dist << 4);
+  // Though the decision is to mark the block as skip based on luma stats,
+  // it is possible that the block becomes non-skip after chroma rd. In
+  // addition, the intermediate non-skip costs calculated by the caller
+  // function will be incorrect if the rate is set to zero (i.e., if
+  // zero_blk_rate is not accounted for). Hence an intermediate rate is
+  // populated here to code the luma tx blocks as skip; based on the final rd
+  // decision (i.e., skip vs non-skip), the caller function then sets the
+  // final rate accordingly. The rate populated here corresponds to coding all
+  // the tx blocks in the current block with zero_blk_rate (based on the max
+  // tx size possible). E.g., for a 128x128 block, the rate would be
+  // 4 * zero_blk_rate, where zero_blk_rate corresponds to coding one 64x64 tx
+  // block as 'all zeros'.
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(bsize, &xd->plane[0], ctxa, ctxl);
+ ENTROPY_CONTEXT *ta = ctxa;
+ ENTROPY_CONTEXT *tl = ctxl;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(bsize, tx_size, 0, ta, tl, &txb_ctx);
+ const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ rd_stats->rate = zero_blk_rate *
+ (block_size_wide[bsize] >> tx_size_wide_log2[tx_size]) *
+ (block_size_high[bsize] >> tx_size_high_log2[tx_size]);
+}
+
+static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+ const int16_t *diff = x->plane[0].src_diff;
+ const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
+ (uint8_t *)diff, 2 * rows * cols);
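+  // Fold the block size into the low 5 bits so that identical residues of
+  // different block sizes hash to different keys.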
+ return (hash << 5) + bsize;
+}
+
+static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ MB_RD_RECORD *tx_rd_record) {
+ int index;
+ if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
+ index =
+ (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
+ ++tx_rd_record->num;
+ } else {
+ index = tx_rd_record->index_start;
+ tx_rd_record->index_start =
+ (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+ }
+ MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index];
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ tx_rd_info->hash_value = hash;
+ tx_rd_info->tx_size = mbmi->tx_size;
+ memcpy(tx_rd_info->blk_skip, x->blk_skip,
+ sizeof(tx_rd_info->blk_skip[0]) * n4);
+ av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
+ av1_copy(tx_rd_info->txk_type, mbmi->txk_type);
+ tx_rd_info->rd_stats = *rd_stats;
+}
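/* Illustrative sketch (not part of the patch): the ring-buffer insertion
 * pattern used by save_tx_rd_info() above, reduced to plain ints. Once the
 * buffer is full, the oldest entry (index_start) is overwritten and
 * index_start advances, so lookups scan at most BUFFER_LEN records. The
 * names ToyRecord, toy_save, and BUFFER_LEN are made up for this sketch. */
#include <stdio.h>

#define BUFFER_LEN 8

typedef struct {
  int entries[BUFFER_LEN];
  int index_start;  // position of the oldest entry
  int num;          // number of valid entries
} ToyRecord;

static void toy_save(ToyRecord *rec, int value) {
  int index;
  if (rec->num < BUFFER_LEN) {
    index = (rec->index_start + rec->num) % BUFFER_LEN;
    ++rec->num;
  } else {
    index = rec->index_start;
    rec->index_start = (rec->index_start + 1) % BUFFER_LEN;
  }
  rec->entries[index] = value;
}

int main(void) {
  ToyRecord rec = { { 0 }, 0, 0 };
  for (int v = 1; v <= 10; ++v) toy_save(&rec, v);
  // After 10 inserts the buffer holds 3..10; values 1 and 2 were evicted.
  for (int i = 0; i < rec.num; ++i)
    printf("%d ", rec.entries[(rec.index_start + i) % BUFFER_LEN]);
  printf("\n");
  return 0;
}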
+
+static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info,
+ RD_STATS *const rd_stats, MACROBLOCK *const x) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ mbmi->tx_size = tx_rd_info->tx_size;
+ memcpy(x->blk_skip, tx_rd_info->blk_skip,
+ sizeof(tx_rd_info->blk_skip[0]) * n4);
+ av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size);
+ av1_copy(mbmi->txk_type, tx_rd_info->txk_type);
+ *rd_stats = tx_rd_info->rd_stats;
+}
+
+static INLINE int32_t find_mb_rd_info(const MB_RD_RECORD *const mb_rd_record,
+ const int64_t ref_best_rd,
+ const uint32_t hash) {
+ int32_t match_index = -1;
+ if (ref_best_rd != INT64_MAX) {
+ for (int i = 0; i < mb_rd_record->num; ++i) {
+ const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
+ // If there is a match in the tx_rd_record, fetch the RD decision and
+ // terminate early.
+ if (mb_rd_record->tx_rd_info[index].hash_value == hash) {
+ match_index = index;
+ break;
+ }
+ }
+ }
+ return match_index;
+}
+
static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
RD_STATS *rd_stats, BLOCK_SIZE bs,
int64_t ref_best_rd) {
MACROBLOCKD *xd = &x->e_mbd;
av1_init_rd_stats(rd_stats);
-
+ int is_inter = is_inter_block(xd->mi[0]);
assert(bs == xd->mi[0]->sb_type);
+ const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+ const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+
+ uint32_t hash = 0;
+ int32_t match_index = -1;
+ MB_RD_RECORD *mb_rd_record = NULL;
+ const int within_border = mi_row >= xd->tile.mi_row_start &&
+ (mi_row + mi_size_high[bs] < xd->tile.mi_row_end) &&
+ mi_col >= xd->tile.mi_col_start &&
+ (mi_col + mi_size_wide[bs] < xd->tile.mi_col_end);
+ const int is_mb_rd_hash_enabled =
+ (within_border && cpi->sf.use_mb_rd_hash && is_inter);
+ const int n4 = bsize_to_num_blk(bs);
+ if (is_mb_rd_hash_enabled) {
+ hash = get_block_residue_hash(x, bs);
+ mb_rd_record = &x->mb_rd_record;
+ match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+ if (match_index != -1) {
+ MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
+ fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
+ // Reset the pruning flags.
+ av1_zero(x->tx_search_prune);
+ x->tx_split_prune_flag = 0;
+ return;
+ }
+ }
+
+  // If we predict that skip is the optimal RD decision, set the respective
+ // context and terminate early.
+ int64_t dist;
+
+ if (cpi->sf.tx_type_search.use_skip_flag_prediction && is_inter &&
+ (!xd->lossless[xd->mi[0]->segment_id]) &&
+ predict_skip_flag(x, bs, &dist, cpi->common.reduced_tx_set_used)) {
+    // Populate rd_stats as per the skip decision.
+ set_skip_flag(x, rd_stats, bs, dist);
+ // Save the RD search results into tx_rd_record.
+ if (is_mb_rd_hash_enabled)
+ save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+ // Reset the pruning flags.
+ av1_zero(x->tx_search_prune);
+ x->tx_split_prune_flag = 0;
+ return;
+ }
+
if (xd->lossless[xd->mi[0]->segment_id]) {
choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
} else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
@@ -3785,6 +4060,12 @@ static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
} else {
choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
}
+
+ // Save the RD search results into tx_rd_record.
+ if (is_mb_rd_hash_enabled) {
+ assert(mb_rd_record != NULL);
+ save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+ }
}
// Return the rate cost for luma prediction mode info. of intra blocks.
@@ -4527,6 +4808,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
const int *bmode_costs;
PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
const int try_palette =
+ cpi->oxcf.enable_palette &&
av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
uint8_t *best_palette_color_map =
try_palette ? x->palette_buffer->best_palette_color_map : NULL;
@@ -4542,8 +4824,7 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
if (cpi->sf.intra_angle_estimation) {
const int src_stride = x->plane[0].src.stride;
const uint8_t *src = x->plane[0].src.buf;
- angle_estimation(src, src_stride, rows, cols, bsize,
- xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH,
+ angle_estimation(src, src_stride, rows, cols, bsize, is_cur_buf_hbd(xd),
directional_mode_skip_mask);
}
mbmi->filter_intra_mode_info.use_filter_intra = 0;
@@ -4561,6 +4842,11 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
int this_rate, this_rate_tokenonly, s;
int64_t this_distortion, this_rd, this_model_rd;
mbmi->mode = intra_rd_search_mode_order[mode_idx];
+ if ((!cpi->oxcf.enable_smooth_intra || cpi->sf.disable_smooth_intra) &&
+ (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+ mbmi->mode == SMOOTH_V_PRED))
+ continue;
+ if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
mbmi->angle_delta[PLANE_TYPE_Y] = 0;
this_model_rd =
intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col);
@@ -4570,7 +4856,8 @@ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
if (this_model_rd < best_model_rd) best_model_rd = this_model_rd;
is_directional_mode = av1_is_directional_mode(mbmi->mode);
if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
- if (is_directional_mode && av1_use_angle_delta(bsize)) {
+ if (is_directional_mode && av1_use_angle_delta(bsize) &&
+ cpi->oxcf.enable_angle_delta) {
this_rd_stats.rate = INT_MAX;
rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate,
&this_rd_stats, bsize, bmode_costs[mbmi->mode],
@@ -4649,6 +4936,8 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
int plane;
int is_cost_valid = 1;
+ const int is_inter = is_inter_block(mbmi);
+ int64_t this_rd = 0, skip_rd = 0;
av1_init_rd_stats(rd_stats);
if (ref_best_rd < 0) is_cost_valid = 0;
@@ -4657,7 +4946,7 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
- if (is_inter_block(mbmi) && is_cost_valid) {
+ if (is_inter && is_cost_valid) {
for (plane = 1; plane < MAX_MB_PLANE; ++plane)
av1_subtract_plane(x, bsize, plane);
}
@@ -4665,15 +4954,26 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
if (is_cost_valid) {
for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
RD_STATS pn_rd_stats;
- txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, 0, plane, bsize,
- uv_tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+ int64_t chroma_ref_best_rd = ref_best_rd;
+      // For inter blocks, a refined ref_best_rd is used for early exit.
+      // For intra blocks, even when the current rd crosses ref_best_rd, early
+      // exit is not recommended, since the current rd is also used for gating
+      // subsequent modes (say, angular modes).
+ // TODO(any): Extend the early exit mechanism for intra modes as well
+ if (cpi->sf.perform_best_rd_based_gating_for_chroma && is_inter &&
+ chroma_ref_best_rd != INT64_MAX)
+ chroma_ref_best_rd = ref_best_rd - AOMMIN(this_rd, skip_rd);
+ txfm_rd_in_plane(x, cpi, &pn_rd_stats, chroma_ref_best_rd, 0, plane,
+ bsize, uv_tx_size, cpi->sf.use_fast_coef_costing,
+ FTXS_NONE, 0);
if (pn_rd_stats.rate == INT_MAX) {
is_cost_valid = 0;
break;
}
av1_merge_rd_stats(rd_stats, &pn_rd_stats);
- if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) > ref_best_rd &&
- RDCOST(x->rdmult, 0, rd_stats->sse) > ref_best_rd) {
+ this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+ if (AOMMIN(this_rd, skip_rd) > ref_best_rd) {
is_cost_valid = 0;
break;
}
@@ -4688,11 +4988,12 @@ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
return is_cost_valid;
}
-static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
- int blk_row, int blk_col, int plane, int block,
- int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats,
- FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost,
- TXB_RD_INFO *rd_info_array) {
+// Pick transform type for a transform block of tx_size.
+static void tx_type_rd(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+ int blk_row, int blk_col, int plane, int block,
+ int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats,
+ FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost,
+ TXB_RD_INFO *rd_info_array) {
const struct macroblock_plane *const p = &x->plane[plane];
const uint16_t cur_joint_ctx =
(txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
@@ -4720,7 +5021,7 @@ static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
RD_STATS this_rd_stats;
search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
- txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
+ txb_ctx, ftxs_mode, 0, 0, ref_rdcost, &this_rd_stats);
av1_merge_rd_stats(rd_stats, &this_rd_stats);
@@ -4885,9 +5186,9 @@ static void try_tx_block_no_split(
rd_stats->zero_rate = zero_blk_rate;
const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
mbmi->inter_tx_size[index] = tx_size;
- tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
- &txb_ctx, rd_stats, ftxs_mode, ref_best_rd,
- rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
+ tx_type_rd(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize, &txb_ctx,
+ rd_stats, ftxs_mode, ref_best_rd,
+ rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
assert(rd_stats->rate < INT_MAX);
if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
@@ -4895,7 +5196,7 @@ static void try_tx_block_no_split(
rd_stats->skip == 1) &&
!xd->lossless[mbmi->segment_id]) {
#if CONFIG_RD_DEBUG
- av1_update_txb_coeff_cost(rd_stats, plane, tx_size, blk_row, blk_col,
+ av1_update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col,
zero_blk_rate - rd_stats->rate);
#endif // CONFIG_RD_DEBUG
rd_stats->rate = zero_blk_rate;
@@ -4918,13 +5219,6 @@ static void try_tx_block_no_split(
const int txk_type_idx =
av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
no_split->tx_type = mbmi->txk_type[txk_type_idx];
-
-#if CONFIG_ONE_PASS_SVM
- if (plane_bsize >= BLOCK_8X8) {
- av1_add_reg_stat(rd_stats, p->eobs[block], no_split->rd, rd_stats->sse,
- blk_row, blk_col, plane_bsize, txsize_to_bsize[tx_size]);
- }
-#endif
}
static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
@@ -4932,8 +5226,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
- int64_t ref_best_rd, int *is_cost_valid,
- FAST_TX_SEARCH_MODE ftxs_mode,
+ int64_t prev_level_rd, int64_t ref_best_rd,
+ int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
TXB_RD_INFO_NODE *rd_info_node);
static void try_tx_block_split(
@@ -4943,6 +5237,7 @@ static void try_tx_block_split(
int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
RD_STATS *split_rd_stats, int64_t *split_rd) {
+ assert(tx_size < TX_SIZES_ALL);
MACROBLOCKD *const xd = &x->e_mbd;
const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
@@ -4950,44 +5245,37 @@ static void try_tx_block_split(
const int bsw = tx_size_wide_unit[sub_txs];
const int bsh = tx_size_high_unit[sub_txs];
const int sub_step = bsw * bsh;
- RD_STATS this_rd_stats;
- int this_cost_valid = 1;
+ const int nblks =
+ (tx_size_high_unit[tx_size] / bsh) * (tx_size_wide_unit[tx_size] / bsw);
+ assert(nblks > 0);
+ int blk_idx = 0;
int64_t tmp_rd = 0;
-
+ *split_rd = INT64_MAX;
split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
- assert(tx_size < TX_SIZES_ALL);
-
- int blk_idx = 0;
for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) {
+ assert(blk_idx < 4);
const int offsetr = blk_row + r;
const int offsetc = blk_col + c;
if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
- assert(blk_idx < 4);
+
+ RD_STATS this_rd_stats;
+ int this_cost_valid = 1;
select_tx_block(
cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta,
- tl, tx_above, tx_left, &this_rd_stats, ref_best_rd - tmp_rd,
- &this_cost_valid, ftxs_mode,
+ tl, tx_above, tx_left, &this_rd_stats, no_split_rd / nblks,
+ ref_best_rd - tmp_rd, &this_cost_valid, ftxs_mode,
(rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
-
- if (!this_cost_valid) goto LOOP_EXIT;
-
+ if (!this_cost_valid) return;
av1_merge_rd_stats(split_rd_stats, &this_rd_stats);
-
tmp_rd = RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
-
- if (no_split_rd < tmp_rd) {
- this_cost_valid = 0;
- goto LOOP_EXIT;
- }
+ if (no_split_rd < tmp_rd) return;
block += sub_step;
}
}
-LOOP_EXIT : {}
-
- if (this_cost_valid) *split_rd = tmp_rd;
+ *split_rd = tmp_rd;
}
// Search for the best tx partition/type for a given luma block.
@@ -4996,8 +5284,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
- int64_t ref_best_rd, int *is_cost_valid,
- FAST_TX_SEARCH_MODE ftxs_mode,
+ int64_t prev_level_rd, int64_t ref_best_rd,
+ int *is_cost_valid, FAST_TX_SEARCH_MODE ftxs_mode,
TXB_RD_INFO_NODE *rd_info_node) {
assert(tx_size < TX_SIZES_ALL);
av1_init_rd_stats(rd_stats);
@@ -5017,7 +5305,8 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
mbmi->sb_type, tx_size);
struct macroblock_plane *const p = &x->plane[0];
- const int try_no_split = 1;
+ const int try_no_split =
+ cpi->oxcf.enable_tx64 || txsize_sqr_up_map[tx_size] != TX_64X64;
int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
#if CONFIG_DIST_8X8
if (x->using_dist_8x8)
@@ -5042,6 +5331,13 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
if (cpi->sf.txb_split_cap) {
if (p->eobs[block] == 0) try_split = 0;
}
+
+ if (cpi->sf.adaptive_txb_search_level &&
+ (no_split.rd -
+ (no_split.rd >> (2 + cpi->sf.adaptive_txb_search_level))) >
+ prev_level_rd) {
+ try_split = 0;
+ }
}
if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) {
@@ -5089,98 +5385,12 @@ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
}
}
-static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
- RD_STATS *rd_stats, BLOCK_SIZE bsize,
- int64_t ref_best_rd,
- FAST_TX_SEARCH_MODE ftxs_mode,
- TXB_RD_INFO_NODE *rd_info_tree) {
- MACROBLOCKD *const xd = &x->e_mbd;
- int is_cost_valid = 1;
- int64_t this_rd = 0, skip_rd = 0;
-
- if (ref_best_rd < 0) is_cost_valid = 0;
-
- av1_init_rd_stats(rd_stats);
-
- if (is_cost_valid) {
- const struct macroblockd_plane *const pd = &xd->plane[0];
- const BLOCK_SIZE plane_bsize =
- get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
- const int mi_width = mi_size_wide[plane_bsize];
- const int mi_height = mi_size_high[plane_bsize];
- const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
- const int bh = tx_size_high_unit[max_tx_size];
- const int bw = tx_size_wide_unit[max_tx_size];
- int idx, idy;
- int block = 0;
- int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
- ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
- ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
- TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
- TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
-
- RD_STATS pn_rd_stats;
- const int init_depth =
- get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
- av1_init_rd_stats(&pn_rd_stats);
-
- av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
- memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
- memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
- const int skip_ctx = av1_get_skip_context(xd);
- const int s0 = x->skip_cost[skip_ctx][0];
- const int s1 = x->skip_cost[skip_ctx][1];
-
- skip_rd = RDCOST(x->rdmult, s1, 0);
- this_rd = RDCOST(x->rdmult, s0, 0);
- for (idy = 0; idy < mi_height; idy += bh) {
- for (idx = 0; idx < mi_width; idx += bw) {
- int64_t best_rd_sofar = (ref_best_rd - (AOMMIN(skip_rd, this_rd)));
- select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth,
- plane_bsize, ctxa, ctxl, tx_above, tx_left,
- &pn_rd_stats, best_rd_sofar, &is_cost_valid, ftxs_mode,
- rd_info_tree);
- if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
- av1_invalid_rd_stats(rd_stats);
- return;
- }
- av1_merge_rd_stats(rd_stats, &pn_rd_stats);
- skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
- this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
- block += step;
- if (rd_info_tree != NULL) rd_info_tree += 1;
- }
- }
- if (skip_rd <= this_rd) {
- rd_stats->rate = 0;
- rd_stats->dist = rd_stats->sse;
- rd_stats->skip = 1;
-#if CONFIG_ONE_PASS_SVM
- av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
- } else {
- rd_stats->skip = 0;
- }
- }
-
- if (!is_cost_valid) {
- // reset cost value
- av1_invalid_rd_stats(rd_stats);
- }
-}
-
-static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
+static int64_t select_tx_size_and_type(const AV1_COMP *cpi, MACROBLOCK *x,
RD_STATS *rd_stats, BLOCK_SIZE bsize,
int64_t ref_best_rd,
TXB_RD_INFO_NODE *rd_info_tree) {
- const int fast_tx_search = cpi->sf.tx_size_search_method > USE_FULL_RD;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = xd->mi[0];
- const int is_inter = is_inter_block(mbmi);
- const int skip_ctx = av1_get_skip_context(xd);
- int s0 = x->skip_cost[skip_ctx][0];
- int s1 = x->skip_cost[skip_ctx][1];
- int64_t rd;
+ assert(is_inter_block(xd->mi[0]));
// TODO(debargha): enable this as a speed feature where the
  // tx-size search loop above (formerly select_inter_block_yrd()) will use a simplified search
@@ -5188,16 +5398,71 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
// will use more complex search given that the transform partitions have
// already been decided.
+ const int fast_tx_search = cpi->sf.tx_size_search_method > USE_FULL_RD;
int64_t rd_thresh = ref_best_rd;
if (fast_tx_search && rd_thresh < INT64_MAX) {
if (INT64_MAX - rd_thresh > (rd_thresh >> 3)) rd_thresh += (rd_thresh >> 3);
}
assert(rd_thresh > 0);
- FAST_TX_SEARCH_MODE ftxs_mode =
+ const FAST_TX_SEARCH_MODE ftxs_mode =
fast_tx_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
- select_inter_block_yrd(cpi, x, rd_stats, bsize, rd_thresh, ftxs_mode,
- rd_info_tree);
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+ memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int s0 = x->skip_cost[skip_ctx][0];
+ const int s1 = x->skip_cost[skip_ctx][1];
+ const int init_depth =
+ get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int step = bw * bh;
+ int64_t skip_rd = RDCOST(x->rdmult, s1, 0);
+ int64_t this_rd = RDCOST(x->rdmult, s0, 0);
+ int block = 0;
+
+ av1_init_rd_stats(rd_stats);
+ for (int idy = 0; idy < mi_height; idy += bh) {
+ for (int idx = 0; idx < mi_width; idx += bw) {
+ const int64_t best_rd_sofar =
+ (rd_thresh == INT64_MAX) ? INT64_MAX
+ : (rd_thresh - (AOMMIN(skip_rd, this_rd)));
+ int is_cost_valid = 1;
+ RD_STATS pn_rd_stats;
+ select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth,
+ plane_bsize, ctxa, ctxl, tx_above, tx_left, &pn_rd_stats,
+ INT64_MAX, best_rd_sofar, &is_cost_valid, ftxs_mode,
+ rd_info_tree);
+ if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return INT64_MAX;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+ block += step;
+ if (rd_info_tree != NULL) rd_info_tree += 1;
+ }
+ }
+
+ if (skip_rd <= this_rd) {
+ rd_stats->skip = 1;
+ } else {
+ rd_stats->skip = 0;
+ }
+
if (rd_stats->rate == INT_MAX) return INT64_MAX;
// If fast_tx_search is true, only DCT and 1D DCT were tested in
@@ -5208,20 +5473,15 @@ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
return INT64_MAX;
}
+ int64_t rd;
if (rd_stats->skip) {
rd = RDCOST(x->rdmult, s1, rd_stats->sse);
-#if CONFIG_ONE_PASS_SVM
- // TODO(chiyotsai@google.com): Investigate if these updates are really
- // needed.
- av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
} else {
rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+ if (!xd->lossless[xd->mi[0]->segment_id])
+ rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
}
- if (is_inter && !xd->lossless[xd->mi[0]->segment_id] && !(rd_stats->skip))
- rd = AOMMIN(rd, RDCOST(x->rdmult, s1, rd_stats->sse));
-
return rd;
}
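
Editor's note: the tail of select_tx_size_and_type() weighs signalling skip (rate s1, distortion equal to the SSE) against coding the residual (token rate plus the non-skip flag s0). A hedged sketch of that comparison with a simplified rd-cost function; the real RDCOST macro applies libaom's fixed-point shifts, omitted here.

#include <stdint.h>

// Simplified rd cost: rate weighted by the Lagrange multiplier 'rdmult'.
static int64_t sketch_rdcost(int64_t rdmult, int rate, int64_t dist) {
  return rdmult * rate + dist;
}

// Pick between the skip path and coding the residual, as the hunk above
// does after merging the per-block rd stats.
static int64_t sketch_final_rd(int64_t rdmult, int s0, int s1, int rate,
                               int64_t dist, int64_t sse, int *skip) {
  const int64_t skip_rd = sketch_rdcost(rdmult, s1, sse);
  const int64_t code_rd = sketch_rdcost(rdmult, rate + s0, dist);
  *skip = skip_rd <= code_rd;
  return *skip ? skip_rd : code_rd;
}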
@@ -5260,8 +5520,8 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
.txb_skip_cost[txb_ctx.txb_skip_ctx][1];
rd_stats->zero_rate = zero_blk_rate;
rd_stats->ref_rdcost = ref_best_rd;
- tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
- &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL);
+ tx_type_rd(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+ &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL);
const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
@@ -5274,20 +5534,9 @@ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
x->plane[0].txb_entropy_ctx[block] = 0;
update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
DCT_DCT);
-#if CONFIG_ONE_PASS_SVM
- av1_add_reg_stat(rd_stats, 0, RDCOST(x->rdmult, 0, rd_stats->sse),
- rd_stats->sse, blk_row, blk_col, plane_bsize,
- txsize_to_bsize[tx_size]);
-#endif
} else {
rd_stats->skip = 0;
set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0);
-#if CONFIG_ONE_PASS_SVM
- av1_add_reg_stat(rd_stats, x->plane[0].eobs[block],
- RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist),
- rd_stats->sse, blk_row, blk_col, plane_bsize,
- txsize_to_bsize[tx_size]);
-#endif
}
if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
rd_stats->rate += x->txfm_partition_cost[ctx][0];
@@ -5395,11 +5644,6 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
rd_stats->rate = 0;
rd_stats->dist = rd_stats->sse;
rd_stats->skip = 1;
-#if CONFIG_ONE_PASS_SVM
- // TODO(chiyotasi@google.com): Investigate if these updates are really
- // needed.
- av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
}
if (this_rd > ref_best_rd) is_cost_valid = 0;
@@ -5410,52 +5654,6 @@ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
return is_cost_valid;
}
-static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
- const int rows = block_size_high[bsize];
- const int cols = block_size_wide[bsize];
- const int16_t *diff = x->plane[0].src_diff;
- const uint32_t hash = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
- (uint8_t *)diff, 2 * rows * cols);
- return (hash << 5) + bsize;
-}
-
-static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x,
- const RD_STATS *const rd_stats,
- MB_RD_RECORD *tx_rd_record) {
- int index;
- if (tx_rd_record->num < RD_RECORD_BUFFER_LEN) {
- index =
- (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
- ++tx_rd_record->num;
- } else {
- index = tx_rd_record->index_start;
- tx_rd_record->index_start =
- (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
- }
- MB_RD_INFO *const tx_rd_info = &tx_rd_record->tx_rd_info[index];
- const MACROBLOCKD *const xd = &x->e_mbd;
- const MB_MODE_INFO *const mbmi = xd->mi[0];
- tx_rd_info->hash_value = hash;
- tx_rd_info->tx_size = mbmi->tx_size;
- memcpy(tx_rd_info->blk_skip, x->blk_skip,
- sizeof(tx_rd_info->blk_skip[0]) * n4);
- av1_copy(tx_rd_info->inter_tx_size, mbmi->inter_tx_size);
- av1_copy(tx_rd_info->txk_type, mbmi->txk_type);
- tx_rd_info->rd_stats = *rd_stats;
-}
-
-static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info,
- RD_STATS *const rd_stats, MACROBLOCK *const x) {
- MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = xd->mi[0];
- mbmi->tx_size = tx_rd_info->tx_size;
- memcpy(x->blk_skip, tx_rd_info->blk_skip,
- sizeof(tx_rd_info->blk_skip[0]) * n4);
- av1_copy(mbmi->inter_tx_size, tx_rd_info->inter_tx_size);
- av1_copy(mbmi->txk_type, tx_rd_info->txk_type);
- *rd_stats = tx_rd_info->rd_stats;
-}
-
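
Editor's note: get_block_residue_hash(), save_tx_rd_info() and fetch_tx_rd_info() are deleted here but still called below (e.g. in pick_tx_size_type_yrd()), so they have presumably been relocated rather than removed. The save path is a fixed-size ring buffer: records are appended until the buffer fills, after which the oldest entry is overwritten. A standalone sketch of that insertion policy with hypothetical types:

#include <stdint.h>

#define SKETCH_RD_RECORD_LEN 8  // illustrative; libaom uses RD_RECORD_BUFFER_LEN

typedef struct {
  uint32_t hash;
  // cached tx sizes / rd stats would live here
} SketchRdInfo;

typedef struct {
  SketchRdInfo buf[SKETCH_RD_RECORD_LEN];
  int index_start;  // index of the oldest entry
  int num;          // number of valid entries
} SketchRdRecord;

// Returns the slot to fill, evicting the oldest record once the ring is
// full, mirroring the insertion policy of the deleted save_tx_rd_info().
static SketchRdInfo *sketch_rd_record_push(SketchRdRecord *rec) {
  int index;
  if (rec->num < SKETCH_RD_RECORD_LEN) {
    index = (rec->index_start + rec->num) % SKETCH_RD_RECORD_LEN;
    ++rec->num;
  } else {
    index = rec->index_start;
    rec->index_start = (rec->index_start + 1) % SKETCH_RD_RECORD_LEN;
  }
  return &rec->buf[index];
}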
static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
const uint32_t hash) {
// Linear search through the circular buffer to find matching hash.
@@ -5706,158 +5904,13 @@ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
return 1;
}
-// origin_threshold * 128 / 100
-static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = {
- {
- 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
- 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
- },
- {
- 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
- 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
- },
- {
- 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
- 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
- },
-};
-
-// lookup table for predict_skip_flag
-// int max_tx_size = max_txsize_rect_lookup[bsize];
-// if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
-// max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
-static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = {
- TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8,
- TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
- TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4,
- TX_8X8, TX_8X8, TX_16X16, TX_16X16,
-};
-
-// Uses simple features on top of DCT coefficients to quickly predict
-// whether optimal RD decision is to skip encoding the residual.
-// The sse value is stored in dist.
-static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
- int reduced_tx_set) {
- const int bw = block_size_wide[bsize];
- const int bh = block_size_high[bsize];
- const MACROBLOCKD *xd = &x->e_mbd;
- const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
-
- *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
- const int64_t mse = *dist / bw / bh;
- // Normalized quantizer takes the transform upscaling factor (8 for tx size
- // smaller than 32) into account.
- const int16_t normalized_dc_q = dc_q >> 3;
- const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
- // Predict not to skip when mse is larger than threshold.
- if (mse > mse_thresh) return 0;
-
- const int max_tx_size = max_predict_sf_tx_size[bsize];
- const int tx_h = tx_size_high[max_tx_size];
- const int tx_w = tx_size_wide[max_tx_size];
- DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
- TxfmParam param;
- param.tx_type = DCT_DCT;
- param.tx_size = max_tx_size;
- param.bd = xd->bd;
- param.is_hbd = get_bitdepth_data_path_index(xd);
- param.lossless = 0;
- param.tx_set_type = av1_get_ext_tx_set_type(
- param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
- const int bd_idx = (xd->bd == 8) ? 0 : ((xd->bd == 10) ? 1 : 2);
- const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
- const int16_t *src_diff = x->plane[0].src_diff;
- const int n_coeff = tx_w * tx_h;
- const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
- const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
- const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
- for (int row = 0; row < bh; row += tx_h) {
- for (int col = 0; col < bw; col += tx_w) {
- av1_fwd_txfm(src_diff + col, coefs, bw, &param);
- // Operating on TX domain, not pixels; we want the QTX quantizers
- const uint32_t dc_coef = (((uint32_t)abs(coefs[0])) << 7);
- if (dc_coef >= dc_thresh) return 0;
- for (int i = 1; i < n_coeff; ++i) {
- const uint32_t ac_coef = (((uint32_t)abs(coefs[i])) << 7);
- if (ac_coef >= ac_thresh) return 0;
- }
- }
- src_diff += tx_h * bw;
- }
- return 1;
-}
-
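
Editor's note: predict_skip_flag(), deleted here but still invoked below, first rejects skip when the per-pixel MSE of the residual exceeds a quantizer-derived threshold, and only then checks each forward-transform coefficient against scaled DC/AC quantizer limits. A sketch of the MSE gate alone; names are illustrative:

#include <stdint.h>

// First gate of skip prediction: bail out (predict "do not skip") when the
// mean squared error is already larger than a threshold derived from the
// transform-domain DC quantizer step.
static int sketch_mse_allows_skip(int64_t sse, int bw, int bh, int16_t dc_q) {
  const int64_t mse = sse / bw / bh;
  // dc_q carries a transform upscaling factor of 8 for tx sizes below 32.
  const int16_t normalized_dc_q = dc_q >> 3;
  const int64_t mse_thresh = (int64_t)normalized_dc_q * normalized_dc_q / 8;
  return mse <= mse_thresh;
}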
-#if CONFIG_ONE_PASS_SVM
-static void calc_regional_sse(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t dist,
- RD_STATS *rd_stats) {
- // TODO(chiyotsai@google.com): Don't need regional sse's unless we are doing
- // none.
- const int bw = block_size_wide[bsize];
- const int bw_mi = bw >> tx_size_wide_log2[0];
- const int bh_mi = bw >> tx_size_high_log2[0];
- const BLOCK_SIZE split_size = get_partition_subsize(bsize, PARTITION_SPLIT);
- int64_t dist_0, dist_1, dist_2, dist_3;
- MACROBLOCKD *xd = &x->e_mbd;
- dist_0 = pixel_diff_dist(x, AOM_PLANE_Y, 0, 0, bsize, split_size);
- dist_1 = pixel_diff_dist(x, AOM_PLANE_Y, 0, bw_mi / 2, bsize, split_size);
- dist_2 = pixel_diff_dist(x, AOM_PLANE_Y, bh_mi / 2, 0, bsize, split_size);
- dist_3 =
- pixel_diff_dist(x, AOM_PLANE_Y, bh_mi / 2, bw_mi / 2, bsize, split_size);
-
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
- dist_0 = ROUND_POWER_OF_TWO(dist_0, (xd->bd - 8) * 2);
- dist_1 = ROUND_POWER_OF_TWO(dist_1, (xd->bd - 8) * 2);
- dist_2 = ROUND_POWER_OF_TWO(dist_2, (xd->bd - 8) * 2);
- dist_3 = ROUND_POWER_OF_TWO(dist_3, (xd->bd - 8) * 2);
- }
- const int scaling_factor = MAX_MIB_SIZE * MAX_MIB_SIZE;
- rd_stats->y_sse = (dist << 4);
- rd_stats->sse_0 = (dist_0 << 4) * scaling_factor;
- rd_stats->sse_1 = (dist_1 << 4) * scaling_factor;
- rd_stats->sse_2 = (dist_2 << 4) * scaling_factor;
- rd_stats->sse_3 = (dist_3 << 4) * scaling_factor;
- av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-}
-#endif
-
-// Used to set proper context for early termination with skip = 1.
-static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
- int64_t dist) {
- MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = xd->mi[0];
- const int n4 = bsize_to_num_blk(bsize);
- const TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
- memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
- memset(mbmi->inter_tx_size, tx_size, sizeof(mbmi->inter_tx_size));
- mbmi->tx_size = tx_size;
- for (int i = 0; i < n4; ++i) set_blk_skip(x, 0, i, 1);
- rd_stats->skip = 1;
- rd_stats->rate = 0;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
- rd_stats->dist = rd_stats->sse = (dist << 4);
-}
-
-static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
- RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
- int mi_col, int64_t ref_best_rd) {
+// Search for best transform size and type for luma inter blocks.
+static void pick_tx_size_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, int64_t ref_best_rd) {
const AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = xd->mi[0];
- int64_t rd = INT64_MAX;
- int64_t best_rd = INT64_MAX;
- const int is_inter = is_inter_block(mbmi);
- const int n4 = bsize_to_num_blk(bsize);
- // Get the tx_size 1 level down
- const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]];
- const TxSetType tx_set_type =
- av1_get_ext_tx_set_type(min_tx_size, is_inter, cm->reduced_tx_set_used);
- const int within_border =
- mi_row >= xd->tile.mi_row_start &&
- (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
- mi_col >= xd->tile.mi_col_start &&
- (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
+ assert(is_inter_block(xd->mi[0]));
av1_invalid_rd_stats(rd_stats);
@@ -5874,8 +5927,7 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
// tighter.
assert(cpi->sf.model_based_prune_tx_search_level >= 0 &&
cpi->sf.model_based_prune_tx_search_level <= 2);
- static const int prune_factor_by8[] = { 2 + MODELRD_TYPE_TX_SEARCH_PRUNE,
- 4 + MODELRD_TYPE_TX_SEARCH_PRUNE };
+ static const int prune_factor_by8[] = { 3, 5 };
if (!model_skip &&
((model_rd *
prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >>
@@ -5883,38 +5935,41 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
return;
}
- const uint32_t hash = get_block_residue_hash(x, bsize);
- MB_RD_RECORD *mb_rd_record = &x->mb_rd_record;
-
- if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_mb_rd_hash) {
- for (int i = 0; i < mb_rd_record->num; ++i) {
- const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
- // If there is a match in the tx_rd_record, fetch the RD decision and
- // terminate early.
- if (mb_rd_record->tx_rd_info[index].hash_value == hash) {
- MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[index];
- fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
- return;
- }
+ uint32_t hash = 0;
+ int32_t match_index = -1;
+ MB_RD_RECORD *mb_rd_record = NULL;
+ const int within_border =
+ mi_row >= xd->tile.mi_row_start &&
+ (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
+ mi_col >= xd->tile.mi_col_start &&
+ (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
+ const int is_mb_rd_hash_enabled = (within_border && cpi->sf.use_mb_rd_hash);
+ const int n4 = bsize_to_num_blk(bsize);
+ if (is_mb_rd_hash_enabled) {
+ hash = get_block_residue_hash(x, bsize);
+ mb_rd_record = &x->mb_rd_record;
+ match_index = find_mb_rd_info(mb_rd_record, ref_best_rd, hash);
+ if (match_index != -1) {
+ MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[match_index];
+ fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
+ return;
}
}
  // If we predict that skip is the optimal RD decision, set the respective
// context and terminate early.
int64_t dist;
- if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction &&
+ if (cpi->sf.tx_type_search.use_skip_flag_prediction &&
predict_skip_flag(x, bsize, &dist, cm->reduced_tx_set_used)) {
set_skip_flag(x, rd_stats, bsize, dist);
-#if CONFIG_ONE_PASS_SVM
- if (bsize >= BLOCK_8X8 && mi_size_wide[bsize] == mi_size_high[bsize] &&
- mbmi->partition == PARTITION_NONE) {
- calc_regional_sse(x, bsize, dist, rd_stats);
- }
-#endif
// Save the RD search results into tx_rd_record.
- if (within_border) save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+ if (is_mb_rd_hash_enabled)
+ save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
return;
}
+#if CONFIG_SPEED_STATS
+ ++x->tx_search_count;
+#endif // CONFIG_SPEED_STATS
// Precompute residual hashes and find existing or add new RD records to
// store and reuse rate and distortion values to speed up TX size search.
@@ -5925,20 +5980,20 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
find_tx_size_rd_records(x, bsize, mi_row, mi_col, matched_rd_info);
}
+ // Get the tx_size 1 level down
+ const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]];
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(min_tx_size, 1, cm->reduced_tx_set_used);
prune_tx(cpi, bsize, x, xd, tx_set_type);
int found = 0;
-
RD_STATS this_rd_stats;
av1_init_rd_stats(&this_rd_stats);
+ const int64_t rd =
+ select_tx_size_and_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
+ found_rd_info ? matched_rd_info : NULL);
- rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
- found_rd_info ? matched_rd_info : NULL);
- assert(IMPLIES(this_rd_stats.skip && !this_rd_stats.invalid_rate,
- this_rd_stats.rate == 0));
-
- ref_best_rd = AOMMIN(rd, ref_best_rd);
- if (rd < best_rd) {
+ if (rd < INT64_MAX) {
*rd_stats = this_rd_stats;
found = 1;
}
@@ -5954,136 +6009,76 @@ static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
if (!found) return;
// Save the RD search results into tx_rd_record.
- if (within_border && cpi->sf.use_mb_rd_hash)
+ if (is_mb_rd_hash_enabled) {
+ assert(mb_rd_record != NULL);
save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
-}
-
-#define FAVOR_CHROMA_SKIP 1
-static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
- int blk_col, int plane, int block, TX_SIZE tx_size,
- BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
- ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats,
- FAST_TX_SEARCH_MODE ftxs_mode) {
- assert(plane > 0);
- assert(tx_size < TX_SIZES_ALL);
- MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = xd->mi[0];
- const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
- const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
- if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
-
- ENTROPY_CONTEXT *ta = above_ctx + blk_col;
- ENTROPY_CONTEXT *tl = left_ctx + blk_row;
- TXB_CTX txb_ctx;
- get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx);
- const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
- const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_UV]
- .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
- tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize,
- &txb_ctx, rd_stats, ftxs_mode, INT64_MAX, NULL);
-
- const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
- const int blk_idx = blk_row * mi_width + blk_col;
- const int64_t rdmult = x->rdmult * plane_rd_mult[1][PLANE_TYPE_UV] /
- plane_rd_mult[1][PLANE_TYPE_Y];
- av1_set_txb_context(x, plane, block, tx_size, ta, tl);
- if ((RDCOST(rdmult, rd_stats->rate, rd_stats->dist) >=
- RDCOST(rdmult, zero_blk_rate, rd_stats->sse) ||
- rd_stats->skip == 1) &&
- !xd->lossless[mbmi->segment_id]) {
- rd_stats->rate = zero_blk_rate;
- rd_stats->dist = rd_stats->sse;
- rd_stats->skip = 1;
-#if FAVOR_CHROMA_SKIP
- x->plane[plane].eobs[block] = 0;
- x->plane[plane].txb_entropy_ctx[block] = 0;
- set_blk_skip(x, plane, blk_idx, 1);
-#else
- set_blk_skip(x, plane, blk_idx, 0);
-#endif
- } else {
- set_blk_skip(x, plane, blk_idx, 0);
}
}
-// Return value 0: early termination triggered, no valid rd cost available;
-// 1: rd cost values are valid.
-static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
- RD_STATS *rd_stats, BLOCK_SIZE bsize,
- int64_t non_skip_ref_best_rd,
- int64_t skip_ref_best_rd,
- FAST_TX_SEARCH_MODE ftxs_mode) {
- MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = xd->mi[0];
- int plane;
- int is_cost_valid = 1;
- int64_t this_rd = 0;
- int64_t skip_rd = 0;
-
- if ((non_skip_ref_best_rd < 0) && (skip_ref_best_rd < 0)) is_cost_valid = 0;
-
- av1_init_rd_stats(rd_stats);
+static void model_rd_for_sb_with_fullrdy(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+ const int ref = xd->mi[0]->ref_frame[0];
- if (x->skip_chroma_rd) {
- if (!is_cost_valid) av1_invalid_rd_stats(rd_stats);
+ int64_t rate_sum = 0;
+ int64_t dist_sum = 0;
+ int64_t total_sse = 0;
- return is_cost_valid;
- }
+ for (int plane = plane_from; plane <= plane_to; ++plane) {
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ int64_t sse;
+ int rate;
+ int64_t dist;
- const BLOCK_SIZE bsizec = scale_chroma_bsize(
- bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+ if (x->skip_chroma_rd && plane) continue;
- if (is_inter_block(mbmi) && is_cost_valid) {
- for (plane = 1; plane < MAX_MB_PLANE; ++plane)
- av1_subtract_plane(x, bsizec, plane);
- }
+ if (is_cur_buf_hbd(xd)) {
+ sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+ pd->dst.stride, bw, bh);
+ } else {
+ sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+ bh);
+ }
+ sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
- if (is_cost_valid) {
- for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- const BLOCK_SIZE plane_bsize =
- get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
- const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
- const int mi_height =
- block_size_high[plane_bsize] >> tx_size_high_log2[0];
- const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
- const int bh = tx_size_high_unit[max_tx_size];
- const int bw = tx_size_wide_unit[max_tx_size];
- int idx, idy;
- int block = 0;
- const int step = bh * bw;
- ENTROPY_CONTEXT ta[MAX_MIB_SIZE];
- ENTROPY_CONTEXT tl[MAX_MIB_SIZE];
- av1_get_entropy_contexts(bsizec, pd, ta, tl);
-
- for (idy = 0; idy < mi_height; idy += bh) {
- for (idx = 0; idx < mi_width; idx += bw) {
- RD_STATS pn_rd_stats;
- av1_init_rd_stats(&pn_rd_stats);
- tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size,
- plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode);
- if (pn_rd_stats.rate == INT_MAX) {
- av1_invalid_rd_stats(rd_stats);
- return 0;
- }
- av1_merge_rd_stats(rd_stats, &pn_rd_stats);
- this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
- if ((this_rd > non_skip_ref_best_rd) &&
- (skip_rd > skip_ref_best_rd)) {
- av1_invalid_rd_stats(rd_stats);
- return 0;
- }
- block += step;
- }
+ RD_STATS rd_stats;
+ if (plane == 0) {
+ pick_tx_size_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col,
+ INT64_MAX);
+ if (rd_stats.invalid_rate) {
+ rate = 0;
+ dist = sse << 4;
+ } else {
+ rate = rd_stats.rate;
+ dist = rd_stats.dist;
}
+ } else {
+ model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+ &dist);
}
- } else {
- // reset cost value
- av1_invalid_rd_stats(rd_stats);
+
+ if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+ total_sse += sse;
+ rate_sum += rate;
+ dist_sum += dist;
+
+ if (plane_rate) plane_rate[plane] = rate;
+ if (plane_sse) plane_sse[plane] = sse;
+ if (plane_dist) plane_dist[plane] = dist;
}
- return is_cost_valid;
+ if (skip_txfm_sb) *skip_txfm_sb = total_sse == 0;
+ if (skip_sse_sb) *skip_sse_sb = total_sse << 4;
+ *out_rate_sum = (int)rate_sum;
+ *out_dist_sum = dist_sum;
}
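
Editor's note: model_rd_for_sb_with_fullrdy() mixes a full luma tx search with curve-fit models for chroma, but its per-plane SSE step is self-contained: compute the raw SSE, then shift it back to an 8-bit scale for high-bit-depth input. A sketch of that step over 16-bit samples, so one loop serves 8-, 10- and 12-bit data; the buffer layout here is an assumption:

#include <stdint.h>

// Sum of squared differences between source and prediction, followed by the
// same rounding shift ROUND_POWER_OF_TWO(sse, (bd - 8) * 2) applies in the
// hunk above (the shift is 0 for bd == 8).
static int64_t sketch_plane_sse(const uint16_t *src, int src_stride,
                                const uint16_t *dst, int dst_stride, int bw,
                                int bh, int bd) {
  int64_t sse = 0;
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int d =
          (int)src[r * src_stride + c] - (int)dst[r * dst_stride + c];
      sse += (int64_t)d * d;
    }
  }
  const int shift = (bd - 8) * 2;
  return shift ? (sse + ((int64_t)1 << (shift - 1))) >> shift : sse;
}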
static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -6331,7 +6326,7 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
const BLOCK_SIZE bsize = mbmi->sb_type;
#if CONFIG_DEBUG
- assert(is_cfl_allowed(xd));
+ assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, ssx, ssy);
@@ -6368,7 +6363,7 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
mbmi->cfl_alpha_idx = 0;
mbmi->cfl_alpha_signs = joint_sign;
txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, bsize,
- tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+ tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE, 0);
if (rd_stats.rate == INT_MAX) break;
}
const int alpha_rate = x->cfl_cost[joint_sign][plane][0];
@@ -6396,7 +6391,8 @@ static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c;
mbmi->cfl_alpha_signs = joint_sign;
txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, 0, plane + 1, bsize,
- tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+ tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE,
+ 0);
if (rd_stats.rate == INT_MAX) break;
}
const int alpha_rate = x->cfl_cost[joint_sign][plane][c];
@@ -6469,18 +6465,24 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
(1 << mode)))
continue;
+ if (!cpi->oxcf.enable_smooth_intra && mode >= UV_SMOOTH_PRED &&
+ mode <= UV_SMOOTH_H_PRED)
+ continue;
+
+ if (!cpi->oxcf.enable_paeth_intra && mode == UV_PAETH_PRED) continue;
mbmi->uv_mode = mode;
int cfl_alpha_rate = 0;
if (mode == UV_CFL_PRED) {
- if (!is_cfl_allowed(xd)) continue;
+ if (!is_cfl_allowed(xd) || !cpi->oxcf.enable_cfl_intra) continue;
assert(!is_directional_mode);
const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd);
if (cfl_alpha_rate == INT_MAX) continue;
}
mbmi->angle_delta[PLANE_TYPE_UV] = 0;
- if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) {
+ if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type) &&
+ cpi->oxcf.enable_angle_delta) {
const int rate_overhead =
x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
@@ -6497,7 +6499,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
this_rate = tokenonly_rd_stats.rate +
intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);
if (mode == UV_CFL_PRED) {
- assert(is_cfl_allowed(xd));
+ assert(is_cfl_allowed(xd) && cpi->oxcf.enable_cfl_intra);
#if CONFIG_DEBUG
if (!xd->lossless[mbmi->segment_id])
assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost);
@@ -6516,6 +6518,7 @@ static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
}
const int try_palette =
+ cpi->oxcf.enable_palette &&
av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
if (try_palette) {
uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
@@ -6619,35 +6622,6 @@ static int get_interinter_compound_mask_rate(const MACROBLOCK *const x,
}
}
-typedef struct {
- int eobs;
- int brate;
- int byrate;
- int64_t bdist;
- int64_t bsse;
- int64_t brdcost;
- int_mv mvs[2];
- int_mv pred_mv[2];
- int_mv ref_mv[2];
-
- ENTROPY_CONTEXT ta[2];
- ENTROPY_CONTEXT tl[2];
-} SEG_RDSTAT;
-
-typedef struct {
- int_mv *ref_mv[2];
- int_mv mvp;
-
- int64_t segment_rd;
- int r;
- int64_t d;
- int64_t sse;
- int segment_yrate;
- PREDICTION_MODE modes[4];
- SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
- int mvthresh;
-} BEST_SEG_INFO;
-
static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
return (mv->row >> 3) < mv_limits->row_min ||
(mv->row >> 3) > mv_limits->row_max ||
@@ -6693,7 +6667,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
- conv_params.use_jnt_comp_avg = 0;
+ conv_params.use_dist_wtd_comp_avg = 0;
WarpTypesAllowed warp_types[2];
for (ref = 0; ref < 2; ++ref) {
const WarpedMotionParams *const wm =
@@ -6734,7 +6708,7 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
} else {
int_mv cur_int_mv, init_int_mv;
cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
- cur_int_mv.as_mv.row = cur_mv[id].as_mv.col >> 3;
+ cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
if (cur_int_mv.as_int == init_int_mv.as_int) {
@@ -6780,9 +6754,9 @@ static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
mi_row * MI_SIZE, xd, cm->allow_warped_motion);
const int order_idx = id != 0;
- av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
- &xd->jcp_param.bck_offset,
- &xd->jcp_param.use_jnt_comp_avg, 1);
+ av1_dist_wtd_comp_weight_assign(
+ cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
+ &xd->jcp_param.bck_offset, &xd->jcp_param.use_dist_wtd_comp_avg, 1);
// Do full-pixel compound motion search on the current reference frame.
if (id) xd->plane[plane].pre[0] = ref_yv12[id];
@@ -7036,19 +7010,25 @@ static void setup_buffer_ref_mvs_inter(
struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
const AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
- const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ const YV12_BUFFER_CONFIG *scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref_frame);
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
- const struct scale_factors *const sf =
- &cm->current_frame.frame_refs[ref_frame - 1].sf;
MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
-
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, ref_frame);
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, ref_frame);
assert(yv12 != NULL);
- // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
- // use the UV scaling factors.
- av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
- num_planes);
+ if (scaled_ref_frame) {
+ // Set up the pred block based on the scaled reference, because
+ // av1_mv_pred() doesn't support scaling.
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], scaled_ref_frame, mi_row,
+ mi_col, NULL, NULL, num_planes);
+ } else {
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
+ num_planes);
+ }
// Gets an initial list of candidate vectors from neighbours and orders them
av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
@@ -7056,11 +7036,18 @@ static void setup_buffer_ref_mvs_inter(
mi_col, mbmi_ext->mode_context);
// Further refinement that is encode side only to test the top few candidates
- // in full and choose the best as the centre point for subsequent searches.
+ // in full and choose the best as the center point for subsequent searches.
// The current implementation doesn't support scaling.
- (void)block_size;
- av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
- block_size);
+ av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12_mb[ref_frame][0].stride,
+ ref_frame, block_size);
+
+ // Go back to unscaled reference.
+ if (scaled_ref_frame) {
+ // We temporarily set up the pred block from the scaled reference above.
+ // Go back to the unscaled reference now, for subsequent use.
+ av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
+ num_planes);
+ }
}
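
Editor's note: the rewritten setup_buffer_ref_mvs_inter() temporarily points the prediction block at the scaled reference, because av1_mv_pred() cannot scale on the fly, and restores the unscaled reference before returning. A sketch of that swap-and-restore pattern with hypothetical types:

#include <stdint.h>

typedef struct {
  const uint8_t *buf;
  int stride;
} SketchPredBuf;

// Run an operation that cannot handle scaling against a pre-scaled buffer,
// then restore the canonical (unscaled) buffer for later pipeline stages.
static void sketch_with_scaled_ref(SketchPredBuf *pred,
                                   const SketchPredBuf *scaled,
                                   const SketchPredBuf *unscaled,
                                   void (*op)(const SketchPredBuf *)) {
  if (scaled) *pred = *scaled;    // temporary: op() cannot scale on the fly
  op(pred);
  if (scaled) *pred = *unscaled;  // restore for subsequent use
}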
static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
@@ -7165,13 +7152,13 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
bestsme = av1_full_pixel_search(
cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
- (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0);
+ (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0, &cpi->ss_cfg[SS_CFG_SRC]);
break;
case OBMC_CAUSAL:
- bestsme = av1_obmc_full_pixel_search(cpi, x, &mvp_full, step_param, sadpb,
- MAX_MVSEARCH_STEPS - 1 - step_param,
- 1, &cpi->fn_ptr[bsize], &ref_mv,
- &(x->best_mv.as_mv), 0);
+ bestsme = av1_obmc_full_pixel_search(
+ cpi, x, &mvp_full, step_param, sadpb,
+ MAX_MVSEARCH_STEPS - 1 - step_param, 1, &cpi->fn_ptr[bsize], &ref_mv,
+ &(x->best_mv.as_mv), 0, &cpi->ss_cfg[SS_CFG_SRC]);
break;
default: assert(0 && "Invalid motion mode!\n");
}
@@ -7264,10 +7251,9 @@ static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
x->pred_mv[ref] = x->best_mv.as_mv;
}
-static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst,
+static INLINE void restore_dst_buf(MACROBLOCKD *xd, const BUFFER_SET dst,
const int num_planes) {
- int i;
- for (i = 0; i < num_planes; i++) {
+ for (int i = 0; i < num_planes; i++) {
xd->plane[i].dst.buf = dst.plane[i];
xd->plane[i].dst.stride = dst.stride[i];
}
@@ -7314,9 +7300,9 @@ static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
plane, !ref_idx, MV_PRECISION_Q3, mi_col * MI_SIZE,
mi_row * MI_SIZE, xd, cm->allow_warped_motion);
- av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
- &xd->jcp_param.bck_offset,
- &xd->jcp_param.use_jnt_comp_avg, 1);
+ av1_dist_wtd_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
+ &xd->jcp_param.bck_offset,
+ &xd->jcp_param.use_dist_wtd_comp_avg, 1);
}
// Search for the best mv for one component of a compound,
@@ -7442,7 +7428,7 @@ static void compound_single_motion_search_interinter(
// Prediction buffer from second frame.
DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
uint8_t *second_pred;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ if (is_cur_buf_hbd(xd))
second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
else
second_pred = (uint8_t *)second_pred_alloc_16;
@@ -7572,7 +7558,7 @@ static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
const BLOCK_SIZE f_index = split_qtr[bsize];
assert(f_index != BLOCK_INVALID);
- if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(&x->e_mbd)) {
pred0 = CONVERT_TO_BYTEPTR(pred0);
pred1 = CONVERT_TO_BYTEPTR(pred1);
}
@@ -7622,7 +7608,7 @@ static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
int wedge_types = (1 << get_wedge_bits_lookup(bsize));
const uint8_t *mask;
uint64_t sse;
- const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int hbd = is_cur_buf_hbd(xd);
const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]); // src - pred0
@@ -7693,7 +7679,7 @@ static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
int wedge_types = (1 << get_wedge_bits_lookup(bsize));
const uint8_t *mask;
uint64_t sse;
- const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int hbd = is_cur_buf_hbd(xd);
const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
@@ -7759,7 +7745,7 @@ static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
DIFFWTD_MASK_TYPE cur_mask_type;
int64_t best_rd = INT64_MAX;
DIFFWTD_MASK_TYPE best_mask_type = 0;
- const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ const int hbd = is_cur_buf_hbd(xd);
const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
uint8_t *tmp_mask[2] = { xd->seg_mask, seg_mask };
@@ -7810,7 +7796,7 @@ static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
const int bh = block_size_high[bsize];
DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]); // src - pred1
DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]); // pred1 - pred0
- if (get_bitdepth_data_path_index(xd)) {
+ if (is_cur_buf_hbd(xd)) {
aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(p1), bw,
@@ -7889,7 +7875,7 @@ static void get_inter_predictors_masked_compound(
av1_build_inter_predictors_for_planes_single_buf(
xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides, can_use_previous);
const struct buf_2d *const src = &x->plane[0].src;
- if (get_bitdepth_data_path_index(xd)) {
+ if (is_cur_buf_hbd(xd)) {
aom_highbd_subtract_block(bh, bw, residual1, bw, src->buf, src->stride,
CONVERT_TO_BYTEPTR(*preds1), bw, xd->bd);
aom_highbd_subtract_block(bh, bw, diff10, bw, CONVERT_TO_BYTEPTR(*preds1),
@@ -7904,21 +7890,24 @@ static void get_inter_predictors_masked_compound(
static int64_t build_and_cost_compound_type(
const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
- int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
+ int rate_mv, const BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd,
- int *calc_pred_masked_compound) {
+ int *calc_pred_masked_compound, int32_t *comp_rate, int64_t *comp_dist,
+ int64_t *const comp_model_rd, const int64_t comp_best_model_rd,
+ int64_t *const comp_model_rd_cur) {
const AV1_COMMON *const cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
- int rate_sum;
- int64_t dist_sum;
int64_t best_rd_cur = INT64_MAX;
int64_t rd = INT64_MAX;
- int tmp_skip_txfm_sb;
- int64_t tmp_skip_sse_sb;
const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+ int rate_sum, tmp_skip_txfm_sb;
+ int64_t dist_sum, tmp_skip_sse_sb;
+ // TODO(any): Save the pred and mask calculations into the records as well.
+ // However, this may increase memory requirements, as the compound segment
+ // mask needs to be stored in each record.
if (*calc_pred_masked_compound) {
get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0,
preds1, residual1, diff10, strides);
@@ -7926,7 +7915,7 @@ static int64_t build_and_cost_compound_type(
}
if (cpi->sf.prune_wedge_pred_diff_based && compound_type == COMPOUND_WEDGE) {
unsigned int sse;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ if (is_cur_buf_hbd(xd))
(void)cpi->fn_ptr[bsize].vf(CONVERT_TO_BYTEPTR(*preds0), *strides,
CONVERT_TO_BYTEPTR(*preds1), *strides, &sse);
else
@@ -7934,8 +7923,10 @@ static int64_t build_and_cost_compound_type(
const unsigned int mse =
ROUND_POWER_OF_TWO(sse, num_pels_log2_lookup[bsize]);
// If two predictors are very similar, skip wedge compound mode search
- if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64))
+ if (mse < 8 || (!have_newmv_in_inter_mode(this_mode) && mse < 64)) {
+ *comp_model_rd_cur = INT64_MAX;
return INT64_MAX;
+ }
}
best_rd_cur =
@@ -7947,34 +7938,76 @@ static int64_t build_and_cost_compound_type(
// is unlikely to be the best mode considering the transform rd cost and other
// mode overhead cost
int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
- if (mode_rd > ref_best_rd) return INT64_MAX;
-
- if (have_newmv_in_inter_mode(this_mode) && compound_type == COMPOUND_WEDGE) {
- *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize,
- this_mode, mi_row, mi_col);
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
- model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
- cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
- rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
- if (rd >= best_rd_cur) {
- mbmi->mv[0].as_int = cur_mv[0].as_int;
- mbmi->mv[1].as_int = cur_mv[1].as_int;
+ if (mode_rd > ref_best_rd) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+
+ // Reuse data if a matching record is found
+ if (comp_rate[compound_type] == INT_MAX) {
+ if (have_newmv_in_inter_mode(this_mode) &&
+ compound_type == COMPOUND_WEDGE &&
+ !cpi->sf.disable_interinter_wedge_newmv_search) {
+ *out_rate_mv = interinter_compound_motion_search(
+ cpi, x, cur_mv, bsize, this_mode, mi_row, mi_col);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, ctx, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+ *comp_model_rd_cur = rd;
+ if (rd >= best_rd_cur) {
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ *out_rate_mv = rate_mv;
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0,
+ strides, preds1, strides);
+ *comp_model_rd_cur = best_rd_cur;
+ }
+ } else {
*out_rate_mv = rate_mv;
av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
preds1, strides);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ *comp_model_rd_cur =
+ RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
}
+ RD_STATS rd_stats;
+
+ if (cpi->sf.prune_comp_type_by_model_rd &&
+ (*comp_model_rd_cur > comp_best_model_rd) &&
+ comp_best_model_rd != INT64_MAX) {
+ *comp_model_rd_cur = INT64_MAX;
+ return INT64_MAX;
+ }
+ rd = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &rd_stats);
+ if (rd != INT64_MAX) {
+ rd =
+ RDCOST(x->rdmult, *rs2 + *out_rate_mv + rd_stats.rate, rd_stats.dist);
+ // Back up the rate and distortion for future reuse
+ comp_rate[compound_type] = rd_stats.rate;
+ comp_dist[compound_type] = rd_stats.dist;
+ comp_model_rd[compound_type] = *comp_model_rd_cur;
+ }
} else {
+ assert(comp_dist[compound_type] != INT64_MAX);
+ // When disable_interinter_wedge_newmv_search is set, motion refinement is
+ // disabled. Hence rate and distortion can be reused in this case as well
+ assert(IMPLIES(have_newmv_in_inter_mode(this_mode),
+ cpi->sf.disable_interinter_wedge_newmv_search));
+ assert(mbmi->mv[0].as_int == cur_mv[0].as_int);
+ assert(mbmi->mv[1].as_int == cur_mv[1].as_int);
*out_rate_mv = rate_mv;
- av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
- preds1, strides);
+ // Calculate RD cost based on stored stats
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + comp_rate[compound_type],
+ comp_dist[compound_type]);
+ *comp_model_rd_cur = comp_model_rd[compound_type];
}
- rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
- if (rd != INT64_MAX)
- rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
-
return rd;
}
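
Editor's note: two of the early-out gates in build_and_cost_compound_type() are easy to isolate: the predictor-similarity check that skips the wedge search when the two single-reference predictions barely differ, and the model-RD check that drops a compound type whose fast modeled cost already exceeds the best modeled cost seen. A standalone sketch of both; the names and the inline ROUND_POWER_OF_TWO expansion are illustrative:

#include <stdint.h>

// Gate 1: if the mean squared difference between the two predictors is tiny,
// a wedge mask cannot improve on simple averaging, so skip the wedge search.
// Thresholds 8 and 64 follow the hunk above.
static int sketch_prune_wedge_by_similarity(unsigned int sse,
                                            int num_pels_log2,
                                            int has_newmv) {
  const unsigned int mse =
      (sse + (1u << (num_pels_log2 - 1))) >> num_pels_log2;
  return mse < 8 || (!has_newmv && mse < 64);
}

// Gate 2: drop a compound type whose modeled (pre-transform) rd already
// exceeds the best modeled rd observed so far.
static int sketch_prune_comp_by_model_rd(int64_t model_rd_cur,
                                         int64_t best_model_rd,
                                         int prune_enabled) {
  return prune_enabled && best_model_rd != INT64_MAX &&
         model_rd_cur > best_model_rd;
}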
@@ -8172,8 +8205,9 @@ static INLINE int get_switchable_rate(MACROBLOCK *const x,
// calculate the rdcost of given interpolation_filter
static INLINE int64_t interpolation_filter_rd(
- MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
- int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
+ const BUFFER_SET *const orig_dst, int64_t *const rd,
int *const switchable_rate, int *const skip_txfm_sb,
int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx,
const int switchable_ctx[2], const int skip_pred, int *rate,
@@ -8196,6 +8230,8 @@ static INLINE int64_t interpolation_filter_rd(
return 0;
}
+ (void)tile_data;
+
assert(skip_pred != 2);
assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags));
assert(rate[0] >= 0);
@@ -8209,11 +8245,13 @@ static INLINE int64_t interpolation_filter_rd(
if (skip_pred != cpi->default_interp_skip_flags) {
if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) {
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
#if CONFIG_COLLECT_RD_STATS == 3
RD_STATS rd_stats_y;
- select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
- PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+ pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col,
+ INT64_MAX);
+ PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
#endif // CONFIG_COLLECT_RD_STATS == 3
model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
@@ -8234,8 +8272,8 @@ static INLINE int64_t interpolation_filter_rd(
mbmi->interp_filters = last_best;
return 0;
}
- av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, orig_dst, bsize,
- plane);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ plane, plane);
model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv,
&tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL);
@@ -8287,21 +8325,103 @@ static INLINE int64_t interpolation_filter_rd(
return 0;
}
+static INLINE void pred_dual_interp_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
+ const BUFFER_SET *const orig_dst, int64_t *const rd,
+ int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
+ InterpFilters filter_idx, const int switchable_ctx[2], const int skip_pred,
+ int *rate, int64_t *dist, InterpFilters af_horiz, InterpFilters af_vert,
+ InterpFilters lf_horiz, InterpFilters lf_vert) {
+ if ((af_horiz == lf_horiz) && (af_horiz != SWITCHABLE)) {
+ if (((af_vert == lf_vert) && (af_vert != SWITCHABLE))) {
+ filter_idx = af_horiz + (af_vert * SWITCHABLE_FILTERS);
+ if (filter_idx) {
+ interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+ orig_dst, rd, switchable_rate, skip_txfm_sb,
+ skip_sse_sb, dst_bufs, filter_idx,
+ switchable_ctx, skip_pred, rate, dist);
+ }
+ } else {
+ for (filter_idx = af_horiz; filter_idx < (DUAL_FILTER_SET_SIZE);
+ filter_idx += SWITCHABLE_FILTERS) {
+ if (filter_idx) {
+ interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+ orig_dst, rd, switchable_rate, skip_txfm_sb,
+ skip_sse_sb, dst_bufs, filter_idx,
+ switchable_ctx, skip_pred, rate, dist);
+ }
+ }
+ }
+ } else if ((af_vert == lf_vert) && (af_vert != SWITCHABLE)) {
+ for (filter_idx = (af_vert * SWITCHABLE_FILTERS);
+ filter_idx <= ((af_vert * SWITCHABLE_FILTERS) + 2); filter_idx += 1) {
+ if (filter_idx) {
+ interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+ orig_dst, rd, switchable_rate, skip_txfm_sb,
+ skip_sse_sb, dst_bufs, filter_idx,
+ switchable_ctx, skip_pred, rate, dist);
+ }
+ }
+ }
+}
+
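
Editor's note: pred_dual_interp_filter_rd() above iterates over packed filter indices. The horizontal and vertical filters are folded into one index as horiz + vert * SWITCHABLE_FILTERS; a minimal sketch of that packing, with the libaom constants reproduced here for illustration (SWITCHABLE_FILTERS is 3: regular, smooth, sharp):

enum { SKETCH_SWITCHABLE_FILTERS = 3 };
enum {
  SKETCH_DUAL_FILTER_SET_SIZE =
      SKETCH_SWITCHABLE_FILTERS * SKETCH_SWITCHABLE_FILTERS  // 9 combinations
};

static int sketch_dual_filter_index(int horiz, int vert) {
  return horiz + vert * SKETCH_SWITCHABLE_FILTERS;
}

// Inverse mapping, matching the af_horiz/af_vert arithmetic above.
static void sketch_unpack_dual_filter(int idx, int *horiz, int *vert) {
  *horiz = idx % SKETCH_SWITCHABLE_FILTERS;
  *vert = idx / SKETCH_SWITCHABLE_FILTERS;
}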
// Find the best interp filter if dual_interp_filter = 0
static INLINE void find_best_non_dual_interp_filter(
- MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
- int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
+ const BUFFER_SET *const orig_dst, int64_t *const rd,
int *const switchable_rate, int *const skip_txfm_sb,
int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
const int switchable_ctx[2], const int skip_ver, const int skip_hor,
int *rate, int64_t *dist, int filter_set_size) {
int16_t i;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
  // Regular filter evaluation should already have been done, and it should
  // be the winner
assert(x->e_mbd.mi[0]->interp_filters == filter_sets[0]);
assert(filter_set_size == DUAL_FILTER_SET_SIZE);
-
+ if ((skip_hor & skip_ver) != cpi->default_interp_skip_flags) {
+ const AV1_COMMON *cm = &cpi->common;
+ int bsl, pred_filter_search;
+ InterpFilters af = SWITCHABLE, lf = SWITCHABLE, filter_idx = 0;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ bsl = mi_size_wide_log2[bsize];
+ pred_filter_search =
+ cpi->sf.cb_pred_filter_search
+ ? (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_frame.frame_number)) &
+ 0x1
+ : 0;
+ if (above_mbmi && is_inter_block(above_mbmi)) {
+ af = above_mbmi->interp_filters;
+ }
+ if (left_mbmi && is_inter_block(left_mbmi)) {
+ lf = left_mbmi->interp_filters;
+ }
+ pred_filter_search &= ((af == lf) && (af != SWITCHABLE));
+ if (pred_filter_search) {
+ filter_idx = SWITCHABLE * (af & 0xf);
+ // This assert checks that (filter_x == filter_y) in the non-dual filter case
+ assert((filter_sets[filter_idx] & 0xffff) ==
+ (filter_sets[filter_idx] >> 16));
+ if (cpi->sf.adaptive_interp_filter_search &&
+ (cpi->sf.interp_filter_search_mask & (1 << (filter_idx >> 2)))) {
+ return;
+ }
+ if (filter_idx) {
+ interpolation_filter_rd(
+ x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, skip_txfm_sb, skip_sse_sb, dst_bufs, filter_idx,
+ switchable_ctx, (skip_hor & skip_ver), rate, dist);
+ }
+ return;
+ }
+ }
// Reuse regular filter's modeled rd data for sharp filter for following
// cases
// 1) When bsize is 4x4
@@ -8321,10 +8441,14 @@ static INLINE void find_best_non_dual_interp_filter(
for (i = filter_set_size - 1; i > 0; i -= (SWITCHABLE_FILTERS + 1)) {
  // This assert checks that (filter_x == filter_y) in the non-dual filter case
assert((filter_sets[i] & 0xffff) == (filter_sets[i] >> 16));
- interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
- switchable_rate, skip_txfm_sb, skip_sse_sb,
- dst_bufs, i, switchable_ctx, skip_pred, rate,
- dist);
+ if (cpi->sf.adaptive_interp_filter_search &&
+ (cpi->sf.interp_filter_search_mask & (1 << (i >> 2)))) {
+ continue;
+ }
+ interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+ orig_dst, rd, switchable_rate, skip_txfm_sb,
+ skip_sse_sb, dst_bufs, i, switchable_ctx,
+ skip_pred, rate, dist);
skip_pred = (skip_hor & skip_ver);
}
} else {
@@ -8333,10 +8457,14 @@ static INLINE void find_best_non_dual_interp_filter(
i += (SWITCHABLE_FILTERS + 1)) {
  // This assert checks that (filter_x == filter_y) in the non-dual filter case
assert((filter_sets[i] & 0xffff) == (filter_sets[i] >> 16));
- interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
- switchable_rate, skip_txfm_sb, skip_sse_sb,
- dst_bufs, i, switchable_ctx, skip_pred, rate,
- dist);
+ if (cpi->sf.adaptive_interp_filter_search &&
+ (cpi->sf.interp_filter_search_mask & (1 << (i >> 2)))) {
+ continue;
+ }
+ interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+ orig_dst, rd, switchable_rate, skip_txfm_sb,
+ skip_sse_sb, dst_bufs, i, switchable_ctx,
+ skip_pred, rate, dist);
  // In the first iteration, the smooth filter is evaluated. If the smooth
  // filter (which is less sharp) wins among the regular and smooth filters,
  // sharp filter evaluation is skipped
@@ -8344,8 +8472,6 @@ static INLINE void find_best_non_dual_interp_filter(
// accounting switchable filter rate)
if (cpi->sf.skip_sharp_interp_filter_search &&
skip_pred != cpi->default_interp_skip_flags) {
- MACROBLOCKD *const xd = &x->e_mbd;
- MB_MODE_INFO *const mbmi = xd->mi[0];
if (mbmi->interp_filters == filter_sets[(SWITCHABLE_FILTERS + 1)])
break;
}
@@ -8366,6 +8492,52 @@ static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
return 1;
}
+// Checks whether the characteristics of the current search match the record
+static INLINE int is_comp_rd_match(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ const COMP_RD_STATS *st,
+ const MB_MODE_INFO *const mi,
+ int32_t *comp_rate, int64_t *comp_dist,
+ int64_t *comp_model_rd) {
+ // TODO(ranjit): Ensure that the compound type search always uses the
+ // regular filter, and check whether the following test can be removed.
+ // Check if the interp filter matches the previous case.
+ if (st->filter != mi->interp_filters) return 0;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ // Match MV and reference indices
+ for (int i = 0; i < 2; ++i) {
+ if ((st->ref_frames[i] != mi->ref_frame[i]) ||
+ (st->mv[i].as_int != mi->mv[i].as_int)) {
+ return 0;
+ }
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[i]];
+ if (is_global_mv_block(mi, wm->wmtype) != st->is_global[i]) return 0;
+ }
+
+ // Store the stats for compound average
+ comp_rate[COMPOUND_AVERAGE] = st->rate[COMPOUND_AVERAGE];
+ comp_dist[COMPOUND_AVERAGE] = st->dist[COMPOUND_AVERAGE];
+ comp_model_rd[COMPOUND_AVERAGE] = st->comp_model_rd[COMPOUND_AVERAGE];
+ comp_rate[COMPOUND_DISTWTD] = st->rate[COMPOUND_DISTWTD];
+ comp_dist[COMPOUND_DISTWTD] = st->dist[COMPOUND_DISTWTD];
+ comp_model_rd[COMPOUND_DISTWTD] = st->comp_model_rd[COMPOUND_DISTWTD];
+
+ // For compound wedge/segment, reuse data only if NEWMV is not present in
+ // either of the directions
+ if ((!have_newmv_in_inter_mode(mi->mode) &&
+ !have_newmv_in_inter_mode(st->mode)) ||
+ (cpi->sf.disable_interinter_wedge_newmv_search)) {
+ memcpy(&comp_rate[COMPOUND_WEDGE], &st->rate[COMPOUND_WEDGE],
+ sizeof(comp_rate[COMPOUND_WEDGE]) * 2);
+ memcpy(&comp_dist[COMPOUND_WEDGE], &st->dist[COMPOUND_WEDGE],
+ sizeof(comp_dist[COMPOUND_WEDGE]) * 2);
+ memcpy(&comp_model_rd[COMPOUND_WEDGE], &st->comp_model_rd[COMPOUND_WEDGE],
+ sizeof(comp_model_rd[COMPOUND_WEDGE]) * 2);
+ }
+ return 1;
+}
+
static INLINE int find_interp_filter_in_stats(MACROBLOCK *x,
MB_MODE_INFO *const mbmi) {
const int comp_idx = mbmi->compound_idx;
@@ -8379,9 +8551,27 @@ static INLINE int find_interp_filter_in_stats(MACROBLOCK *x,
}
return -1; // no match result found
}
+// Checks whether a similar compound type search case was evaluated earlier.
+// If found, returns the relevant rd data.
+static INLINE int find_comp_rd_in_stats(const AV1_COMP *const cpi,
+ const MACROBLOCK *x,
+ const MB_MODE_INFO *const mbmi,
+ int32_t *comp_rate, int64_t *comp_dist,
+ int64_t *comp_model_rd) {
+ for (int j = 0; j < x->comp_rd_stats_idx; ++j) {
+ if (is_comp_rd_match(cpi, x, &x->comp_rd_stats[j], mbmi, comp_rate,
+ comp_dist, comp_model_rd)) {
+ return 1;
+ }
+ }
+ return 0; // no match result found
+}
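
Editor's note: find_comp_rd_in_stats() communicates a miss through the arrays themselves: entries left at their sentinels mean "recompute". A hedged sketch of the caller-side initialization this relies on; the array length and the value of COMPOUND_TYPES are assumptions here:

#include <limits.h>
#include <stdint.h>

#define SKETCH_COMPOUND_TYPES 4  // average, distwtd, wedge, diffwtd (assumed)

// Before searching, every per-type slot is set to "no data". A cache hit in
// find_comp_rd_in_stats() overwrites only the entries safe to reuse, and
// build_and_cost_compound_type() tests comp_rate[type] == INT_MAX to decide
// whether a full evaluation is still required.
static void sketch_init_comp_records(int32_t rate[SKETCH_COMPOUND_TYPES],
                                     int64_t dist[SKETCH_COMPOUND_TYPES],
                                     int64_t model_rd[SKETCH_COMPOUND_TYPES]) {
  for (int i = 0; i < SKETCH_COMPOUND_TYPES; ++i) {
    rate[i] = INT_MAX;
    dist[i] = INT64_MAX;
    model_rd[i] = INT64_MAX;
  }
}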
static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
- MB_MODE_INFO *const mbmi) {
+ MB_MODE_INFO *const mbmi,
+ int64_t rd, int skip_txfm_sb,
+ int64_t skip_sse_sb,
+ unsigned int pred_sse) {
const int comp_idx = mbmi->compound_idx;
const int offset = x->interp_filter_stats_idx[comp_idx];
if (offset < MAX_INTERP_FILTER_STATS) {
@@ -8389,19 +8579,52 @@ static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
{ mbmi->mv[0], mbmi->mv[1] },
{ mbmi->ref_frame[0],
mbmi->ref_frame[1] },
- mbmi->interinter_comp.type };
+ mbmi->interinter_comp.type,
+ rd,
+ skip_txfm_sb,
+ skip_sse_sb,
+ pred_sse };
x->interp_filter_stats[comp_idx][offset] = stat;
x->interp_filter_stats_idx[comp_idx]++;
}
}
+static INLINE void save_comp_rd_search_stat(MACROBLOCK *x,
+ const MB_MODE_INFO *const mbmi,
+ const int32_t *comp_rate,
+ const int64_t *comp_dist,
+ const int64_t *comp_model_rd,
+ const int_mv *cur_mv) {
+ const int offset = x->comp_rd_stats_idx;
+ if (offset < MAX_COMP_RD_STATS) {
+ COMP_RD_STATS *const rd_stats = x->comp_rd_stats + offset;
+ memcpy(rd_stats->rate, comp_rate, sizeof(rd_stats->rate));
+ memcpy(rd_stats->dist, comp_dist, sizeof(rd_stats->dist));
+ memcpy(rd_stats->comp_model_rd, comp_model_rd,
+ sizeof(rd_stats->comp_model_rd));
+ memcpy(rd_stats->mv, cur_mv, sizeof(rd_stats->mv));
+ memcpy(rd_stats->ref_frames, mbmi->ref_frame, sizeof(rd_stats->ref_frames));
+ rd_stats->mode = mbmi->mode;
+ rd_stats->filter = mbmi->interp_filters;
+ rd_stats->ref_mv_idx = mbmi->ref_mv_idx;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ for (int i = 0; i < 2; ++i) {
+ const WarpedMotionParams *const wm =
+ &xd->global_motion[mbmi->ref_frame[i]];
+ rd_stats->is_global[i] = is_global_mv_block(mbmi, wm->wmtype);
+ }
+ ++x->comp_rd_stats_idx;
+ }
+}
+
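// A minimal, self-contained sketch (all names hypothetical) of the
// lookup-then-reuse pattern implemented by is_comp_rd_match(),
// find_comp_rd_in_stats() and save_comp_rd_search_stat() above: a small
// linear cache keyed on the search characteristics, whose payload is the
// previously computed per-compound-type rd data.
#include <stdint.h>
#include <string.h>

#define EX_MAX_STATS 4  // stand-in for a MAX_COMP_RD_STATS-style bound
#define EX_NUM_TYPES 4  // one slot per compound type

typedef struct {
  int mode, filter;            // key: the search characteristics
  int ref[2];
  int32_t rate[EX_NUM_TYPES];  // payload: per-type rate
  int64_t dist[EX_NUM_TYPES];  // payload: per-type distortion
} ExCompStat;

typedef struct {
  ExCompStat stats[EX_MAX_STATS];
  int count;
} ExCompCache;

// Returns 1 and copies the payload out when the key matches a stored record,
// mirroring find_comp_rd_in_stats()'s linear scan.
static int ex_find(const ExCompCache *c, int mode, int filter,
                   const int ref[2], int32_t *rate, int64_t *dist) {
  for (int i = 0; i < c->count; ++i) {
    const ExCompStat *s = &c->stats[i];
    if (s->mode == mode && s->filter == filter && s->ref[0] == ref[0] &&
        s->ref[1] == ref[1]) {
      memcpy(rate, s->rate, sizeof(s->rate));
      memcpy(dist, s->dist, sizeof(s->dist));
      return 1;
    }
  }
  return 0;
}

// Appends a record while there is room, like save_comp_rd_search_stat().
static void ex_save(ExCompCache *c, int mode, int filter, const int ref[2],
                    const int32_t *rate, const int64_t *dist) {
  if (c->count >= EX_MAX_STATS) return;
  ExCompStat *s = &c->stats[c->count++];
  s->mode = mode;
  s->filter = filter;
  s->ref[0] = ref[0];
  s->ref[1] = ref[1];
  memcpy(s->rate, rate, sizeof(s->rate));
  memcpy(s->dist, dist, sizeof(s->dist));
}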
static int64_t interpolation_filter_search(
- MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
- int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
- BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES],
- int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
- int64_t *const skip_sse_sb, const int skip_build_pred,
- HandleInterModeArgs *args, int64_t ref_best_rd) {
+ MACROBLOCK *const x, const AV1_COMP *const cpi,
+ const TileDataEnc *tile_data, BLOCK_SIZE bsize, int mi_row, int mi_col,
+ const BUFFER_SET *const tmp_dst, const BUFFER_SET *const orig_dst,
+ InterpFilter (*const single_filter)[REF_FRAMES], int64_t *const rd,
+ int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb, int *skip_build_pred, HandleInterModeArgs *args,
+ int64_t ref_best_rd) {
const AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &x->e_mbd;
@@ -8418,12 +8641,23 @@ static int64_t interpolation_filter_search(
const int ref_frame = xd->mi[0]->ref_frame[0];
(void)single_filter;
- int match_found = -1;
+ int match_found_idx = -1;
const InterpFilter assign_filter = cm->interp_filter;
if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
- match_found = find_interp_filter_in_stats(x, mbmi);
+ match_found_idx = find_interp_filter_in_stats(x, mbmi);
+ }
+ if (match_found_idx != -1) {
+ const int comp_idx = mbmi->compound_idx;
+ *rd = x->interp_filter_stats[comp_idx][match_found_idx].rd;
+ *skip_txfm_sb =
+ x->interp_filter_stats[comp_idx][match_found_idx].skip_txfm_sb;
+ *skip_sse_sb =
+ x->interp_filter_stats[comp_idx][match_found_idx].skip_sse_sb;
+ x->pred_sse[ref_frame] =
+ x->interp_filter_stats[comp_idx][match_found_idx].pred_sse;
+ return 0;
}
- if (!need_search || match_found == -1) {
+ if (!need_search || match_found_idx == -1) {
set_default_interp_filters(mbmi, assign_filter);
}
int switchable_ctx[2];
@@ -8431,13 +8665,16 @@ static int64_t interpolation_filter_search(
switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
*switchable_rate =
get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
- if (!skip_build_pred)
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ if (!(*skip_build_pred)) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 0,
+ av1_num_planes(cm) - 1);
+ *skip_build_pred = 1;
+ }
#if CONFIG_COLLECT_RD_STATS == 3
RD_STATS rd_stats_y;
- select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
- PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+ pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+ PrintPredictionUnitStats(cpi, tile_data, x, &rd_stats_y, bsize);
#endif // CONFIG_COLLECT_RD_STATS == 3
model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
@@ -8458,7 +8695,7 @@ static int64_t interpolation_filter_search(
*skip_sse_sb = best_skip_sse_sb[1];
x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
- if (assign_filter != SWITCHABLE || match_found != -1) {
+ if (assign_filter != SWITCHABLE || match_found_idx != -1) {
return 0;
}
if (!need_search) {
@@ -8493,9 +8730,8 @@ static int64_t interpolation_filter_search(
const int is_compound = has_second_ref(mbmi);
assert(is_intrabc_block(mbmi) == 0);
for (int j = 0; j < 1 + is_compound; ++j) {
- const RefBuffer *ref_buf =
- &cm->current_frame.frame_refs[mbmi->ref_frame[j] - LAST_FRAME];
- const struct scale_factors *const sf = &ref_buf->sf;
+ const struct scale_factors *const sf =
+ get_ref_scale_factors_const(cm, mbmi->ref_frame[j]);
// TODO(any): Refine skip flag calculation considering scaling
if (av1_is_scaled(sf)) {
skip_hor = 0;
@@ -8543,38 +8779,72 @@ static int64_t interpolation_filter_search(
int best_dual_mode = 0;
// Find best of {R}x{R,Sm,Sh}
const int bw = block_size_wide[bsize];
- int skip_pred = bw <= 4 ? cpi->default_interp_skip_flags : skip_hor;
- for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
- if (interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
- switchable_rate, best_skip_txfm_sb,
- best_skip_sse_sb, dst_bufs, i, switchable_ctx,
- skip_pred, tmp_rate, tmp_dist)) {
- best_dual_mode = i;
- }
- skip_pred = skip_hor;
- }
- // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
const int bh = block_size_high[bsize];
- skip_pred = bh <= 4 ? cpi->default_interp_skip_flags : skip_ver;
- assert(filter_set_size == DUAL_FILTER_SET_SIZE);
- for (i = (best_dual_mode + (SWITCHABLE_FILTERS * 2));
- i >= (best_dual_mode + SWITCHABLE_FILTERS); i -= SWITCHABLE_FILTERS) {
- interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
- switchable_rate, best_skip_txfm_sb,
- best_skip_sse_sb, dst_bufs, i, switchable_ctx,
- skip_pred, tmp_rate, tmp_dist);
- skip_pred = skip_ver;
+ int skip_pred;
+ int bsl, pred_filter_search;
+ InterpFilters af_horiz = SWITCHABLE, af_vert = SWITCHABLE,
+ lf_horiz = SWITCHABLE, lf_vert = SWITCHABLE, filter_idx = 0;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ bsl = mi_size_wide_log2[bsize];
+ pred_filter_search =
+ cpi->sf.cb_pred_filter_search
+ ? (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_frame.frame_number)) &
+ 0x1
+ : 0;
+ if (above_mbmi && is_inter_block(above_mbmi)) {
+ af_horiz = av1_extract_interp_filter(above_mbmi->interp_filters, 1);
+ af_vert = av1_extract_interp_filter(above_mbmi->interp_filters, 0);
+ }
+ if (left_mbmi && is_inter_block(left_mbmi)) {
+ lf_horiz = av1_extract_interp_filter(left_mbmi->interp_filters, 1);
+ lf_vert = av1_extract_interp_filter(left_mbmi->interp_filters, 0);
+ }
+ pred_filter_search &= !have_newmv_in_inter_mode(mbmi->mode);
+ pred_filter_search &=
+ ((af_horiz == lf_horiz) && (af_horiz != SWITCHABLE)) ||
+ ((af_vert == lf_vert) && (af_vert != SWITCHABLE));
+ if (pred_filter_search) {
+ pred_dual_interp_filter_rd(
+ x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs,
+ filter_idx, switchable_ctx, (skip_hor & skip_ver), tmp_rate, tmp_dist,
+ af_horiz, af_vert, lf_horiz, lf_vert);
+ } else {
+ skip_pred = bw <= 4 ? cpi->default_interp_skip_flags : skip_hor;
+ for (i = (SWITCHABLE_FILTERS - 1); i >= 1; --i) {
+ if (interpolation_filter_rd(
+ x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs,
+ i, switchable_ctx, skip_pred, tmp_rate, tmp_dist)) {
+ best_dual_mode = i;
+ }
+ skip_pred = skip_hor;
+ }
+ // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
+ skip_pred = bh <= 4 ? cpi->default_interp_skip_flags : skip_ver;
+ assert(filter_set_size == DUAL_FILTER_SET_SIZE);
+ for (i = (best_dual_mode + (SWITCHABLE_FILTERS * 2));
+ i >= (best_dual_mode + SWITCHABLE_FILTERS);
+ i -= SWITCHABLE_FILTERS) {
+ interpolation_filter_rd(
+ x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, i,
+ switchable_ctx, skip_pred, tmp_rate, tmp_dist);
+ skip_pred = skip_ver;
+ }
}
} else if (cm->seq_params.enable_dual_filter == 0) {
find_best_non_dual_interp_filter(
- x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+ x, cpi, tile_data, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
skip_hor, tmp_rate, tmp_dist, filter_set_size);
} else {
// EIGHTTAP_REGULAR mode is calculated beforehand
for (i = 1; i < filter_set_size; ++i) {
- interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
- switchable_rate, best_skip_txfm_sb,
+ interpolation_filter_rd(x, cpi, tile_data, bsize, mi_row, mi_col,
+ orig_dst, rd, switchable_rate, best_skip_txfm_sb,
best_skip_sse_sb, dst_bufs, i, switchable_ctx,
(skip_hor & skip_ver), tmp_rate, tmp_dist);
}
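// A small worked example (hypothetical values, assuming
// get_chessboard_index() reduces to the frame parity) of the chessboard
// gating used above for cb_pred_filter_search: blocks whose parity bit is
// set take the predicted-filter shortcut, the rest run the full dual-filter
// search, and the pattern flips from one frame to the next.
#include <assert.h>

static int ex_chessboard_on(int mi_row, int mi_col, int bsl,
                            int frame_parity) {
  return (((mi_row + mi_col) >> bsl) + frame_parity) & 1;
}

static void ex_chessboard_demo(void) {
  // bsl = 2 corresponds to a block 4 mi units wide (e.g. 16x16 luma).
  assert(ex_chessboard_on(8, 4, 2, 0) == 1);  // shortcut on even frames
  assert(ex_chessboard_on(8, 8, 2, 0) == 0);  // full search on even frames
  assert(ex_chessboard_on(8, 4, 2, 1) == 0);  // pattern flips on odd frames
}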
@@ -8586,7 +8856,8 @@ static int64_t interpolation_filter_search(
// in either of the directions. The condition below is necessary, but not
// sufficient.
assert((skip_hor == 1) || (skip_ver == 1));
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
}
*skip_txfm_sb = best_skip_txfm_sb[1];
*skip_sse_sb = best_skip_sse_sb[1];
@@ -8594,174 +8865,145 @@ static int64_t interpolation_filter_search(
// save search results
if (cpi->sf.skip_repeat_interpolation_filter_search) {
- assert(match_found == -1);
- save_interp_filter_search_stat(x, mbmi);
+ assert(match_found_idx == -1);
+ save_interp_filter_search_stat(x, mbmi, *rd, *skip_txfm_sb, *skip_sse_sb,
+ x->pred_sse[ref_frame]);
}
return 0;
}
-static int txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
- int mi_row, int mi_col, RD_STATS *rd_stats,
- RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
- int mode_rate, int64_t ref_best_rd) {
+static int txfm_search(const AV1_COMP *cpi, const TileDataEnc *tile_data,
+ MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, int mode_rate,
+ int64_t ref_best_rd) {
/*
* This function combines y and uv planes' transform search processes
- * together, when the prediction is generated. It first does subtration to
+ * together, once the prediction is generated. It first does subtraction to
* obtain the prediction error. Then it calls
- * select_tx_type_yrd/super_block_yrd and inter_block_uvrd sequentially and
- * handles the early terminations happen in those functions. At the end, it
+ * pick_tx_size_type_yrd/super_block_yrd and super_block_uvrd sequentially and
+ * handles the early terminations happening in those functions. At the end, it
* computes the rd_stats/_y/_uv accordingly.
*/
const AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
- int skip_txfm_sb = 0;
- const int num_planes = av1_num_planes(cm);
const int ref_frame_1 = mbmi->ref_frame[1];
const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
const int64_t rd_thresh =
ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;
const int skip_ctx = av1_get_skip_context(xd);
+ const int skip_flag_cost[2] = { x->skip_cost[skip_ctx][0],
+ x->skip_cost[skip_ctx][1] };
const int64_t min_header_rate =
- mode_rate + AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+ mode_rate + AOMMIN(skip_flag_cost[0], skip_flag_cost[1]);
// Account for minimum skip and non_skip rd.
// Eventually either one of them will be added to mode_rate
const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
+ (void)tile_data;
if (min_header_rd_possible > ref_best_rd) {
av1_invalid_rd_stats(rd_stats_y);
- av1_invalid_rd_stats(rd_stats);
return 0;
}
av1_init_rd_stats(rd_stats);
av1_init_rd_stats(rd_stats_y);
- av1_init_rd_stats(rd_stats_uv);
rd_stats->rate = mode_rate;
- if (!cpi->common.all_lossless)
- check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
- if (!skip_txfm_sb) {
- int64_t non_skip_rdcosty = INT64_MAX;
- int64_t skip_rdcosty = INT64_MAX;
- int64_t min_rdcosty = INT64_MAX;
- int is_cost_valid_uv = 0;
-
- // cost and distortion
- av1_subtract_plane(x, bsize, 0);
- if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
- // Motion mode
- select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh);
+ // cost and distortion
+ av1_subtract_plane(x, bsize, 0);
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ pick_tx_size_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh);
#if CONFIG_COLLECT_RD_STATS == 2
- PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize);
+ PrintPredictionUnitStats(cpi, tile_data, x, rd_stats_y, bsize);
#endif // CONFIG_COLLECT_RD_STATS == 2
- } else {
- super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
- memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
- for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
- set_blk_skip(x, 0, i, rd_stats_y->skip);
- }
-
- if (rd_stats_y->rate == INT_MAX) {
- av1_invalid_rd_stats(rd_stats);
- // TODO(angiebird): check if we need this
- // restore_dst_buf(xd, *orig_dst, num_planes);
- mbmi->ref_frame[1] = ref_frame_1;
- return 0;
- }
+ } else {
+ super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats_y->skip);
+ }
- av1_merge_rd_stats(rd_stats, rd_stats_y);
+ if (rd_stats_y->rate == INT_MAX) {
+ // TODO(angiebird): check if we need this
+ // restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
- non_skip_rdcosty = RDCOST(
- x->rdmult, rd_stats->rate + x->skip_cost[skip_ctx][0], rd_stats->dist);
- skip_rdcosty =
- RDCOST(x->rdmult, mode_rate + x->skip_cost[skip_ctx][1], rd_stats->sse);
- min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
+ av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+ const int64_t non_skip_rdcosty =
+ RDCOST(x->rdmult, rd_stats->rate + skip_flag_cost[0], rd_stats->dist);
+ const int64_t skip_rdcosty =
+ RDCOST(x->rdmult, mode_rate + skip_flag_cost[1], rd_stats->sse);
+ const int64_t min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
+ if (min_rdcosty > ref_best_rd) {
+ const int64_t tokenonly_rdy =
+ AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
+ RDCOST(x->rdmult, 0, rd_stats_y->sse));
+ // Invalidate rd_stats_y to skip the rest of the motion mode search
+ if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.prune_motion_mode_level) >
+ rd_thresh)
+ av1_invalid_rd_stats(rd_stats_y);
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
- if (min_rdcosty > ref_best_rd) {
- int64_t tokenonly_rdy =
- AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
- RDCOST(x->rdmult, 0, rd_stats_y->sse));
- // Invalidate rd_stats_y to skip the rest of the motion modes search
- if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.adaptive_txb_search_level) >
- rd_thresh)
- av1_invalid_rd_stats(rd_stats_y);
+ av1_init_rd_stats(rd_stats_uv);
+ const int num_planes = av1_num_planes(cm);
+ if (num_planes > 1) {
+ int64_t ref_best_chroma_rd = ref_best_rd;
+ // Calculate best rd cost possible for chroma
+ if (cpi->sf.perform_best_rd_based_gating_for_chroma &&
+ (ref_best_chroma_rd != INT64_MAX)) {
+ ref_best_chroma_rd =
+ (ref_best_chroma_rd - AOMMIN(non_skip_rdcosty, skip_rdcosty));
+ }
+ const int is_cost_valid_uv =
+ super_block_uvrd(cpi, x, rd_stats_uv, bsize, ref_best_chroma_rd);
+ if (!is_cost_valid_uv) {
mbmi->ref_frame[1] = ref_frame_1;
return 0;
}
+ av1_merge_rd_stats(rd_stats, rd_stats_uv);
+ }
- if (num_planes > 1) {
- /* clang-format off */
- is_cost_valid_uv =
- inter_block_uvrd(cpi, x, rd_stats_uv, bsize,
- ref_best_rd - non_skip_rdcosty,
- ref_best_rd - skip_rdcosty, FTXS_NONE);
- if (!is_cost_valid_uv) {
- mbmi->ref_frame[1] = ref_frame_1;
- return 0;
- }
- /* clang-format on */
- av1_merge_rd_stats(rd_stats, rd_stats_uv);
- } else {
- av1_init_rd_stats(rd_stats_uv);
- }
- if (rd_stats->skip) {
- rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
- rd_stats_y->rate = 0;
- rd_stats_uv->rate = 0;
- rd_stats->rate += x->skip_cost[skip_ctx][1];
- mbmi->skip = 0;
- // here mbmi->skip temporarily plays a role as what this_skip2 does
-
- int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- if (tmprd > ref_best_rd) {
- mbmi->ref_frame[1] = ref_frame_1;
- return 0;
- }
-#if CONFIG_ONE_PASS_SVM
- av1_reg_stat_skipmode_update(rd_stats_y, x->rdmult);
-#endif
- } else if (!xd->lossless[mbmi->segment_id] &&
- (RDCOST(x->rdmult,
- rd_stats_y->rate + rd_stats_uv->rate +
- x->skip_cost[skip_ctx][0],
- rd_stats->dist) >=
- RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], rd_stats->sse))) {
- rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
- rd_stats->rate += x->skip_cost[skip_ctx][1];
- rd_stats->dist = rd_stats->sse;
- rd_stats_y->rate = 0;
- rd_stats_uv->rate = 0;
- mbmi->skip = 1;
-#if CONFIG_ONE_PASS_SVM
- av1_reg_stat_skipmode_update(rd_stats_y, x->rdmult);
-#endif
- } else {
- rd_stats->rate += x->skip_cost[skip_ctx][0];
- mbmi->skip = 0;
- }
- } else {
- x->skip = 1;
- mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
- // The cost of skip bit needs to be added.
- mbmi->skip = 0;
- rd_stats->rate += x->skip_cost[skip_ctx][1];
-
- rd_stats->dist = 0;
- rd_stats->sse = 0;
+ if (rd_stats->skip) {
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
rd_stats_y->rate = 0;
rd_stats_uv->rate = 0;
- rd_stats->skip = 1;
- int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ rd_stats->dist = rd_stats->sse;
+ rd_stats_y->dist = rd_stats_y->sse;
+ rd_stats_uv->dist = rd_stats_uv->sse;
+ rd_stats->rate += skip_flag_cost[1];
+ mbmi->skip = 1;
+ // Here mbmi->skip temporarily plays the role of this_skip2
+
+ const int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
if (tmprd > ref_best_rd) {
mbmi->ref_frame[1] = ref_frame_1;
return 0;
}
-#if CONFIG_ONE_PASS_SVM
- av1_add_reg_stat(rd_stats, 0, 0, 0, 0, 0, bsize, bsize);
- av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
-#endif
+ } else if (!xd->lossless[mbmi->segment_id] &&
+ (RDCOST(x->rdmult,
+ rd_stats_y->rate + rd_stats_uv->rate + skip_flag_cost[0],
+ rd_stats->dist) >=
+ RDCOST(x->rdmult, skip_flag_cost[1], rd_stats->sse))) {
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats->rate += skip_flag_cost[1];
+ rd_stats->dist = rd_stats->sse;
+ rd_stats_y->dist = rd_stats_y->sse;
+ rd_stats_uv->dist = rd_stats_uv->sse;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ mbmi->skip = 1;
+ } else {
+ rd_stats->rate += skip_flag_cost[0];
+ mbmi->skip = 0;
}
+
return 1;
}
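// A condensed sketch of the final skip decision made in txfm_search() above:
// signalling the block as skip (coefficients dropped, distortion becomes
// sse) wins whenever its rd cost is no worse than coding the coefficients
// plus the non-skip flag. ex_rdcost() is a stand-in for the encoder's
// RDCOST macro, which applies fixed-point lambda weighting.
#include <stdint.h>

static int64_t ex_rdcost(int rdmult, int rate, int64_t dist) {
  return (int64_t)rdmult * rate + dist;  // simplified lambda * rate + dist
}

static int ex_prefer_skip(int rdmult, int rate_coeffs, int64_t dist,
                          int64_t sse, const int skip_flag_cost[2]) {
  const int64_t rd_noskip =
      ex_rdcost(rdmult, rate_coeffs + skip_flag_cost[0], dist);
  const int64_t rd_skip = ex_rdcost(rdmult, skip_flag_cost[1], sse);
  return rd_noskip >= rd_skip;  // mirrors the comparison in txfm_search()
}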
@@ -8773,18 +9015,30 @@ static INLINE bool enable_wedge_search(MACROBLOCK *const x,
x->edge_strength > cpi->sf.disable_wedge_search_edge_thresh;
}
+static INLINE bool enable_wedge_interinter_search(MACROBLOCK *const x,
+ const AV1_COMP *const cpi) {
+ return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interinter_wedge;
+}
+
+static INLINE bool enable_wedge_interintra_search(MACROBLOCK *const x,
+ const AV1_COMP *const cpi) {
+ return enable_wedge_search(x, cpi) && cpi->oxcf.enable_interintra_wedge &&
+ !cpi->sf.disable_wedge_interintra_search;
+}
+
static int handle_inter_intra_mode(const AV1_COMP *const cpi,
MACROBLOCK *const x, BLOCK_SIZE bsize,
int mi_row, int mi_col, MB_MODE_INFO *mbmi,
HandleInterModeArgs *args,
int64_t ref_best_rd, int *rate_mv,
- int *tmp_rate2, BUFFER_SET *orig_dst) {
+ int *tmp_rate2, const BUFFER_SET *orig_dst) {
const AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *xd = &x->e_mbd;
INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
- int64_t rd, best_interintra_rd = INT64_MAX;
+ int64_t rd = INT64_MAX;
+ int64_t best_interintra_rd = INT64_MAX;
int rmode, rate_sum;
int64_t dist_sum;
int tmp_rate_mv = 0;
@@ -8803,60 +9057,118 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi,
mbmi->ref_frame[1] = NONE_FRAME;
xd->plane[0].dst.buf = tmp_buf;
xd->plane[0].dst.stride = bw;
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
restore_dst_buf(xd, *orig_dst, num_planes);
mbmi->ref_frame[1] = INTRA_FRAME;
- mbmi->use_wedge_interintra = 0;
best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]];
- int j = 0;
- if (cpi->sf.reuse_inter_intra_mode == 0 ||
- best_interintra_mode == INTERINTRA_MODES) {
- for (j = 0; j < INTERINTRA_MODES; ++j) {
- mbmi->interintra_mode = (INTERINTRA_MODE)j;
- rmode = interintra_mode_cost[mbmi->interintra_mode];
+
+ if (cpi->oxcf.enable_smooth_interintra &&
+ !cpi->sf.disable_smooth_interintra) {
+ mbmi->use_wedge_interintra = 0;
+ int j = 0;
+ if (cpi->sf.reuse_inter_intra_mode == 0 ||
+ best_interintra_mode == INTERINTRA_MODES) {
+ for (j = 0; j < INTERINTRA_MODES; ++j) {
+ if ((!cpi->oxcf.enable_smooth_intra || cpi->sf.disable_smooth_intra) &&
+ (INTERINTRA_MODE)j == II_SMOOTH_PRED)
+ continue;
+ mbmi->interintra_mode = (INTERINTRA_MODE)j;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
+ if (rd < best_interintra_rd) {
+ best_interintra_rd = rd;
+ best_interintra_mode = mbmi->interintra_mode;
+ }
+ }
+ args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
+ }
+ assert(IMPLIES(!cpi->oxcf.enable_smooth_interintra ||
+ cpi->sf.disable_smooth_interintra,
+ best_interintra_mode != II_SMOOTH_PRED));
+ rmode = interintra_mode_cost[best_interintra_mode];
+ if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) {
+ mbmi->interintra_mode = best_interintra_mode;
av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
intrapred, bw);
av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
- model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
- cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
- rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
- if (rd < best_interintra_rd) {
- best_interintra_rd = rd;
- best_interintra_mode = mbmi->interintra_mode;
- }
- }
- args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
- }
- if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) {
- mbmi->interintra_mode = best_interintra_mode;
- rmode = interintra_mode_cost[mbmi->interintra_mode];
- av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
- intrapred, bw);
- av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
- }
- rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
- if (rd != INT64_MAX)
- rd = RDCOST(x->rdmult, *rate_mv + rmode + rate_sum + rwedge, dist_sum);
- best_interintra_rd = rd;
- if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) {
- return -1;
+ }
+
+ RD_STATS rd_stats;
+ rd = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &rd_stats);
+ if (rd != INT64_MAX) {
+ rd = RDCOST(x->rdmult, *rate_mv + rmode + rd_stats.rate + rwedge,
+ rd_stats.dist);
+ }
+ best_interintra_rd = rd;
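+ // Prune early if 9/16 (~0.56) of this rd already exceeds ref_best_rd.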
+ if (ref_best_rd < INT64_MAX &&
+ ((best_interintra_rd >> 4) * 9) > ref_best_rd) {
+ return -1;
+ }
}
if (is_wedge_used) {
int64_t best_interintra_rd_nowedge = rd;
int64_t best_interintra_rd_wedge = INT64_MAX;
int_mv tmp_mv;
- if (enable_wedge_search(x, cpi)) {
+ if (enable_wedge_interintra_search(x, cpi)) {
mbmi->use_wedge_interintra = 1;
rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
x->wedge_interintra_cost[bsize][1];
- best_interintra_rd_wedge =
- pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ if (!cpi->oxcf.enable_smooth_interintra ||
+ cpi->sf.disable_smooth_interintra) {
+ if (best_interintra_mode == INTERINTRA_MODES) {
+ mbmi->interintra_mode = II_SMOOTH_PRED;
+ best_interintra_mode = II_SMOOTH_PRED;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ best_interintra_rd_wedge =
+ pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+ int j = 0;
+ for (j = 0; j < INTERINTRA_MODES; ++j) {
+ mbmi->interintra_mode = (INTERINTRA_MODE)j;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0,
+ orig_dst, intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
+ if (rd < best_interintra_rd) {
+ best_interintra_rd_wedge = rd;
+ best_interintra_mode = mbmi->interintra_mode;
+ }
+ }
+ args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
+ mbmi->interintra_mode = best_interintra_mode;
+
+ if (best_interintra_mode != II_SMOOTH_PRED) {
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0,
+ orig_dst, intrapred, bw);
+ }
+ } else {
+ mbmi->interintra_mode = best_interintra_mode;
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ best_interintra_rd_wedge =
+ pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ }
+ } else {
+ best_interintra_rd_wedge =
+ pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+ }
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
best_interintra_rd_wedge +=
RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0);
rd = INT64_MAX;
@@ -8871,8 +9183,8 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi,
0);
if (mbmi->mv[0].as_int != tmp_mv.as_int) {
mbmi->mv[0].as_int = tmp_mv.as_int;
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
- bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
&tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
@@ -8886,12 +9198,17 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi,
av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
}
// Evaluate closer to true rd
- rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
- if (rd != INT64_MAX)
- rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
- dist_sum);
+ RD_STATS rd_stats;
+ rd = estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &rd_stats);
+ if (rd != INT64_MAX) {
+ rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rd_stats.rate,
+ rd_stats.dist);
+ }
best_interintra_rd_wedge = rd;
+ if ((!cpi->oxcf.enable_smooth_interintra ||
+ cpi->sf.disable_smooth_interintra) &&
+ best_interintra_rd_wedge == INT64_MAX)
+ return -1;
if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
mbmi->use_wedge_interintra = 1;
mbmi->mv[0].as_int = tmp_mv.as_int;
@@ -8900,33 +9217,133 @@ static int handle_inter_intra_mode(const AV1_COMP *const cpi,
} else {
mbmi->use_wedge_interintra = 0;
mbmi->mv[0].as_int = mv0.as_int;
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
}
} else {
+ if (!cpi->oxcf.enable_smooth_interintra ||
+ cpi->sf.disable_smooth_interintra)
+ return -1;
mbmi->use_wedge_interintra = 0;
}
- } // if (is_interintra_wedge_used(bsize))
+ } else {
+ if (best_interintra_rd == INT64_MAX) return -1;
+ }
if (num_planes > 1) {
- av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_U, num_planes - 1);
+ }
+ return 0;
+}
+
+// If the number of valid neighbours is 1:
+// 1) ROTZOOM parameters can be obtained reliably (2 parameters from
+// one neighbouring MV).
+// 2) For the IDENTITY/TRANSLATION cases, warp can perform better due to
+// a different interpolation filter being used. However, the resulting
+// quality gains may not be significant.
+// Warp evaluation is skipped for the above 2 cases.
+
+static int check_if_optimal_warp(const AV1_COMP *cpi,
+ WarpedMotionParams *wm_params,
+ int num_proj_ref) {
+ int is_valid_warp = 1;
+ if (cpi->sf.prune_warp_using_wmtype) {
+ TransformationType wmtype = get_wmtype(wm_params);
+ if (num_proj_ref == 1) {
+ if (wmtype != ROTZOOM) is_valid_warp = 0;
+ } else {
+ if (wmtype < ROTZOOM) is_valid_warp = 0;
+ }
+ }
+ return is_valid_warp;
+}
+
+struct obmc_check_mv_field_ctxt {
+ MB_MODE_INFO *current_mi;
+ int mv_field_check_result;
+};
+
+static INLINE void obmc_check_identical_mv(MACROBLOCKD *xd, int rel_mi_col,
+ uint8_t nb_mi_width,
+ MB_MODE_INFO *nb_mi, void *fun_ctxt,
+ const int num_planes) {
+ (void)xd;
+ (void)rel_mi_col;
+ (void)nb_mi_width;
+ (void)num_planes;
+ struct obmc_check_mv_field_ctxt *ctxt =
+ (struct obmc_check_mv_field_ctxt *)fun_ctxt;
+ const MB_MODE_INFO *current_mi = ctxt->current_mi;
+
+ if (ctxt->mv_field_check_result == 0) return;
+
+ if (nb_mi->ref_frame[0] != current_mi->ref_frame[0] ||
+ nb_mi->mv[0].as_int != current_mi->mv[0].as_int ||
+ nb_mi->interp_filters != current_mi->interp_filters) {
+ ctxt->mv_field_check_result = 0;
+ }
+}
+
+// Check if the neighbors' motions used by obmc have the same parameters as
+// the current block. If all the parameters are identical, obmc will produce
+// the same prediction as regular bmc, so the overlapping operations can be
+// skipped to reduce complexity. The parameters checked include the
+// reference frame, motion vector, and interpolation filter.
+int check_identical_obmc_mv_field(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col) {
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ struct obmc_check_mv_field_ctxt mv_field_check_ctxt = { xd->mi[0], 1 };
+
+ foreach_overlappable_nb_above(cm, xd, mi_col,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ obmc_check_identical_mv, &mv_field_check_ctxt);
+ foreach_overlappable_nb_left(cm, xd, mi_row,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ obmc_check_identical_mv, &mv_field_check_ctxt);
+
+ return mv_field_check_ctxt.mv_field_check_result;
+}
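// A standalone sketch (hypothetical names) of the visitor pattern used
// above: foreach_overlappable_nb_above/left invoke the supplied callback
// once per overlappable neighbour, threading caller state through the
// void *fun_ctxt pointer; the callback latches a failure flag instead of
// returning a value.
typedef struct {
  int expected_mv;    // stand-in for the current block's parameters
  int all_identical;  // latched to 0 on the first mismatch
} ExMvCtxt;

static void ex_visit_nb(int nb_mv, void *fun_ctxt) {
  ExMvCtxt *ctxt = (ExMvCtxt *)fun_ctxt;
  if (!ctxt->all_identical) return;  // already falsified; cheap early out
  if (nb_mv != ctxt->expected_mv) ctxt->all_identical = 0;
}

static int ex_all_nb_mvs_identical(const int *nb_mvs, int n, int cur_mv) {
  ExMvCtxt ctxt = { cur_mv, 1 };
  for (int i = 0; i < n; ++i) ex_visit_nb(nb_mvs[i], &ctxt);
  return ctxt.all_identical;
}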
+
+static int skip_interintra_based_on_first_pass_stats(const AV1_COMP *const cpi,
+ MACROBLOCK *const x,
+ BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ if (cpi->two_pass_partition_search &&
+ cpi->sf.use_first_partition_pass_interintra_stats &&
+ !x->cb_partition_scan) {
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ // Search the stats table to see if the interintra motion mode was used in
+ // the first pass of partition search.
+ for (int row = mi_row; row < mi_row + mi_height;
+ row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+ for (int col = mi_col; col < mi_col + mi_width;
+ col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+ const int index = av1_first_partition_pass_stats_index(row, col);
+ const FIRST_PARTITION_PASS_STATS *const stats =
+ &x->first_partition_pass_stats[index];
+ if (stats->interintra_motion_mode_count[mbmi->ref_frame[0]]) {
+ return 0;
+ }
+ }
+ }
+ return 1;
}
return 0;
}
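// A small sketch of the sampled-grid probe above (hypothetical layout,
// assuming the first-pass stats live on a coarse grid with one entry per
// sample region): the block only needs to test every step-th mi position
// it covers.
static int ex_any_stat_set(const int *grid, int grid_stride, int mi_row,
                           int mi_col, int mi_h, int mi_w, int step) {
  for (int r = mi_row; r < mi_row + mi_h; r += step)
    for (int c = mi_col; c < mi_col + mi_w; c += step)
      if (grid[(r / step) * grid_stride + (c / step)]) return 1;
  return 0;
}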
// TODO(afergs): Refactor the MBMI references in here - there's four
// TODO(afergs): Refactor optional args - add them to a struct or remove
-static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
- BLOCK_SIZE bsize, RD_STATS *rd_stats,
- RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
- int *disable_skip, int mi_row, int mi_col,
- HandleInterModeArgs *const args,
- int64_t ref_best_rd, const int *refs,
- int *rate_mv, BUFFER_SET *orig_dst
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- ,
- TileDataEnc *tile_data, int64_t *best_est_rd,
- int do_tx_search, InterModesInfo *inter_modes_info
-#endif
-) {
+static int64_t motion_mode_rd(
+ const AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col,
+ HandleInterModeArgs *const args, int64_t ref_best_rd, const int *refs,
+ int *rate_mv, const BUFFER_SET *orig_dst, int64_t *best_est_rd,
+ int do_tx_search, InterModesInfo *inter_modes_info) {
const AV1_COMMON *const cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *xd = &x->e_mbd;
@@ -8936,16 +9353,17 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
const int rate2_nocoeff = rd_stats->rate;
int best_xskip = 0, best_disable_skip = 0;
RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
- MB_MODE_INFO base_mbmi, best_mbmi;
uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
const int rate_mv0 = *rate_mv;
-
- int interintra_allowed = cm->seq_params.enable_interintra_compound &&
- is_interintra_allowed(mbmi) && mbmi->compound_idx;
+ int skip_interintra_mode = 0;
+ const int interintra_allowed = cm->seq_params.enable_interintra_compound &&
+ is_interintra_allowed(mbmi) &&
+ mbmi->compound_idx;
int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
assert(mbmi->ref_frame[1] != INTRA_FRAME);
const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+ (void)tile_data;
av1_invalid_rd_stats(&best_rd_stats);
aom_clear_system_state();
mbmi->num_proj_ref = 1; // assume num_proj_ref >=1
@@ -8957,21 +9375,22 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
if (last_motion_mode_allowed == WARPED_CAUSAL) {
mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
}
- int total_samples = mbmi->num_proj_ref;
+ const int total_samples = mbmi->num_proj_ref;
if (total_samples == 0) {
last_motion_mode_allowed = OBMC_CAUSAL;
}
- base_mbmi = *mbmi;
- SimpleRDState *simple_states = &args->simple_rd_state[mbmi->ref_mv_idx];
+ const MB_MODE_INFO base_mbmi = *mbmi;
+ MB_MODE_INFO best_mbmi;
+ SimpleRDState *const simple_states = &args->simple_rd_state[mbmi->ref_mv_idx];
const int switchable_rate =
av1_is_interp_needed(xd) ? av1_get_switchable_rate(cm, x, xd) : 0;
int64_t best_rd = INT64_MAX;
int best_rate_mv = rate_mv0;
- int identical_obmc_mv_field_detected =
+ const int identical_obmc_mv_field_detected =
(cpi->sf.skip_obmc_in_uniform_mv_field ||
cpi->sf.skip_wm_in_uniform_mv_field)
- ? av1_check_identical_obmc_mv_field(cm, xd, mi_row, mi_col)
+ ? check_identical_obmc_mv_field(cm, xd, mi_row, mi_col)
: 0;
for (int mode_index = (int)SIMPLE_TRANSLATION;
mode_index <= (int)last_motion_mode_allowed + interintra_allowed;
@@ -8980,10 +9399,8 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
if (cpi->sf.prune_single_motion_modes_by_simple_trans &&
args->single_ref_first_pass && mode_index)
break;
- int64_t tmp_rd = INT64_MAX;
int tmp_rate2 = rate2_nocoeff;
- int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
- int skip_txfm_sb = 0;
+ const int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;
int tmp_rate_mv = rate_mv0;
*mbmi = base_mbmi;
@@ -8994,6 +9411,9 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
assert(mbmi->ref_frame[1] != INTRA_FRAME);
}
+ if (cpi->oxcf.enable_obmc == 0 && mbmi->motion_mode == OBMC_CAUSAL)
+ continue;
+
if (identical_obmc_mv_field_detected) {
if (cpi->sf.skip_obmc_in_uniform_mv_field &&
mbmi->motion_mode == OBMC_CAUSAL)
@@ -9007,28 +9427,29 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
// SIMPLE_TRANSLATION mode: no need to recalculate.
// The prediction is calculated before motion_mode_rd() is called in
// handle_inter_mode()
- if (cpi->sf.prune_single_motion_modes_by_simple_trans &&
- args->single_ref_first_pass == 0 && !is_comp_pred) {
- if (simple_states->early_skipped) {
- assert(simple_states->rd_stats.rdcost == INT64_MAX);
- return INT64_MAX;
- }
- if (simple_states->rd_stats.rdcost != INT64_MAX) {
- best_rd = simple_states->rd_stats.rdcost;
- best_rd_stats = simple_states->rd_stats;
- best_rd_stats_y = simple_states->rd_stats_y;
- best_rd_stats_uv = simple_states->rd_stats_uv;
- memcpy(best_blk_skip, simple_states->blk_skip,
- sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
- best_xskip = simple_states->skip;
- best_disable_skip = simple_states->disable_skip;
- best_mbmi = *mbmi;
+ if (cpi->sf.prune_single_motion_modes_by_simple_trans && !is_comp_pred) {
+ if (args->single_ref_first_pass == 0) {
+ if (simple_states->early_skipped) {
+ assert(simple_states->rd_stats.rdcost == INT64_MAX);
+ return INT64_MAX;
+ }
+ if (simple_states->rd_stats.rdcost != INT64_MAX) {
+ best_rd = simple_states->rd_stats.rdcost;
+ best_rd_stats = simple_states->rd_stats;
+ best_rd_stats_y = simple_states->rd_stats_y;
+ best_rd_stats_uv = simple_states->rd_stats_uv;
+ memcpy(best_blk_skip, simple_states->blk_skip,
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+ best_xskip = simple_states->skip;
+ best_disable_skip = simple_states->disable_skip;
+ best_mbmi = *mbmi;
+ }
+ continue;
}
- continue;
+ simple_states->early_skipped = 0;
}
- simple_states->early_skipped = 0;
} else if (mbmi->motion_mode == OBMC_CAUSAL) {
- uint32_t cur_mv = mbmi->mv[0].as_int;
+ const uint32_t cur_mv = mbmi->mv[0].as_int;
assert(!is_comp_pred);
if (have_newmv_in_inter_mode(this_mode)) {
single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv);
@@ -9041,7 +9462,8 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
}
if (mbmi->mv[0].as_int != cur_mv) {
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ 0, av1_num_planes(cm) - 1);
}
av1_build_obmc_inter_prediction(
cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
@@ -9069,7 +9491,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
if (have_newmv_in_inter_mode(this_mode)) {
const int_mv mv0 = mbmi->mv[0];
const WarpedMotionParams wm_params0 = mbmi->wm_params;
- int num_proj_ref0 = mbmi->num_proj_ref;
+ const int num_proj_ref0 = mbmi->num_proj_ref;
+
+ if (cpi->sf.prune_warp_using_wmtype) {
+ TransformationType wmtype = get_wmtype(&mbmi->wm_params);
+ if (wmtype < ROTZOOM) continue;
+ }
// Refine MV in a small range.
av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0,
@@ -9098,24 +9525,27 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
mbmi->wm_params = wm_params0;
mbmi->num_proj_ref = num_proj_ref0;
}
+ } else {
+ if (!check_if_optimal_warp(cpi, &mbmi->wm_params, mbmi->num_proj_ref))
+ continue;
}
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
} else {
continue;
}
} else if (is_interintra_mode) {
+ skip_interintra_mode = skip_interintra_based_on_first_pass_stats(
+ cpi, x, bsize, mi_row, mi_col);
+ if (skip_interintra_mode) continue;
const int ret = handle_inter_intra_mode(
cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv,
&tmp_rate2, orig_dst);
if (ret < 0) continue;
}
- if (!cpi->common.all_lossless)
- check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
-
x->skip = 0;
-
rd_stats->dist = 0;
rd_stats->sse = 0;
rd_stats->skip = 1;
@@ -9146,85 +9576,93 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
}
}
- if (!skip_txfm_sb) {
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- int64_t est_rd = 0;
- int est_skip = 0;
- if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
- cm->tile_rows == 1) {
- InterModeRdModel *md = &tile_data->inter_mode_rd_models[mbmi->sb_type];
- if (md->ready) {
- const int64_t curr_sse = get_sse(cpi, x);
- est_rd = get_est_rd(tile_data, mbmi->sb_type, x->rdmult, curr_sse,
- rd_stats->rate);
- est_skip = est_rd * 0.8 > *best_est_rd;
- if (est_skip) {
- mbmi->ref_frame[1] = ref_frame_1;
- continue;
- } else {
- if (est_rd < *best_est_rd) {
- *best_est_rd = est_rd;
- }
- }
- }
+ if (cpi->sf.model_based_motion_mode_rd_breakout && do_tx_search) {
+ int model_rate;
+ int64_t model_dist;
+ model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD](
+ cpi, mbmi->sb_type, x, xd, 0, num_planes - 1, mi_row, mi_col,
+ &model_rate, &model_dist, NULL, NULL, NULL, NULL, NULL);
+ const int64_t est_rd =
+ RDCOST(x->rdmult, rd_stats->rate + model_rate, model_dist);
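+ // Break out when 6/8 (75%) of the modeled rd already exceeds ref_best_rd.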
+ if ((est_rd >> 3) * 6 > ref_best_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ continue;
}
-#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
}
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
if (!do_tx_search) {
- const int64_t curr_sse = get_sse(cpi, x);
+ int64_t curr_sse = -1;
int est_residue_cost = 0;
int64_t est_dist = 0;
- const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
- &est_residue_cost, &est_dist);
- (void)has_est_rd;
- assert(has_est_rd);
+ int64_t est_rd = 0;
+ if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+ curr_sse = get_sse(cpi, x);
+ const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
+ &est_residue_cost, &est_dist);
+ (void)has_est_rd;
+ assert(has_est_rd);
+ } else if (cpi->sf.inter_mode_rd_model_estimation == 2 ||
+ cpi->sf.use_nonrd_pick_mode) {
+ model_rd_sb_fn[MODELRD_TYPE_MOTION_MODE_RD](
+ cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col,
+ &est_residue_cost, &est_dist, NULL, &curr_sse, NULL, NULL, NULL);
+ }
+ est_rd = RDCOST(x->rdmult, rd_stats->rate + est_residue_cost, est_dist);
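+ // Skip this mode when the estimate exceeds 1.25x the best estimated rd.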
+ if (est_rd * 0.8 > *best_est_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ continue;
+ }
const int mode_rate = rd_stats->rate;
rd_stats->rate += est_residue_cost;
rd_stats->dist = est_dist;
- rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ rd_stats->rdcost = est_rd;
+ *best_est_rd = AOMMIN(*best_est_rd, rd_stats->rdcost);
if (cm->current_frame.reference_mode == SINGLE_REFERENCE) {
if (!is_comp_pred) {
+ assert(curr_sse >= 0);
inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
- rd_stats->rdcost, mbmi);
+ rd_stats->rdcost, false, NULL, rd_stats,
+ rd_stats_y, rd_stats_uv, mbmi);
}
} else {
+ assert(curr_sse >= 0);
inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
- rd_stats->rdcost, mbmi);
+ rd_stats->rdcost, false, NULL, rd_stats,
+ rd_stats_y, rd_stats_uv, mbmi);
}
} else {
-#endif
- if (!txfm_search(cpi, x, bsize, mi_row, mi_col, rd_stats, rd_stats_y,
- rd_stats_uv, rd_stats->rate, ref_best_rd)) {
+ if (!txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, rd_stats,
+ rd_stats_y, rd_stats_uv, rd_stats->rate, ref_best_rd)) {
if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
- simple_states->early_skipped = 1;
+ if (cpi->sf.prune_single_motion_modes_by_simple_trans &&
+ !is_comp_pred) {
+ simple_states->early_skipped = 1;
+ }
return INT64_MAX;
}
continue;
}
- if (!skip_txfm_sb) {
- const int64_t curr_rd =
- RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
- if (curr_rd < ref_best_rd) {
- ref_best_rd = curr_rd;
- }
- *disable_skip = 0;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- if (cpi->sf.inter_mode_rd_model_estimation) {
- const int skip_ctx = av1_get_skip_context(xd);
- inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
- rd_stats->dist,
- rd_stats_y->rate + rd_stats_uv->rate +
- x->skip_cost[skip_ctx][mbmi->skip]);
- }
-#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
- } else {
- *disable_skip = 1;
+
+ const int64_t curr_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ ref_best_rd = AOMMIN(ref_best_rd, curr_rd);
+ *disable_skip = 0;
+ if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+ const int skip_ctx = av1_get_skip_context(xd);
+ inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
+ rd_stats->dist,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ x->skip_cost[skip_ctx][mbmi->skip]);
+ }
+
+ // do_tx_search == 2 means to do the tx search and also update the
+ // inter_modes_info structure, since some modes will be conditionally
+ // TX searched.
+ if (do_tx_search == 2) {
+ rd_stats->rdcost = curr_rd;
+ inter_modes_info_push(inter_modes_info, rd_stats->rate, rd_stats->sse,
+ curr_rd, true, x->blk_skip, rd_stats, rd_stats_y,
+ rd_stats_uv, mbmi);
}
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
}
-#endif
if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
if (is_nontrans_global_motion(xd, xd->mi[0])) {
@@ -9233,7 +9671,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
}
}
- tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ const int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
if (mode_index == 0) {
args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
if (!is_comp_pred) {
@@ -9247,7 +9685,7 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
simple_states->disable_skip = *disable_skip;
}
}
- if ((mode_index == 0) || (tmp_rd < best_rd)) {
+ if (mode_index == 0 || tmp_rd < best_rd) {
best_mbmi = *mbmi;
best_rd = tmp_rd;
best_rd_stats = *rd_stats;
@@ -9283,11 +9721,12 @@ static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row,
- int mi_col, BUFFER_SET *const orig_dst) {
+ int mi_col, const BUFFER_SET *const orig_dst) {
const AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &x->e_mbd;
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize, 0,
+ av1_num_planes(cm) - 1);
int64_t total_sse = 0;
for (int plane = 0; plane < num_planes; ++plane) {
@@ -9299,44 +9738,8 @@ static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
const int bh = block_size_high[plane_bsize];
av1_subtract_plane(x, bsize, plane);
- int64_t sse;
-#if CONFIG_ONE_PASS_SVM
- if (plane == AOM_PLANE_Y && bsize >= BLOCK_8X8 && bw == bh) {
- rd_stats->sse_0 = aom_sum_squares_2d_i16(p->src_diff, bw, bw / 2, bh / 2)
- << 4;
- rd_stats->sse_1 =
- aom_sum_squares_2d_i16(p->src_diff + bw / 2, bw, bw / 2, bh / 2) << 4;
- rd_stats->sse_2 =
- aom_sum_squares_2d_i16(p->src_diff + bh / 2 * bw, bw, bw / 2, bh / 2)
- << 4;
- rd_stats->sse_3 =
- aom_sum_squares_2d_i16(p->src_diff + bh / 2 * bw + bw / 2, bw, bw / 2,
- bh / 2)
- << 4;
-
- sse =
- rd_stats->sse_0 + rd_stats->sse_1 + rd_stats->sse_2 + rd_stats->sse_3;
- total_sse += sse;
-
- const int scaling_factor = MAX_MIB_SIZE * MAX_MIB_SIZE;
- rd_stats->sse = sse;
- rd_stats->sse_0 = rd_stats->sse_0 * scaling_factor;
- rd_stats->sse_1 = rd_stats->sse_1 * scaling_factor;
- rd_stats->sse_2 = rd_stats->sse_2 * scaling_factor;
- rd_stats->sse_3 = rd_stats->sse_3 * scaling_factor;
- rd_stats->y_sse = sse;
- // TODO(chiyotsai@google.com): Don't manually set the flags
- av1_reg_stat_skipmode_update(rd_stats, x->rdmult);
- } else {
- sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh);
- sse = sse << 4;
- total_sse += sse;
- }
-#else
- sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh);
- sse = sse << 4;
+ int64_t sse = aom_sum_squares_2d_i16(p->src_diff, bw, bw, bh) << 4;
total_sse += sse;
-#endif
}
const int skip_mode_ctx = av1_get_skip_mode_context(xd);
rd_stats->dist = rd_stats->sse = total_sse;
@@ -9456,25 +9859,20 @@ typedef struct {
uint8_t *tmp_best_mask_buf; // backup of the best segmentation mask
} CompoundTypeRdBuffers;
-static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize, int mi_col, int mi_row,
- int_mv *cur_mv, int masked_compound_used,
- BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst,
- CompoundTypeRdBuffers *buffers, int *rate_mv,
- int64_t *rd, RD_STATS *rd_stats,
- int64_t ref_best_rd) {
+static int compound_type_rd(
+ const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_col,
+ int mi_row, int_mv *cur_mv, int mode_search_mask, int masked_compound_used,
+ const BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst,
+ CompoundTypeRdBuffers *buffers, int *rate_mv, int64_t *rd,
+ RD_STATS *rd_stats, int64_t ref_best_rd, int *is_luma_interp_done) {
const AV1_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = xd->mi[0];
const PREDICTION_MODE this_mode = mbmi->mode;
const int bw = block_size_wide[bsize];
- int rate_sum, rs2;
- int64_t dist_sum;
-
+ int rs2;
int_mv best_mv[2];
int best_tmp_rate_mv = *rate_mv;
- int tmp_skip_txfm_sb;
- int64_t tmp_skip_sse_sb;
INTERINTER_COMPOUND_DATA best_compound_data;
best_compound_data.type = COMPOUND_AVERAGE;
uint8_t *preds0[1] = { buffers->pred0 };
@@ -9486,56 +9884,214 @@ static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
COMPOUND_TYPE cur_type;
int best_compmode_interinter_cost = 0;
int calc_pred_masked_compound = 1;
+ int64_t comp_dist[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX };
+ int32_t comp_rate[COMPOUND_TYPES] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+ int64_t comp_model_rd[COMPOUND_TYPES] = { INT64_MAX, INT64_MAX, INT64_MAX,
+ INT64_MAX };
+ const int match_found =
+ find_comp_rd_in_stats(cpi, x, mbmi, comp_rate, comp_dist, comp_model_rd);
best_mv[0].as_int = cur_mv[0].as_int;
best_mv[1].as_int = cur_mv[1].as_int;
*rd = INT64_MAX;
+ int rate_sum, tmp_skip_txfm_sb;
+ int64_t dist_sum, tmp_skip_sse_sb;
+ int64_t comp_best_model_rd = INT64_MAX;
+ // Special handling if both compound_average and compound_distwtd
+ // are to be searched. In this case, first compare the two modes with a
+ // cheap modeled rd estimate, and then call estimate_yrd_for_sb() only
+ // for the better of the two.
+ const int try_average_comp = (mode_search_mask & (1 << COMPOUND_AVERAGE));
+ const int try_distwtd_comp =
+ ((mode_search_mask & (1 << COMPOUND_DISTWTD)) &&
+ cm->seq_params.order_hint_info.enable_dist_wtd_comp == 1 &&
+ cpi->sf.use_dist_wtd_comp_flag != DIST_WTD_COMP_DISABLED);
+ const int try_average_and_distwtd_comp =
+ try_average_comp && try_distwtd_comp &&
+ comp_rate[COMPOUND_AVERAGE] == INT_MAX &&
+ comp_rate[COMPOUND_DISTWTD] == INT_MAX;
for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
- if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
+ if (((1 << cur_type) & mode_search_mask) == 0) {
+ if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+ continue;
+ }
if (!is_interinter_compound_used(cur_type, bsize)) continue;
+ if (cur_type >= COMPOUND_WEDGE && !masked_compound_used) break;
+ if (cur_type == COMPOUND_DISTWTD && !try_distwtd_comp) continue;
+ if (cur_type == COMPOUND_AVERAGE && try_average_and_distwtd_comp) continue;
+
+ int64_t comp_model_rd_cur = INT64_MAX;
tmp_rate_mv = *rate_mv;
int64_t best_rd_cur = INT64_MAX;
- mbmi->interinter_comp.type = cur_type;
- int masked_type_cost = 0;
-
const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
const int comp_index_ctx = get_comp_index_context(cm, xd);
- mbmi->compound_idx = 1;
- if (cur_type == COMPOUND_AVERAGE) {
+
+ if (cur_type == COMPOUND_DISTWTD && try_average_and_distwtd_comp) {
+ int est_rate[2];
+ int64_t est_dist[2], est_rd[2];
+
+ int masked_type_cost[2] = { 0, 0 };
mbmi->comp_group_idx = 0;
+
+ // First find the modeled rd cost for COMPOUND_AVERAGE
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->compound_idx = 1;
if (masked_compound_used) {
- masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0];
- }
- masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
- rs2 = masked_type_cost;
- const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
- if (mode_rd < ref_best_rd) {
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
- int64_t est_rd =
- estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
- &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
- if (est_rd != INT64_MAX)
- best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
- }
- // use spare buffer for following compound type try
+ masked_type_cost[COMPOUND_AVERAGE] +=
+ x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
+ }
+ masked_type_cost[COMPOUND_AVERAGE] +=
+ x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ *is_luma_interp_done = 1;
+ model_rd_sb_fn[MODELRD_CURVFIT](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &est_rate[COMPOUND_AVERAGE],
+ &est_dist[COMPOUND_AVERAGE], NULL, NULL, NULL, NULL, NULL);
+ est_rate[COMPOUND_AVERAGE] += masked_type_cost[COMPOUND_AVERAGE];
+ est_rd[COMPOUND_AVERAGE] =
+ RDCOST(x->rdmult, est_rate[COMPOUND_AVERAGE] + *rate_mv,
+ est_dist[COMPOUND_AVERAGE]);
restore_dst_buf(xd, *tmp_dst, 1);
+
+ // Next find the modeled rd cost for COMPOUND_DISTWTD
+ mbmi->interinter_comp.type = COMPOUND_DISTWTD;
+ mbmi->compound_idx = 0;
+ if (masked_compound_used) {
+ masked_type_cost[COMPOUND_DISTWTD] +=
+ x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
+ }
+ masked_type_cost[COMPOUND_DISTWTD] +=
+ x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ AOM_PLANE_Y, AOM_PLANE_Y);
+ model_rd_sb_fn[MODELRD_CURVFIT](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &est_rate[COMPOUND_DISTWTD],
+ &est_dist[COMPOUND_DISTWTD], NULL, NULL, NULL, NULL, NULL);
+ est_rate[COMPOUND_DISTWTD] += masked_type_cost[COMPOUND_DISTWTD];
+ est_rd[COMPOUND_DISTWTD] =
+ RDCOST(x->rdmult, est_rate[COMPOUND_DISTWTD] + *rate_mv,
+ est_dist[COMPOUND_DISTWTD]);
+
+ // Choose the better of the two based on modeled cost and call
+ // estimate_yrd_for_sb() for that one.
+ if (est_rd[COMPOUND_AVERAGE] <= est_rd[COMPOUND_DISTWTD]) {
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->compound_idx = 1;
+ restore_dst_buf(xd, *orig_dst, 1);
+ RD_STATS est_rd_stats;
+ const int64_t est_rd_ =
+ estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+ rs2 = masked_type_cost[COMPOUND_AVERAGE];
+ if (est_rd_ != INT64_MAX) {
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ restore_dst_buf(xd, *tmp_dst, 1);
+ comp_rate[COMPOUND_AVERAGE] = est_rd_stats.rate;
+ comp_dist[COMPOUND_AVERAGE] = est_rd_stats.dist;
+ comp_model_rd[COMPOUND_AVERAGE] = est_rd[COMPOUND_AVERAGE];
+ comp_model_rd_cur = est_rd[COMPOUND_AVERAGE];
+ }
+ restore_dst_buf(xd, *tmp_dst, 1);
+ } else {
+ RD_STATS est_rd_stats;
+ const int64_t est_rd_ =
+ estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+ rs2 = masked_type_cost[COMPOUND_DISTWTD];
+ if (est_rd_ != INT64_MAX) {
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ comp_rate[COMPOUND_DISTWTD] = est_rd_stats.rate;
+ comp_dist[COMPOUND_DISTWTD] = est_rd_stats.dist;
+ comp_model_rd[COMPOUND_DISTWTD] = est_rd[COMPOUND_DISTWTD];
+ comp_model_rd_cur = est_rd[COMPOUND_DISTWTD];
+ }
+ }
} else {
- mbmi->comp_group_idx = 1;
- masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
- masked_type_cost += x->compound_type_cost[bsize][cur_type - 1];
- rs2 = masked_type_cost;
- if (enable_wedge_search(x, cpi) && *rd / 3 < ref_best_rd) {
- best_rd_cur = build_and_cost_compound_type(
- cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
- &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10,
- strides, mi_row, mi_col, rd_stats->rate, ref_best_rd,
- &calc_pred_masked_compound);
+ mbmi->interinter_comp.type = cur_type;
+ int masked_type_cost = 0;
+ if (cur_type == COMPOUND_AVERAGE || cur_type == COMPOUND_DISTWTD) {
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = (cur_type == COMPOUND_AVERAGE);
+ if (masked_compound_used) {
+ masked_type_cost +=
+ x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
+ }
+ masked_type_cost +=
+ x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
+ rs2 = masked_type_cost;
+ const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+ if (mode_rd < ref_best_rd) {
+ // Reuse data if a matching record is found
+ if (comp_rate[cur_type] == INT_MAX) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, orig_dst,
+ bsize, AOM_PLANE_Y, AOM_PLANE_Y);
+ if (cur_type == COMPOUND_AVERAGE) *is_luma_interp_done = 1;
+ RD_STATS est_rd_stats;
+ const int64_t est_rd =
+ estimate_yrd_for_sb(cpi, bsize, x, ref_best_rd, &est_rd_stats);
+ if (comp_rate[cur_type] != INT_MAX) {
+ assert(comp_rate[cur_type] == est_rd_stats.rate);
+ assert(comp_dist[cur_type] == est_rd_stats.dist);
+ }
+ if (est_rd != INT64_MAX) {
+ best_rd_cur =
+ RDCOST(x->rdmult, rs2 + *rate_mv + est_rd_stats.rate,
+ est_rd_stats.dist);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ comp_model_rd_cur =
+ RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+
+ // Back up the rate and distortion for future reuse
+ comp_rate[cur_type] = est_rd_stats.rate;
+ comp_dist[cur_type] = est_rd_stats.dist;
+ comp_model_rd[cur_type] = comp_model_rd_cur;
+ }
+ } else {
+ // Calculate RD cost based on stored stats
+ assert(comp_dist[cur_type] != INT64_MAX);
+ best_rd_cur =
+ RDCOST(x->rdmult, rs2 + *rate_mv + comp_rate[cur_type],
+ comp_dist[cur_type]);
+ comp_model_rd_cur = comp_model_rd[cur_type];
+ }
+ }
+ // Use the spare buffer for the compound type searches that follow
+ if (cur_type == COMPOUND_AVERAGE) restore_dst_buf(xd, *tmp_dst, 1);
+ } else {
+ mbmi->comp_group_idx = 1;
+ mbmi->compound_idx = 1;
+ masked_type_cost +=
+ x->comp_group_idx_cost[comp_group_idx_ctx][mbmi->comp_group_idx];
+ masked_type_cost +=
+ x->compound_type_cost[bsize][cur_type - COMPOUND_WEDGE];
+ rs2 = masked_type_cost;
+
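+ // Search the masked compound types only while the best RD so far, scaled
+ // by max_comp_type_rd_threshold_mul / max_comp_type_rd_threshold_div,
+ // stays below ref_best_rd.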
+ if (((*rd / cpi->max_comp_type_rd_threshold_div) *
+ cpi->max_comp_type_rd_threshold_mul) < ref_best_rd) {
+ const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+
+ if (!((compound_type == COMPOUND_WEDGE &&
+ !enable_wedge_interinter_search(x, cpi)) ||
+ (compound_type == COMPOUND_DIFFWTD &&
+ !cpi->oxcf.enable_diff_wtd_comp)))
+ best_rd_cur = build_and_cost_compound_type(
+ cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
+ &tmp_rate_mv, preds0, preds1, buffers->residual1,
+ buffers->diff10, strides, mi_row, mi_col, rd_stats->rate,
+ ref_best_rd, &calc_pred_masked_compound, comp_rate, comp_dist,
+ comp_model_rd, comp_best_model_rd, &comp_model_rd_cur);
+ }
}
}
if (best_rd_cur < *rd) {
*rd = best_rd_cur;
+ comp_best_model_rd = comp_model_rd_cur;
best_compound_data = mbmi->interinter_comp;
- if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) {
+ if (masked_compound_used && cur_type >= COMPOUND_WEDGE) {
memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len);
}
best_compmode_interinter_cost = rs2;
@@ -9555,8 +10111,8 @@ static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
mbmi->mv[1].as_int = cur_mv[1].as_int;
}
if (mbmi->interinter_comp.type != best_compound_data.type) {
- mbmi->comp_group_idx =
- (best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1;
+ mbmi->comp_group_idx = (best_compound_data.type < COMPOUND_WEDGE) ? 0 : 1;
+ mbmi->compound_idx = !(best_compound_data.type == COMPOUND_DISTWTD);
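+ // compound_idx is 0 only for COMPOUND_DISTWTD; every other compound type
+ // uses compound_idx 1.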
mbmi->interinter_comp = best_compound_data;
memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len);
}
@@ -9569,6 +10125,9 @@ static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
}
}
restore_dst_buf(xd, *orig_dst, 1);
+ if (!match_found)
+ save_comp_rd_search_stat(x, mbmi, comp_rate, comp_dist, comp_model_rd,
+ cur_mv);
return best_compmode_interinter_cost;
}
@@ -9609,20 +10168,13 @@ typedef struct {
int_mv mv;
} inter_mode_info;
-static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
- BLOCK_SIZE bsize, RD_STATS *rd_stats,
- RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
- int *disable_skip, int mi_row, int mi_col,
- HandleInterModeArgs *args, int64_t ref_best_rd,
- uint8_t *const tmp_buf,
- CompoundTypeRdBuffers *rd_buffers
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- ,
- TileDataEnc *tile_data, int64_t *best_est_rd,
- const int do_tx_search,
- InterModesInfo *inter_modes_info
-#endif
-) {
+static int64_t handle_inter_mode(
+ AV1_COMP *const cpi, TileDataEnc *tile_data, MACROBLOCK *x,
+ BLOCK_SIZE bsize, RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv, int *disable_skip, int mi_row, int mi_col,
+ HandleInterModeArgs *args, int64_t ref_best_rd, uint8_t *const tmp_buf,
+ CompoundTypeRdBuffers *rd_buffers, int64_t *best_est_rd,
+ const int do_tx_search, InterModesInfo *inter_modes_info) {
const AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *xd = &x->e_mbd;
@@ -9642,7 +10194,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
// one for future predictions. In the end, copy from tmp_buf to
// dst if necessary.
struct macroblockd_plane *p = xd->plane;
- BUFFER_SET orig_dst = {
+ const BUFFER_SET orig_dst = {
{ p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
{ p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
};
@@ -9668,11 +10220,20 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
int backup_rate_mv = 0;
inter_mode_info mode_info[MAX_REF_MV_SERCH];
- int comp_idx;
- const int search_jnt_comp = is_comp_pred &
- cm->seq_params.order_hint_info.enable_jnt_comp &
- (mbmi->mode != GLOBAL_GLOBALMV) &
- (cpi->sf.use_jnt_comp_flag != JNT_COMP_DISABLED);
+ int mode_search_mask[2];
+ const int do_two_loop_comp_search =
+ is_comp_pred && cpi->sf.two_loop_comp_search;
+ if (do_two_loop_comp_search) {
+ // TODO(debargha): Change this to try alternate ways of splitting
+ // modes while doing two pass compound_mode search.
+ mode_search_mask[0] = (1 << COMPOUND_AVERAGE);
+ } else {
+ mode_search_mask[0] = (1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
+ (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD);
+ }
+ mode_search_mask[1] = ((1 << COMPOUND_AVERAGE) | (1 << COMPOUND_DISTWTD) |
+ (1 << COMPOUND_WEDGE) | (1 << COMPOUND_DIFFWTD)) -
+ mode_search_mask[0];
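+ // The second pass searches exactly the compound types left out of the
+ // first-pass mask.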
// TODO(jingning): This should be deprecated shortly.
const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
@@ -9729,42 +10290,35 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
}
const RD_STATS backup_rd_stats = *rd_stats;
- // If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
- for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
+
+ for (int comp_loop_idx = 0; comp_loop_idx <= do_two_loop_comp_search;
+ ++comp_loop_idx) {
int rs = 0;
int compmode_interinter_cost = 0;
- mbmi->compound_idx = comp_idx;
- if (is_comp_pred && comp_idx == 0) {
- *rd_stats = backup_rd_stats;
- mbmi->interinter_comp.type = COMPOUND_AVERAGE;
- mbmi->num_proj_ref = 0;
- mbmi->motion_mode = SIMPLE_TRANSLATION;
- mbmi->comp_group_idx = 0;
- const int comp_index_ctx = get_comp_index_context(cm, xd);
- compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0];
- }
+ if (is_comp_pred && comp_loop_idx == 1) *rd_stats = backup_rd_stats;
int_mv cur_mv[2];
if (!build_cur_mv(cur_mv, this_mode, cm, x)) {
continue;
}
if (have_newmv_in_inter_mode(this_mode)) {
- if (comp_idx == 0) {
+ if (comp_loop_idx == 1) {
cur_mv[0] = backup_mv[0];
cur_mv[1] = backup_mv[1];
rate_mv = backup_rate_mv;
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_newmv_time);
+#endif
if (cpi->sf.prune_single_motion_modes_by_simple_trans &&
args->single_ref_first_pass == 0 && !is_comp_pred) {
const int ref0 = mbmi->ref_frame[0];
newmv_ret_val = args->single_newmv_valid[ref_mv_idx][ref0] ? 0 : 1;
cur_mv[0] = args->single_newmv[ref_mv_idx][ref0];
rate_mv = args->single_newmv_rate[ref_mv_idx][ref0];
- } else if (!(search_jnt_comp &&
- (cpi->sf.use_jnt_comp_flag == JNT_COMP_SKIP_MV_SEARCH) &&
- comp_idx == 0)) {
+ } else if (comp_loop_idx == 0) {
newmv_ret_val = handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col,
&rate_mv, args);
@@ -9774,6 +10328,9 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
backup_mv[1] = cur_mv[1];
backup_rate_mv = rate_mv;
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_newmv_time);
+#endif
if (newmv_ret_val != 0) {
continue;
@@ -9817,7 +10374,6 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
best_rd = RDCOST(x->rdmult, best_rd_stats.rate,
best_rd_stats.dist);
if (best_rd < ref_best_rd) ref_best_rd = best_rd;
-
skip = 1;
break;
}
@@ -9869,46 +10425,90 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
continue;
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, compound_type_rd_time);
+#endif
int skip_build_pred = 0;
- if (is_comp_pred && comp_idx) {
- // Find matching interp filter or set to default interp filter
- const int need_search =
- av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
- int match_found = -1;
- const InterpFilter assign_filter = cm->interp_filter;
- if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
- match_found = find_interp_filter_in_stats(x, mbmi);
- }
- if (!need_search || match_found == -1) {
- set_default_interp_filters(mbmi, assign_filter);
- }
+ if (is_comp_pred) {
+ if (mode_search_mask[comp_loop_idx] == (1 << COMPOUND_AVERAGE)) {
+ // Only compound_average
+ mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 1;
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ compmode_interinter_cost +=
+ x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
+ } else if (mode_search_mask[comp_loop_idx] == (1 << COMPOUND_DISTWTD)) {
+ // Only compound_distwtd
+ if (!cm->seq_params.order_hint_info.enable_dist_wtd_comp ||
+ cpi->sf.use_dist_wtd_comp_flag == DIST_WTD_COMP_DISABLED ||
+ (do_two_loop_comp_search && mbmi->mode == GLOBAL_GLOBALMV))
+ continue;
+ mbmi->interinter_comp.type = COMPOUND_DISTWTD;
+ mbmi->num_proj_ref = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->comp_group_idx = 0;
+ mbmi->compound_idx = 0;
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ compmode_interinter_cost +=
+ x->comp_idx_cost[comp_index_ctx][mbmi->compound_idx];
+ } else {
+ // Find matching interp filter or set to default interp filter
+ const int need_search =
+ av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
+ int match_found = -1;
+ const InterpFilter assign_filter = cm->interp_filter;
+ int is_luma_interp_done = 0;
+ if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
+ match_found = find_interp_filter_in_stats(x, mbmi);
+ }
+ if (!need_search || match_found == -1) {
+ set_default_interp_filters(mbmi, assign_filter);
+ }
- int64_t best_rd_compound;
- compmode_interinter_cost = compound_type_rd(
- cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used,
- &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
- rd_stats, ref_best_rd);
- if (ref_best_rd < INT64_MAX &&
- (best_rd_compound >> 3) * 6 > ref_best_rd) {
- restore_dst_buf(xd, orig_dst, num_planes);
- continue;
- }
- // No need to call av1_build_inter_predictors_sby if
- // COMPOUND_AVERAGE is selected because it is the first
- // candidate in compound_type_rd, and the following
- // compound types searching uses tmp_dst buffer
- if (mbmi->interinter_comp.type == COMPOUND_AVERAGE) {
- if (num_planes > 1)
- av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, &orig_dst,
- bsize);
- skip_build_pred = 1;
+ int64_t best_rd_compound;
+ compmode_interinter_cost = compound_type_rd(
+ cpi, x, bsize, mi_col, mi_row, cur_mv,
+ mode_search_mask[comp_loop_idx], masked_compound_used, &orig_dst,
+ &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound, rd_stats,
+ ref_best_rd, &is_luma_interp_done);
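+ // Terminate this mode early when the compound RD estimate exceeds
+ // ref_best_rd by more than a factor of 16/11 (16/13 when the two-loop
+ // compound search is enabled).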
+ if (ref_best_rd < INT64_MAX &&
+ (best_rd_compound >> 4) * (11 + 2 * do_two_loop_comp_search) >
+ ref_best_rd) {
+ restore_dst_buf(xd, orig_dst, num_planes);
+ continue;
+ }
+ // No need to call av1_enc_build_inter_predictor for luma if
+ // COMPOUND_AVERAGE is selected, because it is the first candidate
+ // evaluated in compound_type_rd and the subsequent compound type
+ // searches use the tmp_dst buffer.
+ if (mbmi->interinter_comp.type == COMPOUND_AVERAGE &&
+ is_luma_interp_done) {
+ if (num_planes > 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst,
+ bsize, AOM_PLANE_U, num_planes - 1);
+ }
+ skip_build_pred = 1;
+ }
}
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, compound_type_rd_time);
+#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, interpolation_filter_search_time);
+#endif
ret_val = interpolation_filter_search(
- x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
+ x, cpi, tile_data, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb,
- skip_build_pred, args, ref_best_rd);
+ &skip_build_pred, args, ref_best_rd);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, interpolation_filter_search_time);
+#endif
if (args->modelled_rd != NULL && !is_comp_pred) {
args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
}
@@ -9939,8 +10539,12 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
}
}
rd_stats->rate += compmode_interinter_cost;
+ if (skip_build_pred != 1) {
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, &orig_dst, bsize,
+ 0, av1_num_planes(cm) - 1);
+ }
- if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
+ if (cpi->sf.second_loop_comp_fast_tx_search && comp_loop_idx == 1) {
// TODO(chengchen): this speed feature introduces a big loss.
// Need better estimation of rate distortion.
int dummy_rate;
@@ -9949,7 +10553,7 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
int64_t plane_sse[MAX_MB_PLANE] = { 0 };
int64_t plane_dist[MAX_MB_PLANE] = { 0 };
- model_rd_sb_fn[MODELRD_TYPE_JNT_COMPOUND](
+ model_rd_sb_fn[MODELRD_TYPE_DIST_WTD_COMPOUND](
cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate,
&dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse,
plane_dist);
@@ -9965,15 +10569,15 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
rd_stats_y->dist = plane_dist[0];
rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
} else {
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- ret_val = motion_mode_rd(
- cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip,
- mi_row, mi_col, args, ref_best_rd, refs, &rate_mv, &orig_dst,
- tile_data, best_est_rd, do_tx_search, inter_modes_info);
-#else
- ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y,
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, motion_mode_rd_time);
+#endif
+ ret_val = motion_mode_rd(cpi, tile_data, x, bsize, rd_stats, rd_stats_y,
rd_stats_uv, disable_skip, mi_row, mi_col,
- args, ref_best_rd, refs, &rate_mv, &orig_dst);
+ args, ref_best_rd, refs, &rate_mv, &orig_dst,
+ best_est_rd, do_tx_search, inter_modes_info);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, motion_mode_rd_time);
#endif
}
mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
@@ -10019,10 +10623,10 @@ static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
}
static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
- RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
int64_t best_rd) {
const AV1_COMMON *const cm = &cpi->common;
- if (!av1_allow_intrabc(cm)) return INT64_MAX;
+ if (!av1_allow_intrabc(cm) || !cpi->oxcf.enable_intrabc) return INT64_MAX;
const int num_planes = av1_num_planes(cm);
MACROBLOCKD *const xd = &x->e_mbd;
@@ -10074,7 +10678,7 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
};
MB_MODE_INFO best_mbmi = *mbmi;
- RD_STATS best_rdcost = *rd_cost;
+ RD_STATS best_rdstats = *rd_stats;
int best_skip = x->skip;
uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
@@ -10118,17 +10722,18 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
MV mvp_full = dv_ref.as_mv;
mvp_full.col >>= 3;
mvp_full.row >>= 3;
- int sadpb = x->sadperbit16;
+ const int sadpb = x->sadperbit16;
int cost_list[5];
- int bestsme = av1_full_pixel_search(
+ const int bestsme = av1_full_pixel_search(
cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
- (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1);
+ (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1,
+ &cpi->ss_cfg[SS_CFG_LOOKAHEAD]);
x->mv_limits = tmp_mv_limits;
if (bestsme == INT_MAX) continue;
mvp_full = x->best_mv.as_mv;
- MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 };
+ const MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 };
if (mv_check_bounds(&x->mv_limits, &dv)) continue;
if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
cm->seq_params.mib_size_log2))
@@ -10147,74 +10752,39 @@ static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
mbmi->skip = 0;
x->skip = 0;
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
int *dvcost[2] = { (int *)&cpi->dv_cost[0][MV_MAX],
(int *)&cpi->dv_cost[1][MV_MAX] };
// TODO(aconverse@google.com): The full motion field defining discount
// in MV_COST_WEIGHT is too large. Explore other values.
- int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost,
- dvcost, MV_COST_WEIGHT_SUB);
+ const int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost,
+ dvcost, MV_COST_WEIGHT_SUB);
const int rate_mode = x->intrabc_cost[1];
- RD_STATS rd_stats, rd_stats_uv;
- av1_subtract_plane(x, bsize, 0);
- if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
- // Intrabc
- select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
- } else {
- super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
- memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
- for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
- set_blk_skip(x, 0, i, rd_stats.skip);
- }
- if (num_planes > 1) {
- super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
- av1_merge_rd_stats(&rd_stats, &rd_stats_uv);
- }
-#if CONFIG_RD_DEBUG
- mbmi->rd_stats = rd_stats;
-#endif
-
- const int skip_ctx = av1_get_skip_context(xd);
-
- RD_STATS rdc_noskip;
- av1_init_rd_stats(&rdc_noskip);
- rdc_noskip.rate =
- rate_mode + rate_mv + rd_stats.rate + x->skip_cost[skip_ctx][0];
- rdc_noskip.dist = rd_stats.dist;
- rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist);
- if (rdc_noskip.rdcost < best_rd) {
- best_rd = rdc_noskip.rdcost;
+ RD_STATS rd_stats_yuv, rd_stats_y, rd_stats_uv;
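+ // txfm_search() now performs both the skip and no-skip RD evaluations
+ // that were previously open-coded here.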
+ if (!txfm_search(cpi, NULL, x, bsize, mi_row, mi_col, &rd_stats_yuv,
+ &rd_stats_y, &rd_stats_uv, rate_mode + rate_mv, INT64_MAX))
+ continue;
+ rd_stats_yuv.rdcost =
+ RDCOST(x->rdmult, rd_stats_yuv.rate, rd_stats_yuv.dist);
+ if (rd_stats_yuv.rdcost < best_rd) {
+ best_rd = rd_stats_yuv.rdcost;
best_mbmi = *mbmi;
- best_skip = x->skip;
- best_rdcost = rdc_noskip;
+ best_skip = mbmi->skip;
+ best_rdstats = rd_stats_yuv;
memcpy(best_blk_skip, x->blk_skip,
sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
}
-
- if (!xd->lossless[mbmi->segment_id]) {
- x->skip = 1;
- mbmi->skip = 1;
- RD_STATS rdc_skip;
- av1_init_rd_stats(&rdc_skip);
- rdc_skip.rate = rate_mode + rate_mv + x->skip_cost[skip_ctx][1];
- rdc_skip.dist = rd_stats.sse;
- rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist);
- if (rdc_skip.rdcost < best_rd) {
- best_rd = rdc_skip.rdcost;
- best_mbmi = *mbmi;
- best_skip = x->skip;
- best_rdcost = rdc_skip;
- memcpy(best_blk_skip, x->blk_skip,
- sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
- }
- }
}
*mbmi = best_mbmi;
- *rd_cost = best_rdcost;
+ *rd_stats = best_rdstats;
x->skip = best_skip;
memcpy(x->blk_skip, best_blk_skip,
sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+#if CONFIG_RD_DEBUG
+ mbmi->rd_stats = *rd_stats;
+#endif
return best_rd;
}
@@ -10340,15 +10910,6 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
int above_stride, const uint8_t *left,
int left_stride);
-static const int ref_frame_flag_list[REF_FRAMES] = { 0,
- AOM_LAST_FLAG,
- AOM_LAST2_FLAG,
- AOM_LAST3_FLAG,
- AOM_GOLD_FLAG,
- AOM_BWD_FLAG,
- AOM_ALT2_FLAG,
- AOM_ALT_FLAG };
-
static void rd_pick_skip_mode(RD_STATS *rd_cost,
InterModeSearchState *search_state,
const AV1_COMP *const cpi, MACROBLOCK *const x,
@@ -10381,6 +10942,10 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
return;
}
+ if (!cpi->oxcf.enable_onesided_comp && cpi->all_one_sided_refs) {
+ return;
+ }
+
mbmi->mode = this_mode;
mbmi->uv_mode = UV_DC_PRED;
mbmi->ref_frame[0] = ref_frame;
@@ -10437,7 +11002,8 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
rd_cost->dist)
: INT64_MAX;
- if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost) {
+ if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost &&
+ (!xd->lossless[mbmi->segment_id] || skip_mode_rd_stats.dist == 0)) {
assert(mode_index != -1);
search_state->best_mbmode.skip_mode = 1;
search_state->best_mbmode = *mbmi;
@@ -10483,13 +11049,6 @@ static void rd_pick_skip_mode(RD_STATS *rd_cost,
rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist;
rd_cost->rdcost = skip_mode_rd_stats.rdcost;
-#if CONFIG_ONE_PASS_SVM
- if (bsize >= BLOCK_8X8 &&
- block_size_high[bsize] == block_size_wide[bsize]) {
- av1_copy_reg_stat(rd_cost, &skip_mode_rd_stats);
- }
-#endif
-
search_state->best_rd = rd_cost->rdcost;
search_state->best_skip2 = 1;
search_state->best_mode_skippable = 1;
@@ -10539,15 +11098,15 @@ static void sf_refine_fast_tx_type_search(
}
if (is_inter_mode(mbmi->mode)) {
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
if (mbmi->motion_mode == OBMC_CAUSAL)
av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
av1_subtract_plane(x, bsize, 0);
if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
- // av1_rd_pick_inter_mode_sb
- select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col,
- INT64_MAX);
+ pick_tx_size_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col,
+ INT64_MAX);
assert(rd_stats_y.rate != INT_MAX);
} else {
super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
@@ -10555,19 +11114,14 @@ static void sf_refine_fast_tx_type_search(
for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
set_blk_skip(x, 0, i, rd_stats_y.skip);
}
- if (num_planes > 1) {
- inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, INT64_MAX,
- FTXS_NONE);
- } else {
- av1_init_rd_stats(&rd_stats_uv);
- }
} else {
super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
- if (num_planes > 1) {
- super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
- } else {
- av1_init_rd_stats(&rd_stats_uv);
- }
+ }
+
+ if (num_planes > 1) {
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ } else {
+ av1_init_rd_stats(&rd_stats_uv);
}
if (RDCOST(x->rdmult,
@@ -10602,13 +11156,193 @@ static void sf_refine_fast_tx_type_search(
}
}
+typedef struct {
+ // Mask for each reference frame, specifying which prediction modes to NOT try
+ // during search.
+ uint32_t pred_modes[REF_FRAMES];
+ // If ref_combo[i][j + 1] is true, do NOT try prediction using combination of
+ // reference frames (i, j).
+ // Note: indexing with 'j + 1' is due to the fact that 2nd reference can be -1
+ // (NONE_FRAME).
+ bool ref_combo[REF_FRAMES][REF_FRAMES + 1];
+} mode_skip_mask_t;
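+// A mode with references (ref0, ref1) is skipped when either the mode's bit
+// is set in pred_modes[ref0] or ref_combo[ref0][ref1 + 1] is true; see
+// mask_says_skip() below.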
+
+// Update 'ref_combo' mask to disable given 'ref' in single and compound modes.
+static void disable_reference(MV_REFERENCE_FRAME ref,
+ bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
+ for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) {
+ ref_combo[ref][ref2 + 1] = true;
+ }
+}
+
+// Update 'ref_combo' mask to disable all inter references except ALTREF.
+static void disable_inter_references_except_altref(
+ bool ref_combo[REF_FRAMES][REF_FRAMES + 1]) {
+ disable_reference(LAST_FRAME, ref_combo);
+ disable_reference(LAST2_FRAME, ref_combo);
+ disable_reference(LAST3_FRAME, ref_combo);
+ disable_reference(GOLDEN_FRAME, ref_combo);
+ disable_reference(BWDREF_FRAME, ref_combo);
+ disable_reference(ALTREF2_FRAME, ref_combo);
+}
+
+static const MV_REFERENCE_FRAME reduced_ref_combos[][2] = {
+ { LAST_FRAME, NONE_FRAME }, { ALTREF_FRAME, NONE_FRAME },
+ { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, NONE_FRAME },
+ { INTRA_FRAME, NONE_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME },
+ { LAST_FRAME, GOLDEN_FRAME }, { LAST_FRAME, INTRA_FRAME },
+ { LAST_FRAME, BWDREF_FRAME }, { LAST_FRAME, LAST3_FRAME },
+ { GOLDEN_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, INTRA_FRAME },
+ { BWDREF_FRAME, NONE_FRAME }, { BWDREF_FRAME, ALTREF_FRAME },
+ { ALTREF_FRAME, INTRA_FRAME }, { BWDREF_FRAME, INTRA_FRAME },
+};
+
+static const MV_REFERENCE_FRAME real_time_ref_combos[][2] = {
+ { LAST_FRAME, NONE_FRAME },
+ { ALTREF_FRAME, NONE_FRAME },
+ { GOLDEN_FRAME, NONE_FRAME },
+ { INTRA_FRAME, NONE_FRAME }
+};
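+// Each entry in the two tables above is a (first, second) reference pair to
+// keep enabled: NONE_FRAME as the second entry denotes a single-reference
+// (or intra) mode, and INTRA_FRAME as the second entry denotes inter-intra
+// compound prediction.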
+
+typedef enum { REF_SET_FULL, REF_SET_REDUCED, REF_SET_REALTIME } REF_SET;
+
+static void default_skip_mask(mode_skip_mask_t *mask, REF_SET ref_set) {
+ if (ref_set == REF_SET_FULL) {
+ // Everything available by default.
+ memset(mask, 0, sizeof(*mask));
+ } else {
+ // All modes available by default.
+ memset(mask->pred_modes, 0, sizeof(mask->pred_modes));
+ // All references disabled first.
+ for (MV_REFERENCE_FRAME ref1 = INTRA_FRAME; ref1 < REF_FRAMES; ++ref1) {
+ for (MV_REFERENCE_FRAME ref2 = NONE_FRAME; ref2 < REF_FRAMES; ++ref2) {
+ mask->ref_combo[ref1][ref2 + 1] = true;
+ }
+ }
+ const MV_REFERENCE_FRAME(*ref_set_combos)[2];
+ int num_ref_combos;
+
+ // Then enable reduced set of references explicitly.
+ switch (ref_set) {
+ case REF_SET_REDUCED:
+ ref_set_combos = reduced_ref_combos;
+ num_ref_combos =
+ (int)sizeof(reduced_ref_combos) / sizeof(reduced_ref_combos[0]);
+ break;
+ case REF_SET_REALTIME:
+ ref_set_combos = real_time_ref_combos;
+ num_ref_combos =
+ (int)sizeof(real_time_ref_combos) / sizeof(real_time_ref_combos[0]);
+ break;
+ default: assert(0); num_ref_combos = 0;
+ }
+
+ for (int i = 0; i < num_ref_combos; ++i) {
+ const MV_REFERENCE_FRAME *const this_combo = ref_set_combos[i];
+ mask->ref_combo[this_combo[0]][this_combo[1] + 1] = false;
+ }
+ }
+}
+
+static void init_mode_skip_mask(mode_skip_mask_t *mask, const AV1_COMP *cpi,
+ MACROBLOCK *x, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ unsigned char segment_id = mbmi->segment_id;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ REF_SET ref_set = REF_SET_FULL;
+
+ if (sf->use_real_time_ref_set)
+ ref_set = REF_SET_REALTIME;
+ else if (cpi->oxcf.enable_reduced_reference_set)
+ ref_set = REF_SET_REDUCED;
+
+ default_skip_mask(mask, ref_set);
+
+ int min_pred_mv_sad = INT_MAX;
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+ min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame])) {
+ // Skip checking this missing reference in both single and compound
+ // reference modes.
+ disable_reference(ref_frame, mask->ref_combo);
+ } else {
+ // Skip fixed mv modes for poor references
+ if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) {
+ mask->pred_modes[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ }
+ }
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ // Reference not used for the segment.
+ disable_reference(ref_frame, mask->ref_combo);
+ }
+ }
+ // Note: We use the following drop-out only if the SEG_LVL_REF_FRAME feature
+ // is disabled for this segment. This is to prevent the possibility that we
+ // end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+ disable_inter_references_except_altref(mask->ref_combo);
+
+ mask->pred_modes[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+ const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
+ int_mv near_mv, nearest_mv, global_mv;
+ get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+ get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+ get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+
+ if (near_mv.as_int != global_mv.as_int)
+ mask->pred_modes[ALTREF_FRAME] |= (1 << NEARMV);
+ if (nearest_mv.as_int != global_mv.as_int)
+ mask->pred_modes[ALTREF_FRAME] |= (1 << NEARESTMV);
+ }
+ }
+
+ if (cpi->rc.is_src_frame_alt_ref) {
+ if (sf->alt_ref_search_fp) {
+ assert(cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]);
+ mask->pred_modes[ALTREF_FRAME] = 0;
+ disable_inter_references_except_altref(mask->ref_combo);
+ disable_reference(INTRA_FRAME, mask->ref_combo);
+ }
+ }
+
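+ // Disable all inter modes for ALTREF when its prediction SAD is more than
+ // twice that of GOLDEN on non-shown frames (fast alt-ref search heuristic).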
+ if (sf->alt_ref_search_fp)
+ if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
+ if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+ mask->pred_modes[ALTREF_FRAME] |= INTER_ALL;
+
+ if (sf->adaptive_mode_search) {
+ if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
+ cpi->rc.frames_since_golden >= 3)
+ if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME])
+ mask->pred_modes[GOLDEN_FRAME] |= INTER_ALL;
+ }
+
+ if (bsize > sf->max_intra_bsize) {
+ disable_reference(INTRA_FRAME, mask->ref_combo);
+ }
+
+ mask->pred_modes[INTRA_FRAME] |=
+ ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+}
+
// Please add/modify parameter setting in this function, making it consistent
// and easy to read and maintain.
static void set_params_rd_pick_inter_mode(
const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
- BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2],
- uint32_t mode_skip_mask[REF_FRAMES], int skip_ref_frame_mask,
- unsigned int ref_costs_single[REF_FRAMES],
+ BLOCK_SIZE bsize, int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask,
+ int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES],
unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
const AV1_COMMON *const cm = &cpi->common;
@@ -10616,8 +11350,6 @@ static void set_params_rd_pick_inter_mode(
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
- const struct segmentation *const seg = &cm->seg;
- const SPEED_FEATURES *const sf = &cpi->sf;
unsigned char segment_id = mbmi->segment_id;
int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
@@ -10629,7 +11361,7 @@ static void set_params_rd_pick_inter_mode(
for (int i = 0; i < MB_MODE_COUNT; ++i)
for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
int len = sizeof(uint16_t);
args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
args->above_pred_buf[1] =
@@ -10659,9 +11391,8 @@ static void set_params_rd_pick_inter_mode(
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
x->pred_mv_sad[ref_frame] = INT_MAX;
x->mbmi_ext->mode_context[ref_frame] = 0;
- x->mbmi_ext->compound_mode_context[ref_frame] = 0;
mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
- if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) {
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
if (mbmi->partition != PARTITION_NONE &&
mbmi->partition != PARTITION_SPLIT) {
if (skip_ref_frame_mask & (1 << ref_frame)) {
@@ -10678,7 +11409,7 @@ static void set_params_rd_pick_inter_mode(
if (skip) continue;
}
}
- assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+ assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
yv12_mb);
}
@@ -10688,8 +11419,8 @@ static void set_params_rd_pick_inter_mode(
x->mbmi_ext->mode_context[ref_frame] = 0;
mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
- if (!((cpi->ref_frame_flags & ref_frame_flag_list[rf[0]]) &&
- (cpi->ref_frame_flags & ref_frame_flag_list[rf[1]]))) {
+ if (!((cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[0]]) &&
+ (cpi->ref_frame_flags & av1_ref_frame_flag_list[rf[1]]))) {
continue;
}
@@ -10722,93 +11453,122 @@ static void set_params_rd_pick_inter_mode(
args->left_pred_stride[0]);
}
- int min_pred_mv_sad = INT_MAX;
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
- min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+ init_mode_skip_mask(mode_skip_mask, cpi, x, bsize);
- for (int i = 0; i < 2; ++i) {
- ref_frame_skip_mask[i] = 0;
- }
- memset(mode_skip_mask, 0, REF_FRAMES * sizeof(*mode_skip_mask));
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])) {
- // Skip checking missing references in both single and compound reference
- // modes. Note that a mode will be skipped iff both reference frames
- // are masked out.
- ref_frame_skip_mask[0] |= (1 << ref_frame);
- ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
- } else {
- // Skip fixed mv modes for poor references
- if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) {
- mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
- }
- }
- // If the segment reference frame feature is enabled....
- // then do nothing if the current ref frame is not allowed..
- if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
- get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
- ref_frame_skip_mask[0] |= (1 << ref_frame);
- ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
- }
+ if (cpi->sf.tx_type_search.fast_intra_tx_type_search ||
+ cpi->oxcf.use_intra_default_tx_only)
+ x->use_default_intra_tx_type = 1;
+ else
+ x->use_default_intra_tx_type = 0;
+
+ if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
+ x->use_default_inter_tx_type = 1;
+ else
+ x->use_default_inter_tx_type = 0;
+ if (cpi->sf.skip_repeat_interpolation_filter_search) {
+ x->interp_filter_stats_idx[0] = 0;
+ x->interp_filter_stats_idx[1] = 0;
}
+ x->comp_rd_stats_idx = 0;
+}
- // Disable this drop out case if the ref frame
- // segment level feature is enabled for this segment. This is to
- // prevent the possibility that we end up unable to pick any mode.
- if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
- // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame,
- // unless ARNR filtering is enabled in which case we want
- // an unfiltered alternative. We allow near/nearest as well
- // because they may result in zero-zero MVs but be cheaper.
- if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
- ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << LAST2_FRAME) |
- (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) |
- (1 << ALTREF2_FRAME) | (1 << GOLDEN_FRAME);
- ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
- // TODO(zoeliu): To further explore whether following needs to be done for
- // BWDREF_FRAME as well.
- mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
- const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
- int_mv near_mv, nearest_mv, global_mv;
- get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
- get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
- get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+// TODO(kyslov): This is now very similar to set_params_rd_pick_inter_mode
+// (except that it doesn't set ALTREF parameters); consider passing a flag to
+// select the non-rd path (similar to encode_sb_row).
+static void set_params_nonrd_pick_inter_mode(
+ const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask,
+ int skip_ref_frame_mask, unsigned int ref_costs_single[REF_FRAMES],
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ unsigned char segment_id = mbmi->segment_id;
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- if (near_mv.as_int != global_mv.as_int)
- mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
- if (nearest_mv.as_int != global_mv.as_int)
- mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
- }
- }
+ for (int i = 0; i < MB_MODE_COUNT; ++i)
+ for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE;
- if (cpi->rc.is_src_frame_alt_ref) {
- if (sf->alt_ref_search_fp) {
- assert(cpi->ref_frame_flags & ref_frame_flag_list[ALTREF_FRAME]);
- mode_skip_mask[ALTREF_FRAME] = 0;
- ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
- ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
- }
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
+ args->above_pred_buf[1] =
+ CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+ args->above_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
+ args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
+ args->left_pred_buf[1] =
+ CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+ args->left_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
+ } else {
+ args->above_pred_buf[0] = x->above_pred_buf;
+ args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1);
+ args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE;
+ args->left_pred_buf[0] = x->left_pred_buf;
+ args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1);
+ args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE;
}
- if (sf->alt_ref_search_fp)
- if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
- if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
- mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+ av1_collect_neighbors_ref_counts(xd);
- if (sf->adaptive_mode_search) {
- if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
- cpi->rc.frames_since_golden >= 3)
- if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME])
- mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
- }
+ estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+ ref_costs_comp);
- if (bsize > sf->max_intra_bsize) {
- ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
- ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ x->mbmi_ext->mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) {
+ if (mbmi->partition != PARTITION_NONE &&
+ mbmi->partition != PARTITION_SPLIT) {
+ if (skip_ref_frame_mask & (1 << ref_frame)) {
+ int skip = 1;
+ for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+ if (!(skip_ref_frame_mask & (1 << r))) {
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+ if (rf[0] == ref_frame || rf[1] == ref_frame) {
+ skip = 0;
+ break;
+ }
+ }
+ }
+ if (skip) continue;
+ }
+ }
+ assert(get_ref_frame_yv12_buf(cm, ref_frame) != NULL);
+ setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+ yv12_mb);
+ }
}
+ av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
- mode_skip_mask[INTRA_FRAME] |=
- ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+ if (check_num_overlappable_neighbors(mbmi) &&
+ is_motion_variation_allowed_bsize(bsize)) {
+ av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
+ args->above_pred_buf, dst_width1,
+ dst_height1, args->above_pred_stride);
+ av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
+ args->left_pred_buf, dst_width2,
+ dst_height2, args->left_pred_stride);
+ av1_setup_dst_planes(xd->plane, bsize, &cm->cur_frame->buf, mi_row, mi_col,
+ 0, num_planes);
+ calc_target_weighted_pred(
+ cm, x, xd, mi_row, mi_col, args->above_pred_buf[0],
+ args->above_pred_stride[0], args->left_pred_buf[0],
+ args->left_pred_stride[0]);
+ }
+ init_mode_skip_mask(mode_skip_mask, cpi, x, bsize);
if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
x->use_default_intra_tx_type = 1;
@@ -10900,9 +11660,6 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
rate2 -= rd_stats_y.rate;
if (num_planes > 1) rate2 -= search_state->rate_uv_tokenonly[uv_tx];
rate2 += x->skip_cost[av1_get_skip_context(xd)][1];
-#if CONFIG_ONE_PASS_SVM
- av1_reg_stat_skipmode_update(&rd_stats_y, x->rdmult);
-#endif
} else {
rate2 += x->skip_cost[av1_get_skip_context(xd)][0];
}
@@ -10919,9 +11676,6 @@ static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
search_state->best_mode_skippable = skippable;
memcpy(ctx->blk_skip, x->blk_skip,
sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-#if CONFIG_ONE_PASS_SVM
- av1_copy_reg_stat(rd_cost, &rd_stats_y);
-#endif
}
}
@@ -11016,32 +11770,89 @@ static void init_inter_mode_search_state(InterModeSearchState *search_state,
av1_zero(search_state->single_state_modelled_cnt);
}
+static bool mask_says_skip(const mode_skip_mask_t *mode_skip_mask,
+ const MV_REFERENCE_FRAME *ref_frame,
+ const PREDICTION_MODE this_mode) {
+ if (mode_skip_mask->pred_modes[ref_frame[0]] & (1 << this_mode)) {
+ return true;
+ }
+
+ return mode_skip_mask->ref_combo[ref_frame[0]][ref_frame[1] + 1];
+}
+
+static int inter_mode_compatible_skip(const AV1_COMP *cpi, const MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mode_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
+ const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+ const CurrentFrame *const current_frame = &cm->current_frame;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const unsigned char segment_id = mbmi->segment_id;
+ const int comp_pred = ref_frame[1] > INTRA_FRAME;
+
+ if (comp_pred) {
+ if (frame_is_intra_only(cm)) return 1;
+
+ if (current_frame->reference_mode == SINGLE_REFERENCE) return 1;
+
+ // Skip compound inter modes if the second reference frame is not available.
+ if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[1]]))
+ return 1;
+
+ // Do not allow compound prediction if the segment level reference frame
+ // feature is in use as in this case there can only be one reference.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
+
+ if (!is_comp_ref_allowed(bsize)) return 1;
+ }
+
+ if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) {
+ // Mode must be compatible
+ if (!is_interintra_allowed_mode(this_mode)) return 1;
+ if (!is_interintra_allowed_bsize(bsize)) return 1;
+ }
+
+ return 0;
+}
+
+static int fetch_picked_ref_frames_mask(const MACROBLOCK *const x,
+ BLOCK_SIZE bsize, int mib_size,
+ int mi_row, int mi_col) {
+ const int sb_size_mask = mib_size - 1;
+ const int mi_row_in_sb = mi_row & sb_size_mask;
+ const int mi_col_in_sb = mi_col & sb_size_mask;
+ const int mi_w = mi_size_wide[bsize];
+ const int mi_h = mi_size_high[bsize];
+ int picked_ref_frames_mask = 0;
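+ // Accumulate the masks of all mi units covered by this block. The
+ // per-superblock mask array is assumed to have a stride of 32 mi units
+ // (a 128x128 superblock spans 32 4x4 mi units per row).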
+ for (int i = mi_row_in_sb; i < mi_row_in_sb + mi_h; ++i) {
+ for (int j = mi_col_in_sb; j < mi_col_in_sb + mi_w; ++j) {
+ picked_ref_frames_mask |= x->picked_ref_frames_mask[i * 32 + j];
+ }
+ }
+ return picked_ref_frames_mask;
+}
+
// Case 1: return 0, means don't skip this mode
// Case 2: return 1, means skip this mode completely
// Case 3: return 2, means skip compound only, but still try single motion modes
static int inter_mode_search_order_independent_skip(
- const AV1_COMP *cpi, const PICK_MODE_CONTEXT *ctx, const MACROBLOCK *x,
- BLOCK_SIZE bsize, int mode_index, int mi_row, int mi_col,
- uint32_t *mode_skip_mask, uint16_t *ref_frame_skip_mask,
- InterModeSearchState *search_state) {
+ const AV1_COMP *cpi, const MACROBLOCK *x, BLOCK_SIZE bsize, int mode_index,
+ int mi_row, int mi_col, mode_skip_mask_t *mode_skip_mask,
+ InterModeSearchState *search_state, int skip_ref_frame_mask) {
const SPEED_FEATURES *const sf = &cpi->sf;
const AV1_COMMON *const cm = &cpi->common;
- const struct segmentation *const seg = &cm->seg;
const OrderHintInfo *const order_hint_info = &cm->seq_params.order_hint_info;
const CurrentFrame *const current_frame = &cm->current_frame;
const MACROBLOCKD *const xd = &x->e_mbd;
const MB_MODE_INFO *const mbmi = xd->mi[0];
- const unsigned char segment_id = mbmi->segment_id;
const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+ const int comp_pred = ref_frame[1] > INTRA_FRAME;
int skip_motion_mode = 0;
- if (mode_skip_mask[ref_frame[0]] & (1 << this_mode)) {
- return 1;
- }
-
- if ((ref_frame_skip_mask[0] & (1 << ref_frame[0])) &&
- (ref_frame_skip_mask[1] & (1 << AOMMAX(0, ref_frame[1])))) {
+ if (mask_says_skip(mode_skip_mask, ref_frame, this_mode)) {
return 1;
}
@@ -11053,14 +11864,14 @@ static int inter_mode_search_order_independent_skip(
if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
const int ref_type = av1_ref_frame_type(ref_frame);
- int skip_ref = ctx->skip_ref_frame_mask & (1 << ref_type);
+ int skip_ref = skip_ref_frame_mask & (1 << ref_type);
if (ref_type <= ALTREF_FRAME && skip_ref) {
// Since the compound ref modes depend on the motion estimation results of
// two single ref modes (the best MVs of the single ref modes serve as the
// start point), if the current single ref mode is marked skip, we need to
// check whether it will be used in compound ref modes.
for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
- if (!(ctx->skip_ref_frame_mask & (1 << r))) {
+ if (!(skip_ref_frame_mask & (1 << r))) {
const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
if (rf[0] == ref_type || rf[1] == ref_type) {
// Found a not skipped compound ref mode which contains current
@@ -11077,8 +11888,7 @@ static int inter_mode_search_order_independent_skip(
if (skip_ref) return 1;
}
- if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
- !x->cb_partition_scan) {
+ if (cpi->two_pass_partition_search && !x->cb_partition_scan) {
const int mi_width = mi_size_wide[bsize];
const int mi_height = mi_size_high[bsize];
int found = 0;
@@ -11101,12 +11911,6 @@ static int inter_mode_search_order_independent_skip(
if (!found) return 1;
}
- if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) {
- // Mode must by compatible
- if (!is_interintra_allowed_mode(this_mode)) return 1;
- if (!is_interintra_allowed_bsize(bsize)) return 1;
- }
-
// This is only used in motion vector unit test.
if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME)
return 1;
@@ -11121,22 +11925,6 @@ static int inter_mode_search_order_independent_skip(
x->source_variance < skip_intra_var_thresh)
return 1;
}
- } else {
- if (!is_comp_ref_allowed(bsize) && ref_frame[1] > INTRA_FRAME) return 1;
- }
-
- const int comp_pred = ref_frame[1] > INTRA_FRAME;
- if (comp_pred) {
- if (!cpi->allow_comp_inter_inter) return 1;
-
- if (current_frame->reference_mode == SINGLE_REFERENCE) return 1;
-
- // Skip compound inter modes if ARF is not available.
- if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]])) return 1;
-
- // Do not allow compound prediction if the segment level reference frame
- // feature is in use as in this case there can only be one reference.
- if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
}
if (sf->selective_ref_frame) {
@@ -11176,8 +11964,7 @@ static int inter_mode_search_order_independent_skip(
if ((sf->selective_ref_frame >= 2) && comp_pred && !cpi->all_one_sided_refs) {
unsigned int ref_offsets[2];
for (int i = 0; i < 2; ++i) {
- const RefCntBuffer *const buf =
- cm->current_frame.frame_refs[ref_frame[i] - LAST_FRAME].buf;
+ const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame[i]);
assert(buf != NULL);
ref_offsets[i] = buf->order_hint;
}
@@ -11192,12 +11979,57 @@ static int inter_mode_search_order_independent_skip(
return 1;
}
+ if (sf->selective_ref_frame >= 4 && comp_pred) {
+ // Check if one of the references is ALTREF2_FRAME and BWDREF_FRAME is a
+ // valid reference.
+ if ((ref_frame[0] == ALTREF2_FRAME || ref_frame[1] == ALTREF2_FRAME) &&
+ (cpi->ref_frame_flags & av1_ref_frame_flag_list[BWDREF_FRAME])) {
+ // Check if both ALTREF2_FRAME and BWDREF_FRAME are future references.
+ if ((get_relative_dist(
+ order_hint_info,
+ cm->cur_frame->ref_order_hints[ALTREF2_FRAME - LAST_FRAME],
+ current_frame->order_hint) > 0) &&
+ (get_relative_dist(
+ order_hint_info,
+ cm->cur_frame->ref_order_hints[BWDREF_FRAME - LAST_FRAME],
+ current_frame->order_hint) > 0)) {
+ // Drop ALTREF2_FRAME as a reference if BWDREF_FRAME is a closer
+ // reference to the current frame than ALTREF2_FRAME
+ if (get_relative_dist(
+ order_hint_info,
+ cm->cur_frame->ref_order_hints[ALTREF2_FRAME - LAST_FRAME],
+ cm->cur_frame->ref_order_hints[BWDREF_FRAME - LAST_FRAME]) >=
+ 0) {
+ const RefCntBuffer *const buf_arf2 =
+ get_ref_frame_buf(cm, ALTREF2_FRAME);
+ assert(buf_arf2 != NULL);
+ const RefCntBuffer *const buf_bwd =
+ get_ref_frame_buf(cm, BWDREF_FRAME);
+ assert(buf_bwd != NULL);
+ (void)buf_arf2;
+ (void)buf_bwd;
+ return 1;
+ }
+ }
+ }
+ }
+
if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) {
return 1;
}
if (skip_motion_mode) {
return 2;
}
+
+ if (!cpi->oxcf.enable_global_motion &&
+ (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV)) {
+ return 1;
+ }
+
+ if (!cpi->oxcf.enable_onesided_comp && comp_pred && cpi->all_one_sided_refs) {
+ return 1;
+ }
+
return 0;
}
@@ -11233,6 +12065,7 @@ static int64_t handle_intra_mode(InterModeSearchState *search_state,
assert(mbmi->ref_frame[0] == INTRA_FRAME);
PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
const int try_palette =
+ cpi->oxcf.enable_palette &&
av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
const int intra_cost_penalty = av1_get_intra_cost_penalty(
@@ -11255,14 +12088,14 @@ static int64_t handle_intra_mode(InterModeSearchState *search_state,
TX_SIZE uv_tx;
int is_directional_mode = av1_is_directional_mode(mbmi->mode);
- if (is_directional_mode && av1_use_angle_delta(bsize)) {
+ if (is_directional_mode && av1_use_angle_delta(bsize) &&
+ cpi->oxcf.enable_angle_delta) {
int rate_dummy;
int64_t model_rd = INT64_MAX;
if (sf->intra_angle_estimation && !search_state->angle_stats_ready) {
const int src_stride = x->plane[0].src.stride;
const uint8_t *src = x->plane[0].src.buf;
- angle_estimation(src, src_stride, rows, cols, bsize,
- xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH,
+ angle_estimation(src, src_stride, rows, cols, bsize, is_cur_buf_hbd(xd),
search_state->directional_mode_skip_mask);
search_state->angle_stats_ready = 1;
}
@@ -11795,6 +12628,16 @@ static void release_compound_type_rd_buffers(
av1_zero(*bufs); // Set all pointers to NULL for safety.
}
+// Enables do_tx_search on a per-mode basis.
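+// Called per mode in av1_rd_pick_inter_mode_sb() below to decide whether a
+// mode gets a full transform search or only the estimated RD.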
+static int do_tx_search_mode(int do_tx_search_global, int midx, int adaptive) {
+ if (!adaptive || do_tx_search_global) {
+ return do_tx_search_global;
+ }
+ // A value of 2 indicates it is being turned on conditionally
+ // for the mode. Turn it on for the first 7 modes.
+ return midx < 7 ? 2 : 0;
+}
+
void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
MACROBLOCK *x, int mi_row, int mi_col,
RD_STATS *rd_cost, BLOCK_SIZE bsize,
@@ -11805,6 +12648,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = xd->mi[0];
const int try_palette =
+ cpi->oxcf.enable_palette &&
av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
const struct segmentation *const seg = &cm->seg;
@@ -11815,16 +12659,8 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
unsigned int ref_costs_single[REF_FRAMES];
unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
- int *mode_map = tile_data->mode_map[bsize];
- uint32_t mode_skip_mask[REF_FRAMES];
- uint16_t ref_frame_skip_mask[2];
+ mode_skip_mask_t mode_skip_mask;
uint8_t motion_mode_skip_mask = 0; // second pass of single ref modes
-#if CONFIG_ONE_PASS_SVM
- int temp_y_eob = 0, temp_y_eob_0 = 0, temp_y_eob_1 = 0, temp_y_eob_2 = 0,
- temp_y_eob_3 = 0;
- int64_t temp_y_rd = 0, temp_y_rd_0 = 0, temp_y_rd_1 = 0, temp_y_rd_2 = 0,
- temp_y_rd_3 = 0;
-#endif
InterModeSearchState search_state;
init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
@@ -11847,23 +12683,42 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
av1_invalid_rd_stats(rd_cost);
+ // Ref frames that are selected by square partition blocks.
+ int picked_ref_frames_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+ mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+ // prune_ref_frame_for_rect_partitions = 1 implies pruning only for
+ // extended partition blocks. prune_ref_frame_for_rect_partitions >= 2
+ // implies pruning for vert, horiz and extended partition blocks.
+ if ((mbmi->partition != PARTITION_VERT &&
+ mbmi->partition != PARTITION_HORZ) ||
+ cpi->sf.prune_ref_frame_for_rect_partitions >= 2) {
+ picked_ref_frames_mask = fetch_picked_ref_frames_mask(
+ x, bsize, cm->seq_params.mib_size, mi_row, mi_col);
+ }
+ }
+
+ // Skip ref frames that were never selected by square blocks.
+ const int skip_ref_frame_mask =
+ picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
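+ // For example, if square blocks picked only LAST_FRAME and ALTREF_FRAME,
+ // the bits of every other reference end up set in skip_ref_frame_mask.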
+
// init params, set frame modes, speed features
- set_params_rd_pick_inter_mode(
- cpi, x, &args, bsize, mi_row, mi_col, ref_frame_skip_mask, mode_skip_mask,
- ctx->skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb);
+ set_params_rd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col,
+ &mode_skip_mask, skip_ref_frame_mask,
+ ref_costs_single, ref_costs_comp, yv12_mb);
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
int64_t best_est_rd = INT64_MAX;
// TODO(angiebird): Turn this on when this speed feature is well tested
-#if 1
const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
- const int do_tx_search = !md->ready;
-#else
- const int do_tx_search = 1;
-#endif
+ // If do_tx_search_global is 0, only the estimated RD is computed.
+ // If do_tx_search_global is 1, all modes have a TX search performed.
+ // A per-mode value of 2 (see do_tx_search_mode()) means the TX search is
+ // performed only for selected modes.
+ const int do_tx_search_global =
+ !((cpi->sf.inter_mode_rd_model_estimation == 1 && md->ready) ||
+ (cpi->sf.inter_mode_rd_model_estimation == 2 &&
+ x->source_variance < 512));
InterModesInfo *inter_modes_info = x->inter_modes_info;
inter_modes_info->num = 0;
-#endif
int intra_mode_num = 0;
int intra_mode_idx_ls[MAX_MODES];
@@ -11876,8 +12731,9 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
alloc_compound_type_rd_buffers(cm, &rd_buffers);
for (int midx = 0; midx < MAX_MODES; ++midx) {
- int mode_index = mode_map[midx];
- const MODE_DEFINITION *mode_order = &av1_mode_order[mode_index];
+ const int do_tx_search = do_tx_search_mode(
+ do_tx_search_global, midx, sf->inter_mode_rd_model_estimation_adaptive);
+ const MODE_DEFINITION *mode_order = &av1_mode_order[midx];
this_mode = mode_order->mode;
const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
@@ -11899,8 +12755,8 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
if (args.single_ref_first_pass) {
// clear stats
for (int k = 0; k < MAX_REF_MV_SERCH; ++k) {
- x->simple_rd_state[mode_index][k].rd_stats.rdcost = INT64_MAX;
- x->simple_rd_state[mode_index][k].early_skipped = 0;
+ x->simple_rd_state[midx][k].rd_stats.rdcost = INT64_MAX;
+ x->simple_rd_state[midx][k].early_skipped = 0;
}
} else {
if (motion_mode_skip_mask & (1 << ref_frame)) {
@@ -11923,14 +12779,16 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
int skippable = 0;
int this_skip2 = 0;
- init_mbmi(mbmi, mode_index, cm);
+ init_mbmi(mbmi, midx, cm);
x->skip = 0;
set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+ if (inter_mode_compatible_skip(cpi, x, bsize, midx)) continue;
+
const int ret = inter_mode_search_order_independent_skip(
- cpi, ctx, x, bsize, mode_index, mi_row, mi_col, mode_skip_mask,
- ref_frame_skip_mask, &search_state);
+ cpi, x, bsize, midx, mi_row, mi_col, &mode_skip_mask, &search_state,
+ skip_ref_frame_mask);
if (ret == 1) continue;
args.skip_motion_mode = (ret == 2);
@@ -11940,8 +12798,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
}
}
- if (search_state.best_rd < search_state.mode_threshold[mode_index])
- continue;
+ if (search_state.best_rd < search_state.mode_threshold[midx]) continue;
if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
if (compound_skip_by_single_states(cpi, &search_state, this_mode,
@@ -11967,7 +12824,12 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
}
if (ref_frame == INTRA_FRAME) {
- if (sf->adaptive_mode_search)
+ if ((!cpi->oxcf.enable_smooth_intra || sf->disable_smooth_intra) &&
+ (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+ mbmi->mode == SMOOTH_V_PRED))
+ continue;
+ if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
+ if (sf->adaptive_mode_search > 1)
if ((x->source_variance << num_pels_log2_lookup[bsize]) >
search_state.best_pred_sse)
continue;
@@ -11995,7 +12857,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
}
if (ref_frame == INTRA_FRAME) {
- intra_mode_idx_ls[intra_mode_num++] = mode_index;
+ intra_mode_idx_ls[intra_mode_num++] = midx;
continue;
} else {
mbmi->angle_delta[PLANE_TYPE_Y] = 0;
@@ -12014,30 +12876,25 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
args.single_newmv_valid = search_state.single_newmv_valid;
args.single_comp_cost = real_compmode_cost;
args.ref_frame_cost = ref_frame_cost;
- if (mode_index < MAX_SINGLE_REF_MODES) {
- args.simple_rd_state = x->simple_rd_state[mode_index];
+ if (midx < MAX_SINGLE_REF_MODES) {
+ args.simple_rd_state = x->simple_rd_state[midx];
}
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_inter_mode_time);
+#endif
this_rd = handle_inter_mode(
- cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip,
- mi_row, mi_col, &args, ref_best_rd, tmp_buf, &rd_buffers, tile_data,
- &best_est_rd, do_tx_search, inter_modes_info);
-#else
- this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
- &rd_stats_uv, &disable_skip, mi_row, mi_col,
- &args, ref_best_rd, tmp_buf, &rd_buffers);
+ cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+ &disable_skip, mi_row, mi_col, &args, ref_best_rd, tmp_buf,
+ &rd_buffers, &best_est_rd, do_tx_search, inter_modes_info);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_inter_mode_time);
#endif
rate2 = rd_stats.rate;
skippable = rd_stats.skip;
distortion2 = rd_stats.dist;
rate_y = rd_stats_y.rate;
rate_uv = rd_stats_uv.rate;
-#if CONFIG_ONE_PASS_SVM
- av1_unpack_reg_stat(&rd_stats_y, &temp_y_eob, &temp_y_eob_0,
- &temp_y_eob_1, &temp_y_eob_2, &temp_y_eob_3,
- &temp_y_rd, &temp_y_rd_0, &temp_y_rd_1,
- &temp_y_rd_2, &temp_y_rd_3);
-#endif
}
if (sf->prune_comp_search_by_single_result > 0 &&
@@ -12063,7 +12920,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
}
if (!mode_excluded) {
// Note index of best mode so far
- search_state.best_mode_index = mode_index;
+ search_state.best_mode_index = midx;
if (ref_frame == INTRA_FRAME) {
/* required for left and above block mv */
@@ -12079,7 +12936,6 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
search_state.best_mbmode = *mbmi;
search_state.best_skip2 = this_skip2;
search_state.best_mode_skippable = skippable;
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
if (do_tx_search) {
// When do_tx_search == 0, handle_inter_mode won't provide correct
// rate_y and rate_uv because txfm_search process is replaced by
@@ -12090,24 +12946,7 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
rate_y +
x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
search_state.best_rate_uv = rate_uv;
-
-#if CONFIG_ONE_PASS_SVM
- av1_set_reg_stat(rd_cost, temp_y_eob, temp_y_eob_0, temp_y_eob_1,
- temp_y_eob_2, temp_y_eob_3, temp_y_rd, temp_y_rd_0,
- temp_y_rd_1, temp_y_rd_2, temp_y_rd_3);
-#endif
}
-#else // CONFIG_COLLECT_INTER_MODE_RD_STATS
- search_state.best_rate_y =
- rate_y +
- x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
- search_state.best_rate_uv = rate_uv;
-#if CONFIG_ONE_PASS_SVM
- av1_set_reg_stat(rd_cost, temp_y_eob, temp_y_eob_0, temp_y_eob_1,
- temp_y_eob_2, temp_y_eob_3, temp_y_rd, temp_y_rd_0,
- temp_y_rd_1, temp_y_rd_2, temp_y_rd_3);
-#endif
-#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
memcpy(ctx->blk_skip, x->blk_skip,
sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
}
@@ -12148,51 +12987,67 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
release_compound_type_rd_buffers(&rd_buffers);
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
- if (!do_tx_search) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, do_tx_search_time);
+#endif
+ if (do_tx_search_global != 1) {
inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
search_state.best_rd = INT64_MAX;
int64_t top_est_rd =
- inter_modes_info->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx];
+ inter_modes_info->num > 0
+ ? inter_modes_info
+ ->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx]
+ : INT64_MAX;
for (int j = 0; j < inter_modes_info->num; ++j) {
const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
*mbmi = inter_modes_info->mbmi_arr[data_idx];
int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
- if (curr_est_rd * 0.9 > top_est_rd) {
- continue;
- }
- const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
-
- x->skip = 0;
- set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
-
- // Select prediction reference frames.
- const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
- for (i = 0; i < num_planes; i++) {
- xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
- if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
- }
+ if (curr_est_rd * 0.80 > top_est_rd) break;
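+      // Note: rd_idx_pair_arr is sorted by ascending estimated RD, so once an
+      // entry is more than 25% worse than the best estimate (curr * 0.80 >
+      // top, i.e. curr > 1.25 * top), all later entries are too; hence break
+      // rather than continue.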
RD_STATS rd_stats;
RD_STATS rd_stats_y;
RD_STATS rd_stats_uv;
- av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
- if (mbmi->motion_mode == OBMC_CAUSAL)
- av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
-
- if (!txfm_search(cpi, x, bsize, mi_row, mi_col, &rd_stats, &rd_stats_y,
- &rd_stats_uv, mode_rate, search_state.best_rd)) {
- continue;
+ bool true_rd = inter_modes_info->true_rd_arr[data_idx];
+ if (true_rd) {
+ rd_stats = inter_modes_info->rd_cost_arr[data_idx];
+ rd_stats_y = inter_modes_info->rd_cost_y_arr[data_idx];
+ rd_stats_uv = inter_modes_info->rd_cost_uv_arr[data_idx];
+ memcpy(x->blk_skip, inter_modes_info->blk_skip_arr[data_idx],
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
} else {
- const int skip_ctx = av1_get_skip_context(xd);
- inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
- rd_stats.dist,
- rd_stats_y.rate + rd_stats_uv.rate +
- x->skip_cost[skip_ctx][mbmi->skip]);
+ const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred)
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+ if (!txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, &rd_stats,
+ &rd_stats_y, &rd_stats_uv, mode_rate,
+ search_state.best_rd)) {
+ continue;
+ } else if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+ const int skip_ctx = av1_get_skip_context(xd);
+ inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+ rd_stats.dist,
+ rd_stats_y.rate + rd_stats_uv.rate +
+ x->skip_cost[skip_ctx][mbmi->skip]);
+ }
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
}
- rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
if (rd_stats.rdcost < search_state.best_rd) {
search_state.best_rd = rd_stats.rdcost;
@@ -12211,14 +13066,16 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
search_state.best_rate_uv = rd_stats_uv.rate;
memcpy(ctx->blk_skip, x->blk_skip,
sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-#if CONFIG_ONE_PASS_SVM
- av1_copy_reg_stat(rd_cost, &rd_stats_y);
-#endif
}
}
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, do_tx_search_time);
#endif
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ start_timing(cpi, handle_intra_mode_time);
+#endif
for (int j = 0; j < intra_mode_num; ++j) {
const int mode_index = intra_mode_idx_ls[j];
const MV_REFERENCE_FRAME ref_frame =
@@ -12256,11 +13113,11 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
search_state.best_rate_uv = intra_rd_stats_uv.rate;
memcpy(ctx->blk_skip, x->blk_skip,
sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
-#if CONFIG_ONE_PASS_SVM
- av1_copy_reg_stat(rd_cost, &intra_rd_stats_y);
-#endif
}
}
+#if CONFIG_COLLECT_COMPONENT_TIMING
+ end_timing(cpi, handle_intra_mode_time);
+#endif
// In effect only when speed >= 2.
sf_refine_fast_tx_type_search(
@@ -12273,7 +13130,6 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi,
ref_costs_single, &search_state);
}
-
search_state.best_mbmode.skip_mode = 0;
if (cm->current_frame.skip_mode_info.skip_mode_flag &&
!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
@@ -12351,6 +13207,496 @@ void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
}
}
+// TODO(kyslov): This is currently very similar to av1_rd_pick_inter_mode_sb,
+//               except that it only checks non-compound modes, it doesn't
+//               check palette mode, and it doesn't refine the tx search.
+//               This function is likely to be heavily modified with nonrd
+//               mode decision.
+void av1_nonrd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ PREDICTION_MODE this_mode;
+ unsigned char segment_id = mbmi->segment_id;
+ int i;
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ unsigned int ref_costs_single[REF_FRAMES];
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+ int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+ mode_skip_mask_t mode_skip_mask;
+ uint8_t motion_mode_skip_mask = 0; // second pass of single ref modes
+
+ InterModeSearchState search_state;
+ init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
+ best_rd_so_far);
+ INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+ };
+ HandleInterModeArgs args = {
+ { NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+ { NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 },
+ NULL, NULL,
+ NULL, search_state.modelled_rd,
+ { { 0 } }, INT_MAX,
+ INT_MAX, search_state.simple_rd,
+ 0, interintra_modes,
+ 1, NULL
+ };
+ for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+ av1_invalid_rd_stats(rd_cost);
+
+ // Ref frames that are selected by square partition blocks.
+ int picked_ref_frames_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+ mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+    // Don't enable for vert and horz partition blocks if the current frame
+    // will be used as bwd or arf2.
+ if ((!cpi->refresh_bwd_ref_frame && !cpi->refresh_alt2_ref_frame) ||
+ (mbmi->partition != PARTITION_VERT &&
+ mbmi->partition != PARTITION_HORZ)) {
+ picked_ref_frames_mask = fetch_picked_ref_frames_mask(
+ x, bsize, cm->seq_params.mib_size, mi_row, mi_col);
+ }
+ }
+
+  // Skip ref frames that were never selected by square blocks.
+ const int skip_ref_frame_mask =
+ picked_ref_frames_mask ? ~picked_ref_frames_mask : 0;
+
+ // init params, set frame modes, speed features
+ set_params_nonrd_pick_inter_mode(cpi, x, &args, bsize, mi_row, mi_col,
+ &mode_skip_mask, skip_ref_frame_mask,
+ ref_costs_single, ref_costs_comp, yv12_mb);
+
+ int64_t best_est_rd = INT64_MAX;
+ InterModesInfo *inter_modes_info = x->inter_modes_info;
+ inter_modes_info->num = 0;
+
+ int intra_mode_num = 0;
+ int intra_mode_idx_ls[MAX_MODES];
+ int reach_first_comp_mode = 0;
+
+ // Temporary buffers used by handle_inter_mode().
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, x->tmp_obmc_bufs[0]);
+
+ CompoundTypeRdBuffers rd_buffers;
+ alloc_compound_type_rd_buffers(cm, &rd_buffers);
+
+ for (int midx = 0; midx < MAX_MODES; ++midx) {
+ const MODE_DEFINITION *mode_order = &av1_mode_order[midx];
+ this_mode = mode_order->mode;
+ const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
+ const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
+ const int comp_pred = second_ref_frame > INTRA_FRAME;
+
+ if (second_ref_frame != NONE_FRAME) continue;
+
+ // When single ref motion search ends:
+ // 1st pass: To evaluate single ref RD results and rewind to the beginning;
+ // 2nd pass: To continue with compound ref search.
+ if (sf->prune_single_motion_modes_by_simple_trans) {
+ if (comp_pred && args.single_ref_first_pass) {
+ args.single_ref_first_pass = 0;
+ // Reach the first comp ref mode
+ // Reset midx to start the 2nd pass for single ref motion search
+ midx = -1;
+ motion_mode_skip_mask = analyze_simple_trans_states(cpi, x);
+ continue;
+ }
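+      // (Setting midx to -1 before `continue` makes the loop's ++midx land on
+      // 0 again, replaying all single-ref modes as a 2nd pass with the
+      // motion_mode_skip_mask gathered during the 1st pass.)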
+ if (!comp_pred && ref_frame != INTRA_FRAME) { // single ref mode
+ if (args.single_ref_first_pass) {
+ // clear stats
+ for (int k = 0; k < MAX_REF_MV_SERCH; ++k) {
+ x->simple_rd_state[midx][k].rd_stats.rdcost = INT64_MAX;
+ x->simple_rd_state[midx][k].early_skipped = 0;
+ }
+ } else {
+ if (motion_mode_skip_mask & (1 << ref_frame)) {
+ continue;
+ }
+ }
+ }
+ }
+
+ // Reach the first compound prediction mode
+ if (sf->prune_comp_search_by_single_result > 0 && comp_pred &&
+ reach_first_comp_mode == 0) {
+ analyze_single_states(cpi, &search_state);
+ reach_first_comp_mode = 1;
+ }
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int rate2 = 0;
+ int64_t distortion2 = 0;
+ int skippable = 0;
+ int this_skip2 = 0;
+
+ init_mbmi(mbmi, midx, cm);
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ if (inter_mode_compatible_skip(cpi, x, bsize, midx)) continue;
+
+ const int ret = inter_mode_search_order_independent_skip(
+ cpi, x, bsize, midx, mi_row, mi_col, &mode_skip_mask, &search_state,
+ skip_ref_frame_mask);
+ if (ret == 1) continue;
+ args.skip_motion_mode = (ret == 2);
+
+ if (sf->drop_ref && comp_pred) {
+ if (sf_check_is_drop_ref(mode_order, &search_state)) {
+ continue;
+ }
+ }
+
+ if (search_state.best_rd < search_state.mode_threshold[midx]) continue;
+
+ if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
+ if (compound_skip_by_single_states(cpi, &search_state, this_mode,
+ ref_frame, second_ref_frame, x))
+ continue;
+ }
+
+ const int ref_frame_cost = comp_pred
+ ? ref_costs_comp[ref_frame][second_ref_frame]
+ : ref_costs_single[ref_frame];
+ const int compmode_cost =
+ is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0;
+ const int real_compmode_cost =
+ cm->current_frame.reference_mode == REFERENCE_MODE_SELECT
+ ? compmode_cost
+ : 0;
+
+ if (comp_pred) {
+ if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+ search_state.best_mode_index >= 0 &&
+ search_state.best_mbmode.ref_frame[0] == INTRA_FRAME)
+ continue;
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ if (!cpi->oxcf.enable_smooth_intra &&
+ (mbmi->mode == SMOOTH_PRED || mbmi->mode == SMOOTH_H_PRED ||
+ mbmi->mode == SMOOTH_V_PRED))
+ continue;
+ if (!cpi->oxcf.enable_paeth_intra && mbmi->mode == PAETH_PRED) continue;
+ if (sf->adaptive_mode_search > 1)
+ if ((x->source_variance << num_pels_log2_lookup[bsize]) >
+ search_state.best_pred_sse)
+ continue;
+
+ if (this_mode != DC_PRED) {
+ // Only search the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+ if (search_state.best_mode_index >= 0 &&
+ search_state.best_mbmode.ref_frame[0] > INTRA_FRAME)
+ continue;
+ }
+ if (sf->mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(this_mode, search_state.best_intra_mode))
+ continue;
+ }
+ }
+ }
+
+ // Select prediction reference frames.
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ intra_mode_idx_ls[intra_mode_num++] = midx;
+ continue;
+ } else {
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->ref_mv_idx = 0;
+ int64_t ref_best_rd = search_state.best_rd;
+ {
+ RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
+ av1_init_rd_stats(&rd_stats);
+ rd_stats.rate = rate2;
+
+ // Point to variables that are maintained between loop iterations
+ args.single_newmv = search_state.single_newmv;
+ args.single_newmv_rate = search_state.single_newmv_rate;
+ args.single_newmv_valid = search_state.single_newmv_valid;
+ args.single_comp_cost = real_compmode_cost;
+ args.ref_frame_cost = ref_frame_cost;
+ if (midx < MAX_SINGLE_REF_MODES) {
+ args.simple_rd_state = x->simple_rd_state[midx];
+ }
+ this_rd = handle_inter_mode(
+ cpi, tile_data, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv,
+ &disable_skip, mi_row, mi_col, &args, ref_best_rd, tmp_buf,
+ &rd_buffers, &best_est_rd, 0, inter_modes_info);
+ rate2 = rd_stats.rate;
+ skippable = rd_stats.skip;
+ distortion2 = rd_stats.dist;
+ }
+
+ if (sf->prune_comp_search_by_single_result > 0 &&
+ is_inter_singleref_mode(this_mode) && args.single_ref_first_pass) {
+ collect_single_states(x, &search_state, mbmi);
+ }
+
+ if (this_rd == INT64_MAX) continue;
+
+ this_skip2 = mbmi->skip;
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+ }
+
+    // Did this mode help, i.e. is it the new best mode so far?
+ if (this_rd < search_state.best_rd || x->skip) {
+ int mode_excluded = 0;
+ if (comp_pred) {
+ mode_excluded = cm->current_frame.reference_mode == SINGLE_REFERENCE;
+ }
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ search_state.best_mode_index = midx;
+
+ if (ref_frame == INTRA_FRAME) {
+ /* required for left and above block mv */
+ mbmi->mv[0].as_int = 0;
+ } else {
+ search_state.best_pred_sse = x->pred_sse[ref_frame];
+ }
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ search_state.best_rd = this_rd;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = this_skip2;
+ search_state.best_mode_skippable = skippable;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && ref_frame != INTRA_FRAME) {
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cm->current_frame.reference_mode == REFERENCE_MODE_SELECT) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2);
+
+ if (!comp_pred) {
+ if (single_rd < search_state.best_pred_rd[SINGLE_REFERENCE])
+ search_state.best_pred_rd[SINGLE_REFERENCE] = single_rd;
+ } else {
+ if (single_rd < search_state.best_pred_rd[COMPOUND_REFERENCE])
+ search_state.best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+ }
+ if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
+ search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+ }
+ if (sf->drop_ref && second_ref_frame == NONE_FRAME) {
+ // Collect data from single ref mode, and analyze data.
+ sf_drop_ref_analyze(&search_state, mode_order, distortion2);
+ }
+
+ if (x->skip && !comp_pred) break;
+ }
+
+ release_compound_type_rd_buffers(&rd_buffers);
+
+ inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+ search_state.best_rd = INT64_MAX;
+
+ if (inter_modes_info->num > 0) {
+ const int data_idx = inter_modes_info->rd_idx_pair_arr[0].idx;
+ *mbmi = inter_modes_info->mbmi_arr[data_idx];
+ const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ RD_STATS rd_stats;
+ RD_STATS rd_stats_y;
+ RD_STATS rd_stats_uv;
+
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, 0,
+ av1_num_planes(cm) - 1);
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+ if (txfm_search(cpi, tile_data, x, bsize, mi_row, mi_col, &rd_stats,
+ &rd_stats_y, &rd_stats_uv, mode_rate,
+ search_state.best_rd)) {
+ if (cpi->sf.inter_mode_rd_model_estimation == 1) {
+ const int skip_ctx = av1_get_skip_context(xd);
+ inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+ rd_stats.dist,
+ rd_stats_y.rate + rd_stats_uv.rate +
+ x->skip_cost[skip_ctx][mbmi->skip]);
+ }
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+
+ if (rd_stats.rdcost < search_state.best_rd) {
+ search_state.best_rd = rd_stats.rdcost;
+ // Note index of best mode so far
+ const int mode_index = get_prediction_mode_idx(
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ search_state.best_mode_index = mode_index;
+ *rd_cost = rd_stats;
+ search_state.best_rd = rd_stats.rdcost;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = mbmi->skip;
+ search_state.best_mode_skippable = rd_stats.skip;
+ search_state.best_rate_y =
+ rd_stats_y.rate +
+ x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip];
+ search_state.best_rate_uv = rd_stats_uv.rate;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+ }
+
+ for (int j = 0; j < intra_mode_num; ++j) {
+ const int mode_index = intra_mode_idx_ls[j];
+ const MV_REFERENCE_FRAME ref_frame =
+ av1_mode_order[mode_index].ref_frame[0];
+ assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME);
+ assert(ref_frame == INTRA_FRAME);
+ if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break;
+ init_mbmi(mbmi, mode_index, cm);
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME);
+
+ // Select prediction reference frames.
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ }
+
+ RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
+
+ const int ref_frame_cost = ref_costs_single[ref_frame];
+ intra_rd_stats.rdcost = handle_intra_mode(
+ &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0,
+ &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
+ if (intra_rd_stats.rdcost < search_state.best_rd) {
+ search_state.best_rd = intra_rd_stats.rdcost;
+ // Note index of best mode so far
+ search_state.best_mode_index = mode_index;
+ *rd_cost = intra_rd_stats;
+ search_state.best_rd = intra_rd_stats.rdcost;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = 0;
+ search_state.best_mode_skippable = intra_rd_stats.skip;
+ search_state.best_rate_y =
+ intra_rd_stats_y.rate +
+ x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip];
+ search_state.best_rate_uv = intra_rd_stats_uv.rate;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+
+ search_state.best_mbmode.skip_mode = 0;
+ if (cm->current_frame.skip_mode_info.skip_mode_flag &&
+ !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ is_comp_ref_allowed(bsize)) {
+ rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, mi_row, mi_col,
+ yv12_mb);
+ }
+
+ // Make sure that the ref_mv_idx is only nonzero when we're
+ // using a mode which can support ref_mv_idx
+ if (search_state.best_mbmode.ref_mv_idx != 0 &&
+ !(search_state.best_mbmode.mode == NEWMV ||
+ search_state.best_mbmode.mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) {
+ search_state.best_mbmode.ref_mv_idx = 0;
+ }
+
+ if (search_state.best_mode_index < 0 ||
+ search_state.best_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ assert(
+ (cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter ==
+ av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 0)) ||
+ !is_inter_block(&search_state.best_mbmode));
+ assert(
+ (cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter ==
+ av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 1)) ||
+ !is_inter_block(&search_state.best_mbmode));
+
+ if (!cpi->rc.is_src_frame_alt_ref)
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize,
+ search_state.best_mode_index);
+
+ // macroblock modes
+ *mbmi = search_state.best_mbmode;
+ x->skip |= search_state.best_skip2;
+
+ // Note: this section is needed since the mode may have been forced to
+ // GLOBALMV by the all-zero mode handling of ref-mv.
+ if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) {
+ // Correct the interp filters for GLOBALMV
+ if (is_nontrans_global_motion(xd, xd->mi[0])) {
+ assert(mbmi->interp_filters ==
+ av1_broadcast_interp_filter(
+ av1_unswitchable_filter(cm->interp_filter)));
+ }
+ }
+
+ for (i = 0; i < REFERENCE_MODES; ++i) {
+ if (search_state.best_pred_rd[i] == INT64_MAX)
+ search_state.best_pred_diff[i] = INT_MIN;
+ else
+ search_state.best_pred_diff[i] =
+ search_state.best_rd - search_state.best_pred_rd[i];
+ }
+
+ x->skip |= search_state.best_mode_skippable;
+
+ assert(search_state.best_mode_index >= 0);
+
+ store_coding_context(x, ctx, search_state.best_mode_index,
+ search_state.best_pred_diff,
+ search_state.best_mode_skippable);
+}
+
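+// Illustrative call-site sketch (hypothetical; the actual dispatch lives in
+// the partition search code): a caller could route superblocks through the
+// nonrd path when the corresponding speed feature is set, e.g.
+//   if (cpi->sf.use_nonrd_pick_mode)
+//     av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+//                                  bsize, ctx, best_rd_so_far);
+//   else
+//     av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+//                               bsize, ctx, best_rd_so_far);
+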
void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
TileDataEnc *tile_data, MACROBLOCK *x,
int mi_row, int mi_col,
@@ -12494,7 +13840,7 @@ static INLINE void calc_target_weighted_pred_above(
int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_col * MI_SIZE);
int32_t *mask = ctxt->x->mask_buf + (rel_mi_col * MI_SIZE);
const uint8_t *tmp = ctxt->tmp + rel_mi_col * MI_SIZE;
- const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ const int is_hbd = is_cur_buf_hbd(xd);
if (!is_hbd) {
for (int row = 0; row < ctxt->overlap; ++row) {
@@ -12540,7 +13886,7 @@ static INLINE void calc_target_weighted_pred_left(
int32_t *wsrc = ctxt->x->wsrc_buf + (rel_mi_row * MI_SIZE * bw);
int32_t *mask = ctxt->x->mask_buf + (rel_mi_row * MI_SIZE * bw);
const uint8_t *tmp = ctxt->tmp + (rel_mi_row * MI_SIZE * ctxt->tmp_stride);
- const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ const int is_hbd = is_cur_buf_hbd(xd);
if (!is_hbd) {
for (int row = 0; row < nb_mi_height * MI_SIZE; ++row) {
@@ -12622,7 +13968,7 @@ static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
int32_t *mask_buf = x->mask_buf;
int32_t *wsrc_buf = x->wsrc_buf;
- const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ const int is_hbd = is_cur_buf_hbd(xd);
const int src_scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
// plane 0 should not be subsampled
@@ -12741,12 +14087,14 @@ void gaussian_blur(const uint8_t *src, int src_stride, int w, int h,
}
}
-static uint16_t edge_probability(const uint8_t *input, int w, int h,
+static EdgeInfo edge_probability(const uint8_t *input, int w, int h,
bool high_bd, int bd) {
// The probability of an edge in the whole image is the same as the highest
// probability of an edge for any individual pixel. Use Sobel as the metric
// for finding an edge.
uint16_t highest = 0;
+ uint16_t highest_x = 0;
+ uint16_t highest_y = 0;
// Ignore the 1 pixel border around the image for the computation.
for (int j = 1; j < h - 1; ++j) {
for (int i = 1; i < w - 1; ++i) {
@@ -12756,18 +14104,22 @@ static uint16_t edge_probability(const uint8_t *input, int w, int h,
int16_t g_y = g.y >> (bd - 8);
uint16_t magnitude = (uint16_t)sqrt(g_x * g_x + g_y * g_y);
highest = AOMMAX(highest, magnitude);
+ highest_x = AOMMAX(highest_x, g_x);
+ highest_y = AOMMAX(highest_y, g_y);
}
}
- return highest;
+ EdgeInfo ei = { .magnitude = highest, .x = highest_x, .y = highest_y };
+ return ei;
}
/* Uses most of the Canny edge detection algorithm to find if there are any
* edges in the image.
*/
-uint16_t av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
+EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
bool high_bd, int bd) {
if (w < 3 || h < 3) {
- return 0;
+ EdgeInfo n = { .magnitude = 0, .x = 0, .y = 0 };
+ return n;
}
uint8_t *blurred;
if (high_bd) {
@@ -12780,7 +14132,7 @@ uint16_t av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
// want a probability of an edge existing in the buffer, which is determined
// by the strongest edge in it -- we don't need to eliminate the weaker
// edges. Use Sobel for the edge detection.
- uint16_t prob = edge_probability(blurred, w, h, high_bd, bd);
+ EdgeInfo prob = edge_probability(blurred, w, h, high_bd, bd);
if (high_bd) {
aom_free(CONVERT_TO_SHORTPTR(blurred));
} else {
diff --git a/libaom/av1/encoder/rdopt.h b/libaom/av1/encoder/rdopt.h
index 5ff2df3..7ba1b18 100644
--- a/libaom/av1/encoder/rdopt.h
+++ b/libaom/av1/encoder/rdopt.h
@@ -123,18 +123,33 @@ void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+void av1_nonrd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd_so_far);
+
void av1_rd_pick_inter_mode_sb_seg_skip(
const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+// The strongest edge magnitude seen in the block, together with the strongest
+// x and y gradient components seen.
+typedef struct {
+ uint16_t magnitude;
+ uint16_t x;
+ uint16_t y;
+} EdgeInfo;
+
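+// Usage sketch (hypothetical caller, not in the tree): the x field is the
+// strongest horizontal gradient, which indicates a vertical edge, and y the
+// strongest vertical gradient:
+//   EdgeInfo ei = av1_edge_exists(src, stride, w, h, /*high_bd=*/false, 8);
+//   if (ei.magnitude > 0 && ei.x > ei.y) { /* dominant vertical edge */ }
+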
/** Returns the strength of the strongest edge in the image, along with its
 * x and y gradient components (see EdgeInfo above). A magnitude of 0 means no
 * edge found, 556 is the strength of a solid black/white edge,
* and the number may range higher if the signal is even stronger (e.g., on a
* corner). high_bd is a bool indicating the source should be treated
* as a 16-bit array. bd is the bit depth.
*/
-uint16_t av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
+EdgeInfo av1_edge_exists(const uint8_t *src, int src_stride, int w, int h,
bool high_bd, int bd);
/** Applies a Gaussian blur with sigma = 1.3. Used by av1_edge_exists and
@@ -151,10 +166,8 @@ typedef struct {
sobel_xy sobel(const uint8_t *input, int stride, int i, int j, bool high_bd);
-#if CONFIG_COLLECT_INTER_MODE_RD_STATS
void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
-#endif
#ifdef __cplusplus
} // extern "C"
diff --git a/libaom/av1/encoder/reconinter_enc.c b/libaom/av1/encoder/reconinter_enc.c
index 1100222..4b477ce 100644
--- a/libaom/av1/encoder/reconinter_enc.c
+++ b/libaom/av1/encoder/reconinter_enc.c
@@ -138,27 +138,28 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
assert(bw < 8 || bh < 8);
ConvolveParams conv_params = get_conv_params_no_round(
0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
- conv_params.use_jnt_comp_avg = 0;
+ conv_params.use_dist_wtd_comp_avg = 0;
struct buf_2d *const dst_buf = &pd->dst;
uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
ref = 0;
- const RefBuffer *ref_buf =
- &cm->current_frame
- .frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+ const RefCntBuffer *ref_buf =
+ get_ref_frame_buf(cm, this_mbmi->ref_frame[ref]);
+ const struct scale_factors *ref_scale_factors =
+ get_ref_scale_factors_const(cm, this_mbmi->ref_frame[ref]);
- pd->pre[ref].buf0 = (plane == 1) ? ref_buf->buf->buf.u_buffer
- : ref_buf->buf->buf.v_buffer;
+ pd->pre[ref].buf0 =
+ (plane == 1) ? ref_buf->buf.u_buffer : ref_buf->buf.v_buffer;
pd->pre[ref].buf =
- pd->pre[ref].buf0 +
- scaled_buffer_offset(pre_x, pre_y, ref_buf->buf->buf.uv_stride,
- &ref_buf->sf);
- pd->pre[ref].width = ref_buf->buf->buf.uv_crop_width;
- pd->pre[ref].height = ref_buf->buf->buf.uv_crop_height;
- pd->pre[ref].stride = ref_buf->buf->buf.uv_stride;
+ pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+ ref_buf->buf.uv_stride,
+ ref_scale_factors);
+ pd->pre[ref].width = ref_buf->buf.uv_crop_width;
+ pd->pre[ref].height = ref_buf->buf.uv_crop_height;
+ pd->pre[ref].stride = ref_buf->buf.uv_stride;
const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+ is_intrabc ? &cm->sf_identity : ref_scale_factors;
struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
const MV mv = this_mbmi->mv[ref].as_mv;
@@ -195,15 +196,15 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
{
ConvolveParams conv_params = get_conv_params_no_round(
0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
- av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
- &conv_params.bck_offset,
- &conv_params.use_jnt_comp_avg, is_compound);
+ av1_dist_wtd_comp_weight_assign(
+ cm, mi, 0, &conv_params.fwd_offset, &conv_params.bck_offset,
+ &conv_params.use_dist_wtd_comp_avg, is_compound);
struct buf_2d *const dst_buf = &pd->dst;
uint8_t *const dst = dst_buf->buf;
for (ref = 0; ref < 1 + is_compound; ++ref) {
const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+ is_intrabc ? &cm->sf_identity : xd->block_ref_scale_factors[ref];
struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
const MV mv = mi->mv[ref].as_mv;
@@ -236,46 +237,19 @@ static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
}
}
-static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
- MACROBLOCKD *xd, BLOCK_SIZE bsize,
- int mi_row, int mi_col,
- int plane_from, int plane_to) {
- int plane;
+static void build_inter_predictors_for_plane(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int mi_row,
+ int mi_col, const BUFFER_SET *ctx,
+ BLOCK_SIZE bsize, int plane_idx) {
+ const struct macroblockd_plane *pd = &xd->plane[plane_idx];
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ return;
+
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
- for (plane = plane_from; plane <= plane_to; ++plane) {
- const struct macroblockd_plane *pd = &xd->plane[plane];
- const int bw = pd->width;
- const int bh = pd->height;
-
- if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
- pd->subsampling_y))
- continue;
-
- build_inter_predictors(cm, xd, plane, xd->mi[0], 0, bw, bh, mi_x, mi_y);
- }
-}
-
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize) {
- av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, 0);
-}
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize) {
- for (int plane_idx = 1; plane_idx < MAX_MB_PLANE; plane_idx++) {
- av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize,
- plane_idx);
- }
-}
-
-void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize, int plane_idx) {
- build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, plane_idx,
- plane_idx);
+ build_inter_predictors(cm, xd, plane_idx, xd->mi[0], 0, pd->width, pd->height,
+ mi_x, mi_y);
if (is_interintra_pred(xd->mi[0])) {
BUFFER_SET default_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } };
@@ -290,13 +264,14 @@ void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
}
}
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize) {
- const int num_planes = av1_num_planes(cm);
- av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
- if (num_planes > 1)
- av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+ int plane_from, int plane_to) {
+ for (int plane_idx = plane_from; plane_idx <= plane_to; ++plane_idx) {
+ build_inter_predictors_for_plane(cm, xd, mi_row, mi_col, ctx, bsize,
+ plane_idx);
+ }
}
// TODO(sarahparker):
@@ -309,7 +284,7 @@ void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
InterpFilters interp_filters,
const WarpTypesAllowed *warp_types, int p_col,
int p_row, int plane, int ref,
- enum mv_precision precision, int x, int y,
+ mv_precision precision, int x, int y,
const MACROBLOCKD *xd, int can_use_previous) {
const int is_q4 = precision == MV_PRECISION_Q4;
const MV mv_q4 = { is_q4 ? src_mv->row : src_mv->row * 2,
@@ -452,7 +427,7 @@ void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(xd)) {
int len = sizeof(uint16_t);
dst_buf1[0] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0]);
dst_buf1[1] =
@@ -493,7 +468,7 @@ static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
struct macroblockd_plane *const pd = &xd->plane[plane];
const MB_MODE_INFO *mi = xd->mi[0];
- const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ const struct scale_factors *const sf = xd->block_ref_scale_factors[ref];
struct buf_2d *const pre_buf = &pd->pre[ref];
uint8_t *const dst = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
const MV mv = mi->mv[ref].as_mv;
@@ -575,37 +550,41 @@ static void build_wedge_inter_predictor_from_buf(
uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
mbmi->interinter_comp.seg_mask = xd->seg_mask;
const INTERINTER_COMPOUND_DATA *comp_data = &mbmi->interinter_comp;
+ const int is_hbd = is_cur_buf_hbd(xd);
if (is_compound && is_masked_compound_type(comp_data->type)) {
if (!plane && comp_data->type == COMPOUND_DIFFWTD) {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ if (is_hbd) {
av1_build_compound_diffwtd_mask_highbd(
comp_data->seg_mask, comp_data->mask_type,
CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
- else
+ } else {
av1_build_compound_diffwtd_mask(
comp_data->seg_mask, comp_data->mask_type, ext_dst0,
ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
+ }
}
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ if (is_hbd) {
build_masked_compound_highbd(
dst, dst_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
mbmi->sb_type, h, w, xd->bd);
- else
+ } else {
build_masked_compound(dst, dst_buf->stride, ext_dst0, ext_dst_stride0,
ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
h, w);
+ }
} else {
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ if (is_hbd) {
aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
xd->bd);
- else
+ } else {
aom_convolve_copy(ext_dst0, ext_dst_stride0, dst, dst_buf->stride, NULL,
0, NULL, 0, w, h);
+ }
}
}
diff --git a/libaom/av1/encoder/reconinter_enc.h b/libaom/av1/encoder/reconinter_enc.h
index 10d5e8c..5687168 100644
--- a/libaom/av1/encoder/reconinter_enc.h
+++ b/libaom/av1/encoder/reconinter_enc.h
@@ -23,21 +23,10 @@
extern "C" {
#endif
-void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize);
-
-void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize, int plane_idx);
-
-void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col, BUFFER_SET *ctx,
- BLOCK_SIZE bsize);
+void av1_enc_build_inter_predictor(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ const BUFFER_SET *ctx, BLOCK_SIZE bsize,
+ int plane_from, int plane_to);
void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
int dst_stride, const MV *src_mv,
@@ -46,7 +35,7 @@ void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
InterpFilters interp_filters,
const WarpTypesAllowed *warp_types, int p_col,
int p_row, int plane, int ref,
- enum mv_precision precision, int x, int y,
+ mv_precision precision, int x, int y,
const MACROBLOCKD *xd, int can_use_previous);
// Detect if the block have sub-pixel level motion vectors
diff --git a/libaom/av1/encoder/speed_features.c b/libaom/av1/encoder/speed_features.c
index fd0368e..5dfc585 100644
--- a/libaom/av1/encoder/speed_features.c
+++ b/libaom/av1/encoder/speed_features.c
@@ -17,13 +17,9 @@
#include "aom_dsp/aom_dsp_common.h"
-// Setting this to 1 will disable trellis optimization completely.
-// Setting this to 2 will disable trellis optimization within the
-// transform search. Trellis optimization will still be applied
-// in the final encode.
-#define DISABLE_TRELLISQ_SEARCH 0
-
#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method
+// Max speed setting for tx domain evaluation
+#define MAX_TX_DOMAIN_EVAL_SPEED 5
static MESH_PATTERN
good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
{ { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
@@ -50,6 +46,22 @@ static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100,
25, 25, 10 };
+// Threshold values to be used for pruning the txfm_domain_distortion
+// based on block MSE.
+// TODO(any): Experiment with the threshold logic based on a variance metric.
+static unsigned int tx_domain_dist_thresholds[MAX_TX_DOMAIN_EVAL_SPEED + 1] = {
+ UINT_MAX, 162754, 22026, 22026, 22026, 0
+};
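+// (The nonzero thresholds are roughly e^10 (22026) and e^12 (162754), i.e.
+// exponentially spaced MSE cut-offs.)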
+// Threshold values to be used for disabling coeff RD-optimization
+// based on block MSE.
+// TODO(any): Experiment with the threshold logic based on a variance metric.
+static unsigned int coeff_opt_dist_thresholds[5] = { UINT_MAX, 162754, 162754,
+ 22026, 22026 };
+// Scaling values to be used for gating wedge/compound segment based on the
+// best approximate rd.
+static int comp_type_rd_threshold_mul[3] = { 1, 11, 12 };
+static int comp_type_rd_threshold_div[3] = { 3, 16, 16 };
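+// (Each mul/div pair encodes a rational threshold: 1/3, 11/16, and 12/16
+// here. A natural reading is a cross-multiplied comparison against the best
+// approximate rd, avoiding integer division; the exact gate lives where these
+// tables are consumed.)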
+
// Intra only frames, golden frames (except alt ref overlays) and
// alt ref frames tend to be coded at a higher than ambient quality
static int frame_is_boosted(const AV1_COMP *cpi) {
@@ -62,7 +74,7 @@ static int frame_is_boosted(const AV1_COMP *cpi) {
// partly on the screen area over which they propagate. Propagation is
// limited by transform block size but the screen area taken up by a given block
// size will be larger for a small image format stretched to full screen.
-static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) {
+static BLOCK_SIZE set_partition_min_limit(const AV1_COMMON *const cm) {
unsigned int screen_area = (cm->width * cm->height);
// Select block size based on image format size.
@@ -78,24 +90,21 @@ static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) {
}
}
-// Do we have an internal image edge (e.g. formatting bars).
-static int has_internal_image_edge(const AV1_COMP *cpi) {
- return (cpi->oxcf.pass == 2) &&
- ((cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
- (cpi->twopass.this_frame_stats.inactive_zone_cols > 0));
-}
-
-static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
- SPEED_FEATURES *sf,
- int speed) {
- AV1_COMMON *const cm = &cpi->common;
+static void set_good_speed_feature_framesize_dependent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;
const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;
if (is_480p_or_larger) {
sf->use_square_partition_only_threshold = BLOCK_128X128;
+ if (is_720p_or_larger)
+ sf->auto_max_partition_based_on_simple_motion = ADAPT_PRED;
+ else
+ sf->auto_max_partition_based_on_simple_motion = RELAXED_PRED;
} else {
sf->use_square_partition_only_threshold = BLOCK_64X64;
+ sf->auto_max_partition_based_on_simple_motion = DIRECT_PRED;
}
// TODO(huisu@google.com): train models for 720P and above.
@@ -107,6 +116,11 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
}
+ if (is_720p_or_larger && speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL_START &&
+ speed < CONFIG_2PASS_PARTITION_SEARCH_LVL_END) {
+ sf->two_pass_partition_search = 1;
+ }
+
if (speed >= 1) {
if (is_720p_or_larger) {
sf->use_square_partition_only_threshold = BLOCK_128X128;
@@ -122,18 +136,28 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
sf->ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64
sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+
+ sf->firstpass_simple_motion_search_early_term = 1;
}
}
if (speed >= 2) {
if (is_720p_or_larger) {
- sf->disable_split_mask =
- cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ sf->use_square_partition_only_threshold = BLOCK_64X64;
+ } else if (is_480p_or_larger) {
+ sf->use_square_partition_only_threshold = BLOCK_32X32;
+ } else {
+ // TODO(chiyotsai@google.com): Setting the threshold to BLOCK_16X16 incurs
+ // a large loss (about 0.584%). Try increasing the threshold on boosted
+      // frames and see if it improves the performance.
+ sf->use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (is_720p_or_larger) {
sf->adaptive_pred_interp_filter = 0;
sf->partition_search_breakout_dist_thr = (1 << 24);
sf->partition_search_breakout_rate_thr = 120;
} else {
- sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
sf->partition_search_breakout_dist_thr = (1 << 22);
sf->partition_search_breakout_rate_thr = 100;
}
@@ -142,24 +166,15 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
if (speed >= 3) {
if (is_720p_or_larger) {
- sf->disable_split_mask = DISABLE_ALL_SPLIT;
sf->partition_search_breakout_dist_thr = (1 << 25);
sf->partition_search_breakout_rate_thr = 200;
} else {
sf->max_intra_bsize = BLOCK_32X32;
- sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
sf->partition_search_breakout_dist_thr = (1 << 23);
sf->partition_search_breakout_rate_thr = 120;
}
- }
-
- // If this is a two pass clip that fits the criteria for animated or
- // graphics content then reset disable_split_mask for speeds 2+.
- // Also if the image edge is internal to the coded area.
- if ((speed >= 2) && (cpi->oxcf.pass == 2) &&
- ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
- (has_internal_image_edge(cpi)))) {
- sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ sf->use_first_partition_pass_interintra_stats =
+ sf->two_pass_partition_search;
}
if (speed >= 4) {
@@ -168,15 +183,15 @@ static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
} else {
sf->partition_search_breakout_dist_thr = (1 << 24);
}
- sf->disable_split_mask = DISABLE_ALL_SPLIT;
}
}
-static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
- SPEED_FEATURES *sf,
- int speed) {
- AV1_COMMON *const cm = &cpi->common;
+static void set_good_speed_features_framesize_independent(
+ const AV1_COMP *const cpi, SPEED_FEATURES *const sf, int speed) {
+ const AV1_COMMON *const cm = &cpi->common;
const int boosted = frame_is_boosted(cpi);
+ const int is_boosted_arf2_bwd_type =
+ boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame;
// Speed 0 for all speed features that give neutral coding performance change.
sf->reduce_inter_modes = 1;
@@ -184,16 +199,22 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->ml_prune_rect_partition = 1;
sf->ml_prune_ab_partition = 1;
sf->ml_prune_4_partition = 1;
+ sf->simple_motion_search_prune_rect = 1;
sf->adaptive_txb_search_level = 1;
- sf->use_jnt_comp_flag = JNT_COMP_SKIP_MV_SEARCH;
+ sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
sf->model_based_prune_tx_search_level = 1;
sf->model_based_post_interp_filter_breakout = 1;
+ sf->model_based_motion_mode_rd_breakout = 1;
+
+ // TODO(debargha): Test, tweak and turn on either 1 or 2
sf->inter_mode_rd_model_estimation = 1;
+ sf->inter_mode_rd_model_estimation_adaptive = 0;
+
+ sf->two_loop_comp_search = 0;
sf->prune_ref_frame_for_rect_partitions =
- !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame);
- sf->prune_ref_mode_for_partitions = sf->prune_ref_frame_for_rect_partitions;
+ boosted ? 0 : (is_boosted_arf2_bwd_type ? 1 : 2);
sf->less_rectangular_check_level = 1;
- sf->gm_search_type = GM_REDUCED_REF_SEARCH;
+ sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
sf->gm_disable_recode = 1;
sf->use_fast_interpolation_filter_search = 1;
sf->intra_tx_size_search_init_depth_sqr = 1;
@@ -202,28 +223,250 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->prune_wedge_pred_diff_based = 1;
sf->disable_wedge_search_var_thresh = 0;
sf->disable_wedge_search_edge_thresh = 0;
+ sf->prune_motion_mode_level = 1;
+ sf->cb_pred_filter_search = 0;
+ sf->use_nonrd_pick_mode = 0;
+ sf->use_real_time_ref_set = 0;
if (speed >= 1) {
sf->gm_erroradv_type = GM_ERRORADV_TR_1;
sf->selective_ref_frame = 2;
+ sf->intra_tx_size_search_init_depth_rect = 1;
+ sf->tx_size_search_lgr_block = 1;
+
+ sf->prune_ext_partition_types_search_level = 2;
+ sf->skip_repeat_interpolation_filter_search = 1;
+ sf->tx_type_search.skip_tx_search = 1;
+ sf->tx_type_search.ml_tx_split_thresh = 40;
+ sf->model_based_prune_tx_search_level = 0;
+ sf->adaptive_txb_search_level = 2;
+ sf->use_intra_txb_hash = 1;
+ sf->optimize_b_precheck = 1;
+ sf->dual_sgr_penalty_level = 1;
+ sf->use_accurate_subpel_search = USE_4_TAPS;
+ sf->reuse_inter_intra_mode = 1;
+ sf->prune_comp_search_by_single_result = 1;
+ sf->skip_repeated_newmv = 1;
+ sf->obmc_full_pixel_search_level = 1;
+    // TODO(anyone): The following speed feature will be further explored to
+ // identify the appropriate tradeoff between encoder performance and its
+ // speed.
+ sf->prune_single_motion_modes_by_simple_trans = 1;
+
+ sf->simple_motion_search_split_only = 1;
+ sf->simple_motion_search_early_term_none = 1;
+
+ sf->disable_wedge_search_var_thresh = 0;
+ sf->disable_wedge_search_edge_thresh = 0;
+ sf->disable_interinter_wedge_newmv_search = boosted ? 0 : 1;
+ sf->prune_comp_type_by_comp_avg = 1;
+ sf->prune_motion_mode_level = 2;
+ sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
+ sf->cb_pred_filter_search = 1;
+ sf->use_transform_domain_distortion = boosted ? 0 : 1;
+ sf->perform_coeff_opt = boosted ? 0 : 1;
+ sf->use_inter_txb_hash = 0;
+ }
+
+ if (speed >= 2) {
+ sf->gm_erroradv_type = GM_ERRORADV_TR_2;
+
+ sf->selective_ref_frame = 3;
sf->inter_tx_size_search_init_depth_rect = 1;
sf->inter_tx_size_search_init_depth_sqr = 1;
+
+ sf->fast_cdef_search = 1;
+
+ sf->adaptive_rd_thresh = 1;
+ sf->mv.auto_mv_step_size = 1;
+ sf->mv.subpel_iters_per_step = 1;
+ sf->disable_filter_search_var_thresh = 100;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
+
+ sf->partition_search_breakout_rate_thr = 80;
+ sf->allow_partition_search_skip = 1;
+ sf->disable_wedge_search_var_thresh = 100;
+ sf->disable_wedge_search_edge_thresh = 0;
+ sf->disable_interinter_wedge_newmv_search = 1;
+ sf->fast_wedge_sign_estimate = 1;
+ sf->disable_dual_filter = 1;
+ sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+ sf->prune_comp_type_by_comp_avg = 2;
+ // TODO(Sachin): Enable/Enhance this speed feature for speed 2 & 3
+ sf->cb_pred_filter_search = 0;
+ sf->adaptive_interp_filter_search = 1;
+ sf->perform_coeff_opt = boosted ? 0 : 2;
+ }
+
+ if (speed >= 3) {
+ sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
+ sf->less_rectangular_check_level = 2;
+ sf->adaptive_pred_interp_filter = 1;
+ // adaptive_motion_search breaks encoder multi-thread tests.
+ // The values in x->pred_mv[] differ for single and multi-thread cases.
+ // See aomedia:1778.
+ // sf->adaptive_motion_search = 1;
+ sf->recode_loop = ALLOW_RECODE_KFARFGF;
+ sf->use_transform_domain_distortion = boosted ? 1 : 2;
+ sf->use_accurate_subpel_search = USE_2_TAPS;
+ sf->adaptive_rd_thresh = 2;
+ if (cpi->oxcf.enable_smooth_interintra) {
+ sf->disable_smooth_interintra =
+ (boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame)
+ ? 0
+ : 1;
+ }
+ sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
+ sf->gm_search_type = GM_DISABLE_SEARCH;
+ sf->prune_comp_search_by_single_result = 2;
+ sf->prune_motion_mode_level = boosted ? 2 : 3;
+ sf->prune_warp_using_wmtype = 1;
+ // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
+ // it with cpi->sf.disable_wedge_search_var_thresh.
+ sf->disable_wedge_interintra_search = 1;
+ // TODO(any): Experiment with the early exit mechanism for speeds 0, 1 and 2
+ // and clean-up the speed feature
+ sf->perform_best_rd_based_gating_for_chroma = 1;
+ sf->prune_ref_frame_for_rect_partitions =
+ frame_is_intra_only(&cpi->common) ? 0 : (boosted ? 1 : 2);
+ sf->perform_coeff_opt = is_boosted_arf2_bwd_type ? 2 : 3;
+ sf->prune_comp_type_by_model_rd = boosted ? 0 : 1;
+ // TODO(Venkat): Clean-up frame type dependency for
+ // simple_motion_search_split_only in partition search function and set the
+ // speed feature accordingly
+ // TODO(Venkat): Evaluate this speed feature for speed 1 & 2
+ sf->simple_motion_search_split_only =
+ cm->allow_screen_content_tools ? 1 : 2;
+ sf->disable_smooth_intra =
+ !frame_is_intra_only(&cpi->common) || (cpi->rc.frames_to_key != 1);
+ }
+
+ if (speed >= 4) {
+ sf->use_intra_txb_hash = 0;
+ sf->tx_type_search.fast_intra_tx_type_search = 1;
+ sf->disable_loop_restoration_chroma =
+ (boosted || cm->allow_screen_content_tools) ? 0 : 1;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->cb_pred_filter_search = 1;
+ sf->adaptive_mode_search = 1;
+ sf->alt_ref_search_fp = 1;
+ sf->skip_sharp_interp_filter_search = 1;
+ sf->perform_coeff_opt = is_boosted_arf2_bwd_type ? 2 : 4;
+ sf->adaptive_txb_search_level = boosted ? 2 : 3;
+ }
+
+ if (speed >= 5) {
+ sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
+ sf->tx_size_search_method = USE_LARGESTALL;
+ sf->mv.search_method = BIGDIA;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ sf->adaptive_rd_thresh = 4;
+ sf->mode_search_skip_flags =
+ (cm->current_frame.frame_type == KEY_FRAME)
+ ? 0
+ : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
+ FLAG_EARLY_TERMINATE;
+ sf->disable_filter_search_var_thresh = 200;
+ sf->use_fast_coef_costing = 1;
+ sf->partition_search_breakout_rate_thr = 300;
+ sf->use_transform_domain_distortion = 2;
+ }
+
+ if (speed >= 6) {
+ int i;
+ sf->optimize_coefficients = NO_TRELLIS_OPT;
+ sf->mv.search_method = HEX;
+ sf->disable_filter_search_var_thresh = 500;
+ for (i = 0; i < TX_SIZES; ++i) {
+ sf->intra_y_mode_mask[i] = INTRA_DC;
+ sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
+ }
+ sf->partition_search_breakout_rate_thr = 500;
+ sf->mv.reduce_first_step_size = 1;
+ sf->simple_model_rd_from_var = 1;
+ }
+ if (speed >= 7) {
+ sf->default_max_partition_size = BLOCK_32X32;
+ sf->default_min_partition_size = BLOCK_8X8;
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->frame_parameter_update = 0;
+ sf->mv.search_method = FAST_HEX;
+ sf->partition_search_type = REFERENCE_PARTITION;
+ sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ // TODO(any): evaluate adaptive_mode_search=1 for speed 7 & 8
+ sf->adaptive_mode_search = 2;
+ }
+ if (speed >= 8) {
+ sf->mv.search_method = FAST_DIAMOND;
+ sf->mv.subpel_force_stop = HALF_PEL;
+ sf->lpf_pick = LPF_PICK_FROM_Q;
+ }
+}
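
Note the control-flow idiom in the setter above: speed-0 defaults are written first, and each subsequent if (speed >= N) block only tightens or overrides what lower speeds set, so a given speed level inherits everything below it. A minimal sketch of that ladder (all names invented; only the shape matches the function above):

/* Toy illustration of the cumulative speed-ladder idiom. */
typedef struct {
  int prune_level;
  int search_method;
} TOY_SPEED_FEATURES;

static void set_toy_speed_features(TOY_SPEED_FEATURES *sf, int speed,
                                   int boosted) {
  sf->prune_level = 0; /* speed-0 defaults */
  sf->search_method = 0;
  if (speed >= 1) sf->prune_level = 1;                /* inherited by 2+ */
  if (speed >= 2) sf->prune_level = boosted ? 1 : 2;  /* override, not add */
  if (speed >= 3) sf->search_method = 1;
}
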
+
+// TODO(kyslov): this is currently very similar to
+// set_good_speed_features_framesize_independent, except that it sets the
+// non-RD flag at speed 8. This function will likely be modified in the
+// future with RT-specific speed features
+static void set_rt_speed_features_framesize_independent(AV1_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int boosted = frame_is_boosted(cpi);
+
+ // Speed 0 for all speed features that give a neutral coding performance change.
+ sf->reduce_inter_modes = 1;
+ sf->prune_ext_partition_types_search_level = 1;
+ sf->ml_prune_rect_partition = 1;
+ sf->ml_prune_ab_partition = 1;
+ sf->ml_prune_4_partition = 1;
+ sf->adaptive_txb_search_level = 1;
+ sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_SKIP_MV_SEARCH;
+ sf->model_based_prune_tx_search_level = 1;
+ sf->model_based_post_interp_filter_breakout = 1;
+ sf->model_based_motion_mode_rd_breakout = 1;
+
+ // TODO(debargha): Test, tweak and turn on either 1 or 2
+ sf->inter_mode_rd_model_estimation = 0;
+ sf->inter_mode_rd_model_estimation_adaptive = 0;
+ sf->two_loop_comp_search = 0;
+
+ sf->prune_ref_frame_for_rect_partitions = !boosted;
+ sf->less_rectangular_check_level = 1;
+ sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3;
+ sf->gm_disable_recode = 1;
+ sf->use_fast_interpolation_filter_search = 1;
+ sf->intra_tx_size_search_init_depth_sqr = 1;
+ sf->intra_angle_estimation = 1;
+ sf->selective_ref_frame = 1;
+ sf->prune_wedge_pred_diff_based = 1;
+ sf->disable_wedge_search_var_thresh = 0;
+ sf->disable_wedge_search_edge_thresh = 0;
+ sf->prune_motion_mode_level = 1;
+ sf->cb_pred_filter_search = 0;
+ sf->use_nonrd_pick_mode = 0;
+ sf->use_real_time_ref_set = 0;
+
+ if (speed >= 1) {
+ sf->gm_erroradv_type = GM_ERRORADV_TR_1;
+ sf->selective_ref_frame = 2;
+
sf->intra_tx_size_search_init_depth_rect = 1;
sf->tx_size_search_lgr_block = 1;
- if (speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL) {
- sf->two_pass_partition_search = 1;
- sf->mode_pruning_based_on_two_pass_partition_search = 1;
- }
sf->prune_ext_partition_types_search_level = 2;
sf->skip_repeat_interpolation_filter_search = 1;
sf->tx_type_search.skip_tx_search = 1;
sf->tx_type_search.ml_tx_split_thresh = 40;
sf->model_based_prune_tx_search_level = 0;
- sf->model_based_post_interp_filter_breakout = 0;
- // TODO(angiebird): Re-evaluate the impact of inter_mode_rd_model_estimation
- // on speed 1
- sf->inter_mode_rd_model_estimation = 0;
sf->adaptive_txb_search_level = 2;
sf->use_intra_txb_hash = 1;
sf->optimize_b_precheck = 1;
@@ -238,15 +481,23 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
// speed.
sf->prune_single_motion_modes_by_simple_trans = 1;
- sf->full_pixel_motion_search_based_split = 1;
+ sf->simple_motion_search_prune_rect = 1;
+
sf->disable_wedge_search_var_thresh = 0;
sf->disable_wedge_search_edge_thresh = 0;
+ sf->prune_comp_type_by_comp_avg = 1;
+ sf->prune_motion_mode_level = 2;
+ sf->gm_search_type = GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2;
+ sf->cb_pred_filter_search = 1;
+ sf->use_transform_domain_distortion = boosted ? 0 : 1;
}
if (speed >= 2) {
sf->gm_erroradv_type = GM_ERRORADV_TR_2;
sf->selective_ref_frame = 3;
+ sf->inter_tx_size_search_init_depth_rect = 1;
+ sf->inter_tx_size_search_init_depth_sqr = 1;
sf->fast_cdef_search = 1;
sf->adaptive_rd_thresh = 1;
@@ -256,18 +507,19 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
sf->partition_search_breakout_rate_thr = 80;
- // Note: This speed feature is disable as it seems to be worse in
- // compression/quality and is also slower.
- // sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
sf->allow_partition_search_skip = 1;
sf->disable_wedge_search_var_thresh = 100;
sf->disable_wedge_search_edge_thresh = 0;
sf->fast_wedge_sign_estimate = 1;
sf->disable_dual_filter = 1;
- sf->use_jnt_comp_flag = JNT_COMP_DISABLED;
+ sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_DISABLED;
+ sf->prune_comp_type_by_comp_avg = 2;
+ sf->cb_pred_filter_search = 0;
+ sf->adaptive_interp_filter_search = 1;
}
if (speed >= 3) {
+ sf->selective_ref_frame = 4;
sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
sf->less_rectangular_check_level = 2;
sf->adaptive_pred_interp_filter = 1;
@@ -282,22 +534,23 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
sf->gm_search_type = GM_DISABLE_SEARCH;
sf->prune_comp_search_by_single_result = 2;
+ sf->prune_motion_mode_level = boosted ? 2 : 3;
+ sf->prune_warp_using_wmtype = 1;
+ // TODO(yunqing): evaluate this speed feature for speed 1 & 2, and combine
+ // it with cpi->sf.disable_wedge_search_var_thresh.
+ sf->disable_wedge_interintra_search = 1;
}
if (speed >= 4) {
sf->use_intra_txb_hash = 0;
- sf->use_inter_txb_hash = 0;
sf->use_mb_rd_hash = 0;
sf->tx_type_search.fast_intra_tx_type_search = 1;
sf->tx_type_search.fast_inter_tx_type_search = 1;
- sf->use_square_partition_only_threshold =
- boosted ? BLOCK_128X128 : BLOCK_4X4;
sf->tx_size_search_method =
frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 1;
- sf->cb_partition_search = !boosted;
sf->alt_ref_search_fp = 1;
sf->skip_sharp_interp_filter_search = 1;
}
@@ -310,7 +563,6 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
- sf->use_square_partition_only_threshold = BLOCK_4X4;
sf->tx_size_search_method = USE_LARGESTALL;
sf->mv.search_method = BIGDIA;
sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
@@ -352,30 +604,25 @@ static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
}
if (speed >= 8) {
sf->mv.search_method = FAST_DIAMOND;
- sf->mv.subpel_force_stop = 2;
- sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
+ sf->lpf_pick = LPF_PICK_FROM_Q;
+ sf->default_max_partition_size = BLOCK_128X128;
+ sf->default_min_partition_size = BLOCK_8X8;
+ sf->partition_search_type = VAR_BASED_PARTITION;
+ sf->use_real_time_ref_set = 1;
+ // Can't use the LARGESTALL tx-size search with a pre-calculated partition
+ // when TX64 is disabled.
+ if (!cpi->oxcf.enable_tx64) sf->tx_size_search_method = USE_FAST_RD;
+ sf->use_nonrd_pick_mode = 1;
+ sf->inter_mode_rd_model_estimation = 2;
}
}
-void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
+void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi, int speed) {
SPEED_FEATURES *const sf = &cpi->sf;
const AV1EncoderConfig *const oxcf = &cpi->oxcf;
- RD_OPT *const rd = &cpi->rd;
- int i;
if (oxcf->mode == GOOD) {
- set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
- }
-
- if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
- sf->adaptive_pred_interp_filter = 0;
- }
-
- // Check for masked out split cases.
- for (i = 0; i < MAX_REFS; ++i) {
- if (sf->disable_split_mask & (1 << i)) {
- rd->thresh_mult_sub8x8[i] = INT_MAX;
- }
+ set_good_speed_feature_framesize_dependent(cpi, sf, speed);
}
// This is only used in motion vector unit test.
@@ -385,7 +632,7 @@ void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
}
-void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
+void av1_set_speed_features_framesize_independent(AV1_COMP *cpi, int speed) {
AV1_COMMON *const cm = &cpi->common;
SPEED_FEATURES *const sf = &cpi->sf;
MACROBLOCK *const x = &cpi->td.mb;
@@ -398,25 +645,33 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->recode_loop = ALLOW_RECODE;
sf->mv.subpel_search_method = SUBPEL_TREE;
sf->mv.subpel_iters_per_step = 2;
- sf->mv.subpel_force_stop = 0;
-#if DISABLE_TRELLISQ_SEARCH == 2
- sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
- ? FINAL_PASS_TRELLIS_OPT
- : NO_TRELLIS_OPT;
-#elif DISABLE_TRELLISQ_SEARCH == 1
- sf->optimize_coefficients = NO_TRELLIS_OPT;
-#else
- if (is_lossless_requested(&cpi->oxcf))
+ sf->mv.subpel_force_stop = EIGHTH_PEL;
+ if (cpi->oxcf.disable_trellis_quant == 3) {
+ sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+ ? NO_ESTIMATE_YRD_TRELLIS_OPT
+ : NO_TRELLIS_OPT;
+ } else if (cpi->oxcf.disable_trellis_quant == 2) {
+ sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+ ? FINAL_PASS_TRELLIS_OPT
+ : NO_TRELLIS_OPT;
+ } else if (cpi->oxcf.disable_trellis_quant == 0) {
+ if (is_lossless_requested(&cpi->oxcf))
+ sf->optimize_coefficients = NO_TRELLIS_OPT;
+ else
+ sf->optimize_coefficients = FULL_TRELLIS_OPT;
+ } else if (cpi->oxcf.disable_trellis_quant == 1) {
sf->optimize_coefficients = NO_TRELLIS_OPT;
- else
- sf->optimize_coefficients = FULL_TRELLIS_OPT;
-#endif // DISABLE_TRELLISQ_SEARCH
+ } else {
+ assert(0 && "Invalid disable_trellis_quant value");
+ }
sf->gm_erroradv_type = GM_ERRORADV_TR_0;
sf->mv.reduce_first_step_size = 0;
sf->mv.auto_mv_step_size = 0;
sf->comp_inter_joint_search_thresh = BLOCK_4X4;
sf->adaptive_rd_thresh = 0;
- sf->tx_size_search_method = USE_FULL_RD;
+ // TODO(sarahparker): Pair this with a speed setting once experiments are done
+ sf->trellis_eob_fast = 0;
+ sf->tx_size_search_method = cpi->oxcf.tx_size_search_method;
sf->inter_tx_size_search_init_depth_sqr = 0;
sf->inter_tx_size_search_init_depth_rect = 0;
sf->intra_tx_size_search_init_depth_rect = 0;
@@ -424,12 +679,12 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->tx_size_search_lgr_block = 0;
sf->model_based_prune_tx_search_level = 0;
sf->model_based_post_interp_filter_breakout = 0;
+ sf->model_based_motion_mode_rd_breakout = 0;
sf->reduce_inter_modes = 0;
sf->selective_ref_gm = 1;
sf->adaptive_motion_search = 0;
sf->adaptive_pred_interp_filter = 0;
sf->adaptive_mode_search = 0;
- sf->cb_partition_search = 0;
sf->alt_ref_search_fp = 0;
sf->partition_search_type = SEARCH_PARTITION;
sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE;
@@ -442,19 +697,20 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->less_rectangular_check_level = 0;
sf->use_square_partition_only_threshold = BLOCK_128X128;
sf->prune_ref_frame_for_rect_partitions = 0;
- sf->prune_ref_mode_for_partitions = 0;
- sf->auto_min_max_partition_size = NOT_IN_USE;
+ sf->auto_max_partition_based_on_simple_motion = NOT_IN_USE;
+ sf->auto_min_partition_based_on_simple_motion = 0;
sf->rd_auto_partition_min_limit = BLOCK_4X4;
sf->default_max_partition_size = BLOCK_LARGEST;
sf->default_min_partition_size = BLOCK_4X4;
sf->adjust_partitioning_from_last_frame = 0;
- sf->disable_split_mask = 0;
sf->mode_search_skip_flags = 0;
sf->disable_filter_search_var_thresh = 0;
sf->allow_partition_search_skip = 0;
sf->use_accurate_subpel_search = USE_8_TAPS;
sf->disable_wedge_search_edge_thresh = 0;
+ sf->use_first_partition_pass_interintra_stats = 0;
sf->disable_wedge_search_var_thresh = 0;
+ sf->disable_loop_restoration_chroma = 0;
sf->fast_wedge_sign_estimate = 0;
sf->prune_wedge_pred_diff_based = 0;
sf->drop_ref = 0;
@@ -462,17 +718,19 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
sf->txb_split_cap = 1;
sf->adaptive_txb_search_level = 0;
sf->two_pass_partition_search = 0;
- sf->mode_pruning_based_on_two_pass_partition_search = 0;
+ sf->firstpass_simple_motion_search_early_term = 0;
sf->use_intra_txb_hash = 0;
sf->use_inter_txb_hash = 1;
sf->use_mb_rd_hash = 1;
sf->optimize_b_precheck = 0;
- sf->jnt_comp_fast_tx_search = 0;
- sf->use_jnt_comp_flag = JNT_COMP_ENABLED;
+ sf->two_loop_comp_search = 1;
+ sf->second_loop_comp_fast_tx_search = 0;
+ sf->use_dist_wtd_comp_flag = DIST_WTD_COMP_ENABLED;
sf->reuse_inter_intra_mode = 0;
sf->intra_angle_estimation = 0;
sf->skip_obmc_in_uniform_mv_field = 0;
sf->skip_wm_in_uniform_mv_field = 0;
+ sf->adaptive_interp_filter_search = 0;
for (i = 0; i < TX_SIZES; i++) {
sf->intra_y_mode_mask[i] = INTRA_ALL;
@@ -497,7 +755,9 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
for (i = 0; i < PARTITION_BLOCK_SIZES; ++i) {
sf->ml_partition_search_breakout_thresh[i] = -1; // -1 means not enabled.
}
- sf->full_pixel_motion_search_based_split = 0;
+ sf->simple_motion_search_split_only = 0;
+ sf->simple_motion_search_prune_rect = 0;
+ sf->simple_motion_search_early_term_none = 0;
// Set this at the appropriate speed levels
sf->use_transform_domain_distortion = 0;
@@ -514,12 +774,29 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
// Set decoder side speed feature to use fewer dual sgr modes
sf->dual_sgr_penalty_level = 0;
+ // TODO(angiebird, debargha): Re-evaluate the impact of
+ // inter_mode_rd_model_estimation in conjunction with
+ // model_based_motion_mode_rd_breakout
sf->inter_mode_rd_model_estimation = 0;
+ sf->inter_mode_rd_model_estimation_adaptive = 0;
+
sf->obmc_full_pixel_search_level = 0;
sf->skip_sharp_interp_filter_search = 0;
+ sf->prune_comp_type_by_comp_avg = 0;
+ sf->disable_interinter_wedge_newmv_search = 0;
+ sf->disable_smooth_interintra = 0;
+ sf->prune_motion_mode_level = 0;
+ sf->prune_warp_using_wmtype = 0;
+ sf->disable_wedge_interintra_search = 0;
+ sf->perform_coeff_opt = 0;
+ sf->prune_comp_type_by_model_rd = 0;
+ sf->disable_smooth_intra = 0;
+ sf->perform_best_rd_based_gating_for_chroma = 0;
if (oxcf->mode == GOOD)
- set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed);
+ set_good_speed_features_framesize_independent(cpi, sf, speed);
+ else if (oxcf->mode == REALTIME)
+ set_rt_speed_features_framesize_independent(cpi, sf, speed);
if (!cpi->seq_params_locked) {
cpi->common.seq_params.enable_dual_filter &= !sf->disable_dual_filter;
@@ -534,39 +811,44 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
cpi->diamond_search_sad = av1_diamond_search_sad;
sf->allow_exhaustive_searches = 1;
- int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+
+ const int mesh_speed = AOMMIN(speed, MAX_MESH_SPEED);
if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
sf->exhaustive_searches_thresh = (1 << 24);
else
sf->exhaustive_searches_thresh = (1 << 25);
- sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
- if (speed > 0)
+ sf->max_exaustive_pct = good_quality_max_mesh_pct[mesh_speed];
+ if (mesh_speed > 0)
sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
for (i = 0; i < MAX_MESH_STEP; ++i) {
- sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range;
+ sf->mesh_patterns[i].range =
+ good_quality_mesh_patterns[mesh_speed][i].range;
sf->mesh_patterns[i].interval =
- good_quality_mesh_patterns[speed][i].interval;
+ good_quality_mesh_patterns[mesh_speed][i].interval;
}
if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) &&
(cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
cpi->oxcf.content == AOM_CONTENT_SCREEN)) {
for (i = 0; i < MAX_MESH_STEP; ++i) {
- sf->mesh_patterns[i].range = intrabc_mesh_patterns[speed][i].range;
- sf->mesh_patterns[i].interval = intrabc_mesh_patterns[speed][i].interval;
+ sf->mesh_patterns[i].range = intrabc_mesh_patterns[mesh_speed][i].range;
+ sf->mesh_patterns[i].interval =
+ intrabc_mesh_patterns[mesh_speed][i].interval;
}
- sf->max_exaustive_pct = intrabc_max_mesh_pct[speed];
+ sf->max_exaustive_pct = intrabc_max_mesh_pct[mesh_speed];
}
// Slow quant, dct and trellis are not worthwhile for the first pass,
// so make sure they are always turned off.
if (oxcf->pass == 1) sf->optimize_coefficients = NO_TRELLIS_OPT;
- // No recode for 1 pass.
+ // No recode or trellis for 1 pass.
if (oxcf->pass == 0) {
sf->recode_loop = DISALLOW_RECODE;
sf->optimize_coefficients = NO_TRELLIS_OPT;
}
+ // FIXME: trellis not very efficient for quantization matrices
+ if (oxcf->using_qm) sf->optimize_coefficients = NO_TRELLIS_OPT;
if (sf->mv.subpel_search_method == SUBPEL_TREE) {
cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree;
@@ -578,12 +860,6 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore;
}
- cpi->optimize_speed_feature =
- oxcf->pass != 1 ? sf->optimize_coefficients : NO_TRELLIS_OPT;
- // FIXME: trellis not very efficient for quantisation matrices
- if (cm->using_qmatrix) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
- if (oxcf->disable_trellis_quant) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
-
x->min_partition_size = sf->default_min_partition_size;
x->max_partition_size = sf->default_max_partition_size;
@@ -592,6 +868,17 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
else if (cpi->oxcf.motion_vector_unit_test == 2)
cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+ cpi->max_comp_type_rd_threshold_mul =
+ comp_type_rd_threshold_mul[sf->prune_comp_type_by_comp_avg];
+ cpi->max_comp_type_rd_threshold_div =
+ comp_type_rd_threshold_div[sf->prune_comp_type_by_comp_avg];
+ const int tx_domain_speed = AOMMIN(speed, MAX_TX_DOMAIN_EVAL_SPEED);
+ cpi->tx_domain_dist_threshold = tx_domain_dist_thresholds[tx_domain_speed];
+
+ // assert ensures that coeff_opt_dist_thresholds is accessed correctly
+ assert(cpi->sf.perform_coeff_opt >= 0 && cpi->sf.perform_coeff_opt < 5);
+ cpi->coeff_opt_dist_threshold =
+ coeff_opt_dist_thresholds[cpi->sf.perform_coeff_opt];
#if CONFIG_DIST_8X8
if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
@@ -600,6 +887,9 @@ void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
#endif // CONFIG_DIST_8X8
if (cpi->oxcf.row_mt == 1 && (cpi->oxcf.max_threads > 1)) {
sf->adaptive_rd_thresh = 0;
- sf->inter_mode_rd_model_estimation = 0;
+ if (sf->inter_mode_rd_model_estimation == 1) {
+ sf->inter_mode_rd_model_estimation = 0;
+ sf->inter_mode_rd_model_estimation_adaptive = 0;
+ }
}
}
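
The disable_trellis_quant handling above replaces the old compile-time DISABLE_TRELLISQ_SEARCH switch with a runtime option. A compact restatement of the mapping (a sketch only; it assumes the TRELLIS_OPT_TYPE values introduced later in this diff and folds the lossless check into one flag):

#include <assert.h>

/* Sketch: disable_trellis_quant -> TRELLIS_OPT_TYPE, mirroring the
 * if/else chain in av1_set_speed_features_framesize_independent(). */
static TRELLIS_OPT_TYPE trellis_opt_from_option(int disable_trellis_quant,
                                                int lossless) {
  if (lossless) return NO_TRELLIS_OPT;
  switch (disable_trellis_quant) {
    case 0: return FULL_TRELLIS_OPT;
    case 1: return NO_TRELLIS_OPT;
    case 2: return FINAL_PASS_TRELLIS_OPT;
    case 3: return NO_ESTIMATE_YRD_TRELLIS_OPT;
    default: assert(0 && "Invalid disable_trellis_quant value");
  }
  return NO_TRELLIS_OPT;
}
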
diff --git a/libaom/av1/encoder/speed_features.h b/libaom/av1/encoder/speed_features.h
index f71dcbf..a321192 100644
--- a/libaom/av1/encoder/speed_features.h
+++ b/libaom/av1/encoder/speed_features.h
@@ -73,7 +73,7 @@ enum {
(1 << THR_ALTR) | (1 << THR_GOLD)
};
-typedef enum {
+enum {
TXFM_CODING_SF = 1,
INTER_PRED_SF = 2,
INTRA_PRED_SF = 4,
@@ -82,9 +82,9 @@ typedef enum {
RD_SKIP_SF = 32,
RESERVE_2_SF = 64,
RESERVE_3_SF = 128,
-} DEV_SPEED_FEATURES;
+} UENUM1BYTE(DEV_SPEED_FEATURES);
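The typedef-to-UENUM1BYTE conversions throughout this hunk shrink every enum-typed speed-feature field to a single byte. The macro is defined elsewhere in libaom and is not part of this diff; my understanding (an assumption) is that it closes the anonymous enum and re-types the name as uint8_t, roughly:

/* Assumed expansion of UENUM1BYTE (not shown in this diff): */
#define UENUM1BYTE(enumvar) \
  ;                         \
  typedef uint8_t enumvar

/* So "enum { ... } UENUM1BYTE(RECODE_LOOP_TYPE);" becomes
 *    "enum { ... }; typedef uint8_t RECODE_LOOP_TYPE;"
 * keeping the enumerator constants while storing values in one byte. */
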
-typedef enum {
+enum {
DIAMOND = 0,
NSTEP = 1,
HEX = 2,
@@ -92,9 +92,9 @@ typedef enum {
SQUARE = 4,
FAST_HEX = 5,
FAST_DIAMOND = 6
-} SEARCH_METHODS;
+} UENUM1BYTE(SEARCH_METHODS);
-typedef enum {
+enum {
// No recode.
DISALLOW_RECODE = 0,
// Allow recode for KF and exceeding maximum frame bandwidth.
@@ -103,28 +103,23 @@ typedef enum {
ALLOW_RECODE_KFARFGF = 2,
// Allow recode for all frames based on bitrate constraints.
ALLOW_RECODE = 3,
-} RECODE_LOOP_TYPE;
+} UENUM1BYTE(RECODE_LOOP_TYPE);
-typedef enum {
+enum {
SUBPEL_TREE = 0,
SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches
SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively
SUBPEL_TREE_PRUNED_EVENMORE = 3, // Prunes 1/2- and 1/4-pel searches
// Other methods to come
-} SUBPEL_SEARCH_METHODS;
+} UENUM1BYTE(SUBPEL_SEARCH_METHODS);
-typedef enum {
+enum {
USE_FULL_RD = 0,
USE_FAST_RD,
USE_LARGESTALL,
-} TX_SIZE_SEARCH_METHOD;
-
-typedef enum {
- NOT_IN_USE = 0,
- RELAXED_NEIGHBORING_MIN_MAX = 1
-} AUTO_MIN_MAX_MODE;
+} UENUM1BYTE(TX_SIZE_SEARCH_METHOD);
-typedef enum {
+enum {
// Try the full image with different values.
LPF_PICK_FROM_FULL_IMAGE,
// Try a small portion of the image with different values.
@@ -133,9 +128,9 @@ typedef enum {
LPF_PICK_FROM_Q,
// Pick 0 to disable LPF if LPF was enabled last frame
LPF_PICK_MINIMAL_LPF
-} LPF_PICK_METHOD;
+} UENUM1BYTE(LPF_PICK_METHOD);
-typedef enum {
+enum {
// Terminate search early based on distortion so far compared to
// qp step, distortion in the neighborhood of the frame, etc.
FLAG_EARLY_TERMINATE = 1 << 0,
@@ -152,9 +147,9 @@ typedef enum {
// Skips intra modes other than DC_PRED if the source variance is small
FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
-} MODE_SEARCH_SKIP_LOGIC;
+} UENUM1BYTE(MODE_SEARCH_SKIP_LOGIC);
-typedef enum {
+enum {
NO_PRUNE = 0,
// eliminates one tx type in vertical and horizontal direction
PRUNE_ONE = 1,
@@ -165,7 +160,7 @@ typedef enum {
PRUNE_2D_ACCURATE = 3,
// similar, but applies much more aggressive pruning to get better speed-up
PRUNE_2D_FAST = 4,
-} TX_TYPE_PRUNE_MODE;
+} UENUM1BYTE(TX_TYPE_PRUNE_MODE);
typedef struct {
TX_TYPE_PRUNE_MODE prune_mode;
@@ -184,15 +179,31 @@ typedef struct {
int skip_tx_search;
} TX_TYPE_SEARCH;
-typedef enum {
+enum {
// Search partitions using RD criterion
SEARCH_PARTITION,
// Always use a fixed size partition
FIXED_PARTITION,
- REFERENCE_PARTITION
-} PARTITION_SEARCH_TYPE;
+ REFERENCE_PARTITION,
+
+ VAR_BASED_PARTITION
+} UENUM1BYTE(PARTITION_SEARCH_TYPE);
+
+enum {
+ EIGHTH_PEL,
+ QUARTER_PEL,
+ HALF_PEL,
+ FULL_PEL
+} UENUM1BYTE(SUBPEL_FORCE_STOP);
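SUBPEL_FORCE_STOP replaces the old plain-int subpel_force_stop field (see the MV_SPEED_FEATURES change below). A plausible reading of the levels, in 1/8-pel step units (an assumption inferred from the names, not from code in this diff):

/* Hypothetical helper: finest motion-vector step searched per stop level. */
static int subpel_step_eighth_pel_units(SUBPEL_FORCE_STOP stop) {
  switch (stop) {
    case EIGHTH_PEL: return 1;   /* refine down to 1/8 pel */
    case QUARTER_PEL: return 2;  /* stop after the 1/4-pel stage */
    case HALF_PEL: return 4;     /* stop after the 1/2-pel stage */
    case FULL_PEL: return 8;     /* integer-pel only */
    default: return 1;
  }
}
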
+
+enum {
+ NOT_IN_USE,
+ DIRECT_PRED,
+ RELAXED_PRED,
+ ADAPT_PRED
+} UENUM1BYTE(MAX_PART_PRED_MODE);
typedef struct MV_SPEED_FEATURES {
// Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
@@ -215,8 +226,8 @@ typedef struct MV_SPEED_FEATURES {
// Maximum number of steps in logarithmic subpel search before giving up.
int subpel_iters_per_step;
- // Control when to stop subpel search
- int subpel_force_stop;
+ // When to stop subpel search.
+ SUBPEL_FORCE_STOP subpel_force_stop;
} MV_SPEED_FEATURES;
#define MAX_MESH_STEP 4
@@ -226,35 +237,43 @@ typedef struct MESH_PATTERN {
int interval;
} MESH_PATTERN;
-typedef enum {
+enum {
GM_FULL_SEARCH,
- GM_REDUCED_REF_SEARCH,
+ GM_REDUCED_REF_SEARCH_SKIP_L2_L3,
+ GM_REDUCED_REF_SEARCH_SKIP_L2_L3_ARF2,
GM_DISABLE_SEARCH
-} GM_SEARCH_TYPE;
+} UENUM1BYTE(GM_SEARCH_TYPE);
-typedef enum {
+enum {
GM_ERRORADV_TR_0,
GM_ERRORADV_TR_1,
GM_ERRORADV_TR_2,
GM_ERRORADV_TR_TYPES,
-} GM_ERRORADV_TYPE;
+} UENUM1BYTE(GM_ERRORADV_TYPE);
-typedef enum {
- NO_TRELLIS_OPT, // No trellis optimization
- FULL_TRELLIS_OPT, // Trellis optimization in all stages
- FINAL_PASS_TRELLIS_OPT // Trellis optimization in only the final encode pass
-} TRELLIS_OPT_TYPE;
+enum {
+ NO_TRELLIS_OPT, // No trellis optimization
+ FULL_TRELLIS_OPT, // Trellis optimization in all stages
+ FINAL_PASS_TRELLIS_OPT, // Trellis optimization in only the final encode pass
+ NO_ESTIMATE_YRD_TRELLIS_OPT // Disable trellis in estimate_yrd_for_sb
+} UENUM1BYTE(TRELLIS_OPT_TYPE);
-typedef enum {
+enum {
FULL_TXFM_RD,
LOW_TXFM_RD,
-} TXFM_RD_MODEL;
+} UENUM1BYTE(TXFM_RD_MODEL);
+
+enum {
+ DIST_WTD_COMP_ENABLED,
+ DIST_WTD_COMP_SKIP_MV_SEARCH,
+ DIST_WTD_COMP_DISABLED,
+} UENUM1BYTE(DIST_WTD_COMP_FLAG);
typedef enum {
- JNT_COMP_ENABLED,
- JNT_COMP_SKIP_MV_SEARCH,
- JNT_COMP_DISABLED,
-} JNT_COMP_FLAG;
+ FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP_REGULAR,
+ FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
+ FLAG_SKIP_EIGHTTAP_SHARP = 1 << MULTITAP_SHARP,
+} INTERP_FILTER_MASK;
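INTERP_FILTER_MASK is a bitmask keyed by libaom's InterpFilter enum values, paired with the new interp_filter_search_mask field added to SPEED_FEATURES below. A usage sketch (hypothetical; the real gating code is elsewhere and not part of this diff):

/* Hypothetical gate inside an interpolation-filter search loop. */
for (InterpFilter filter = EIGHTTAP_REGULAR; filter <= MULTITAP_SHARP;
     ++filter) {
  if (sf->adaptive_interp_filter_search &&
      (sf->interp_filter_search_mask & (1 << filter)))
    continue; /* this filter type is masked out */
  /* ... evaluate rd cost for `filter` ... */
}
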
typedef struct SPEED_FEATURES {
MV_SPEED_FEATURES mv;
@@ -335,11 +354,16 @@ typedef struct SPEED_FEATURES {
// 1: use model based rd breakout
int model_based_post_interp_filter_breakout;
+ // Model based breakout in motion_mode_rd
+ // 0: no breakout
+ // 1: use model based rd breakout
+ int model_based_motion_mode_rd_breakout;
+
// Used if partition_search_type = FIXED_SIZE_PARTITION
BLOCK_SIZE always_this_block_size;
// Drop less likely to be picked reference frames in the RD search.
- // Has four levels for now: 0, 1, 2 and 3, where higher levels prune more
+ // Has five levels for now: 0, 1, 2, 3 and 4, where higher levels prune more
// aggressively than lower ones. (0 means no pruning).
int selective_ref_frame;
@@ -351,6 +375,10 @@ typedef struct SPEED_FEATURES {
// Use a ML model to prune horz and vert partitions
int ml_prune_rect_partition;
+ // Disable/Enable interintra motion mode based on stats collected during
+ // first_partition_search_pass
+ int use_first_partition_pass_interintra_stats;
+
// Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
int ml_prune_ab_partition;
@@ -359,12 +387,13 @@ typedef struct SPEED_FEATURES {
int fast_cdef_search;
- // 2-pass coding block partition search
+ // 2-pass coding block partition search, and also use the mode decisions made
+ // in the initial partition search to prune mode candidates, e.g. ref frames.
int two_pass_partition_search;
- // Use the mode decisions made in the initial partition search to prune mode
- // candidates, e.g. ref frames.
- int mode_pruning_based_on_two_pass_partition_search;
+ // Terminate early in the first pass of the two-pass partition search, for
+ // a faster first pass.
+ int firstpass_simple_motion_search_early_term;
// Skip rectangular partition test when partition type none gives better
// rd than partition type split. Can take values 0 - 2, 0 referring to no
@@ -375,14 +404,17 @@ typedef struct SPEED_FEATURES {
BLOCK_SIZE use_square_partition_only_threshold;
// Prune reference frames for rectangular partitions.
+ // 0 implies no pruning
+ // 1 implies prune for extended partition
+ // 2 implies prune horiz, vert and extended partition
int prune_ref_frame_for_rect_partitions;
- // Prune ref/mode choices for partitions.
- int prune_ref_mode_for_partitions;
+ // Sets min and max square partition levels for this superblock based on
+ // motion vector and prediction error distribution produced from 16x16
+ // simple motion search
+ MAX_PART_PRED_MODE auto_max_partition_based_on_simple_motion;
+ int auto_min_partition_based_on_simple_motion;
- // Sets min and max partition sizes for this superblock based on the
- // same superblock in last encoded frame, and the left and above neighbor.
- AUTO_MIN_MAX_MODE auto_min_max_partition_size;
// Ensures the rd based auto partition search will always
// go down at least to the specified level.
BLOCK_SIZE rd_auto_partition_min_limit;
@@ -396,11 +428,6 @@ typedef struct SPEED_FEATURES {
// frame's partitioning. Only used if use_lastframe_partitioning is set.
int adjust_partitioning_from_last_frame;
- // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
- // it always, to allow it for only Last frame and Intra, disable it for all
- // inter modes or to enable it always.
- int disable_split_mask;
-
// TODO(jingning): combine the related motion search speed features
// This allows us to use motion search at other sizes as a starting
// point for this motion search and limits the search range around it.
@@ -427,8 +454,6 @@ typedef struct SPEED_FEATURES {
// Adaptive prediction mode search
int adaptive_mode_search;
- int cb_partition_search;
-
int alt_ref_search_fp;
// Implements various heuristics to skip searching modes
@@ -541,18 +566,26 @@ typedef struct SPEED_FEATURES {
// Calculate RD cost before doing optimize_b, and skip if the cost is large.
int optimize_b_precheck;
- // Use model rd instead of transform search in jnt_comp
- int jnt_comp_fast_tx_search;
+ // Use two-loop compound search
+ int two_loop_comp_search;
+
+ // Use model rd instead of transform search in second loop of compound search
+ int second_loop_comp_fast_tx_search;
// Decide when and how to use joint_comp.
- JNT_COMP_FLAG use_jnt_comp_flag;
+ DIST_WTD_COMP_FLAG use_dist_wtd_comp_flag;
// Decoder side speed feature to add penalty for use of dual-sgr filters.
// Takes values 0 - 10, 0 indicating no penalty and each additional level
// adding a penalty of 1%
int dual_sgr_penalty_level;
- // Dynamically estimate final rd from prediction error and mode cost
+ // 2-pass inter mode model estimation where the preliminary pass skips
+ // transform search and uses a model to estimate rd, while the final pass
+ // computes the full transform search. Two types of models are supported:
+ // 0: not used
+ // 1: used with online dynamic rd model
+ // 2: used with static rd model
int inter_mode_rd_model_estimation;
// Skip some ref frames in compound motion search by single motion search
@@ -581,24 +614,95 @@ typedef struct SPEED_FEATURES {
// Prune intra mode candidates based on source block gradient stats.
int intra_angle_estimation;
- // Performs full pixel motion search before none_partition to decide if we
- // want to split directly without trying other partition types.
- int full_pixel_motion_search_based_split;
-
// Skip obmc or warped motion mode when neighborhood motion field is
// identical
int skip_obmc_in_uniform_mv_field;
int skip_wm_in_uniform_mv_field;
+ // Enable/disable ME for interinter wedge search.
+ int disable_interinter_wedge_newmv_search;
+
+ // Enable/disable smooth inter-intra mode
+ int disable_smooth_interintra;
+
// skip sharp_filter evaluation based on regular and smooth filter rd for
// dual_filter=0 case
int skip_sharp_interp_filter_search;
+
+ // prune wedge and compound segment approximate rd evaluation based on
+ // compound average rd/ref_best_rd
+ int prune_comp_type_by_comp_avg;
+
+ // Prune/gate motion mode evaluation based on token based rd
+ // during transform search for inter blocks
+ // Values are 0 (not used), 1 - 3 with progressively increasing
+ // aggressiveness.
+ int prune_motion_mode_level;
+
+ // Gate warp evaluation for motions of type IDENTITY,
+ // TRANSLATION and AFFINE(based on number of warp neighbors)
+ int prune_warp_using_wmtype;
+
+ // Perform simple_motion_search on each possible subblock and use it to prune
+ // PARTITION_HORZ and PARTITION_VERT.
+ int simple_motion_search_prune_rect;
+
+ // Perform simple motion search before none_partition to decide if we
+ // want to split directly without trying other partition types.
+ int simple_motion_search_split_only;
+
+ // Use features from simple_motion_search to terminate prediction block
+ // partition after PARTITION_NONE
+ int simple_motion_search_early_term_none;
+
+ int cb_pred_filter_search;
+
+ // Adaptive interp_filter search to allow skipping certain filter types.
+ int adaptive_interp_filter_search;
+
+ // Mask to skip evaluation of certain interp_filter types.
+ INTERP_FILTER_MASK interp_filter_search_mask;
+
+ // Flag used to control the ref_best_rd based gating for chroma
+ int perform_best_rd_based_gating_for_chroma;
+
+ // Enable/disable interintra wedge search.
+ int disable_wedge_interintra_search;
+
+ // Disable loop restoration for Chroma plane
+ int disable_loop_restoration_chroma;
+
+ // Flag used to control the extent of coeff R-D optimization
+ int perform_coeff_opt;
+
+ // Flag used to control the speed of the eob selection in trellis.
+ int trellis_eob_fast;
+
+ // This flag controls the use of non-RD mode decision.
+ int use_nonrd_pick_mode;
+
+ // prune wedge and compound segment approximate rd evaluation based on
+ // compound average modeled rd
+ int prune_comp_type_by_model_rd;
+
+ // Enable/disable smooth intra modes.
+ int disable_smooth_intra;
+
+ // use reduced ref set for real-time mode
+ int use_real_time_ref_set;
+
+ // Perform a full TX search on some modes while using the
+ // inter-mode RD model for others. Only enabled when
+ // inter_mode_rd_model_estimation != 0
+ int inter_mode_rd_model_estimation_adaptive;
} SPEED_FEATURES;
struct AV1_COMP;
-void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi);
-void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi);
+void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi,
+ int speed);
+void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi,
+ int speed);
#ifdef __cplusplus
} // extern "C"
diff --git a/libaom/av1/encoder/temporal_filter.c b/libaom/av1/encoder/temporal_filter.c
index ace585e..ba883d7 100644
--- a/libaom/av1/encoder/temporal_filter.c
+++ b/libaom/av1/encoder/temporal_filter.c
@@ -37,13 +37,22 @@
#define EDGE_THRESHOLD 50
#define SQRT_PI_BY_2 1.25331413732
+static unsigned int index_mult[14] = {
+ 0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124
+};
+
+static int64_t highbd_index_mult[14] = { 0U, 0U, 0U,
+ 0U, 3221225472U, 2576980378U,
+ 2147483648U, 1840700270U, 1610612736U,
+ 1431655766U, 1288490189U, 1171354718U,
+ 0U, 991146300U };
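
These tables are fixed-point reciprocals: each nonzero entry appears to be ceil(3 * 2^16 / index) (or ceil(3 * 2^32 / index) in the high-bit-depth table), which lets the new mod_index()/highbd_mod_index() below replace the old "(sum_dist * 3) / index" division with a multiply and shift. A quick derivation sketch (my reconstruction, not libaom code):

#include <stdint.h>

/* Reconstruct the tables above; entries 0-3 and 12 are unused (index never
 * takes those values, as the index_mult[index] != 0 asserts suggest). */
static unsigned int recip16(int index) { /* ceil(3 * 2^16 / index) */
  return (unsigned int)(((3ULL << 16) + index - 1) / index);
}

static int64_t recip32(int index) { /* ceil(3 * 2^32 / index) */
  return (int64_t)(((3ULL << 32) + index - 1) / index);
}
/* e.g. recip16(6) == 32768, so (sum_dist * 32768) >> 16 == sum_dist / 2,
 * exactly the old (sum_dist * 3) / 6. */
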
+
static void temporal_filter_predictors_mb_c(
MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
uint8_t *pred, struct scale_factors *scale, int x, int y,
- int can_use_previous, int num_planes) {
- const MV mv = { mv_row, mv_col };
- enum mv_precision mv_precision_uv;
+ int can_use_previous, int num_planes, MV *blk_mvs, int use_32x32) {
+ mv_precision mv_precision_uv;
int uv_stride;
// TODO(angiebird): change plane setting accordingly
ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
@@ -52,33 +61,146 @@ static void temporal_filter_predictors_mb_c(
WarpTypesAllowed warp_types;
memset(&warp_types, 0, sizeof(WarpTypesAllowed));
- if (uv_block_width == 8) {
+ const int ssx = (uv_block_width == (BW >> 1)) ? 1 : 0;
+ if (ssx) {
uv_stride = (stride + 1) >> 1;
mv_precision_uv = MV_PRECISION_Q4;
} else {
uv_stride = stride;
mv_precision_uv = MV_PRECISION_Q3;
}
- av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
- &conv_params, interp_filters, &warp_types, x, y, 0,
- 0, MV_PRECISION_Q3, x, y, xd, can_use_previous);
+ if (use_32x32) {
+ assert(mv_row >= INT16_MIN && mv_row <= INT16_MAX && mv_col >= INT16_MIN &&
+ mv_col <= INT16_MAX);
+ const MV mv = { (int16_t)mv_row, (int16_t)mv_col };
+
+ av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], BW, &mv, scale, BW,
+ BH, &conv_params, interp_filters, &warp_types, x,
+ y, 0, 0, MV_PRECISION_Q3, x, y, xd,
+ can_use_previous);
+ if (num_planes > 1) {
+ av1_build_inter_predictor(
+ u_mb_ptr, uv_stride, &pred[BLK_PELS], uv_block_width, &mv, scale,
+ uv_block_width, uv_block_height, &conv_params, interp_filters,
+ &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous);
+ av1_build_inter_predictor(
+ v_mb_ptr, uv_stride, &pred[(BLK_PELS << 1)], uv_block_width, &mv,
+ scale, uv_block_width, uv_block_height, &conv_params, interp_filters,
+ &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous);
+ }
+
+ return;
+ }
+
+ // When use_32x32 = 0, construct the 32x32 predictor using four 16x16
+ // predictors.
+ int i, j, k = 0, ys = (BH >> 1), xs = (BW >> 1);
+ // Y predictor
+ for (i = 0; i < BH; i += ys) {
+ for (j = 0; j < BW; j += xs) {
+ const MV mv = blk_mvs[k];
+ const int y_offset = i * stride + j;
+ const int p_offset = i * BW + j;
+
+ av1_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset],
+ BW, &mv, scale, xs, ys, &conv_params,
+ interp_filters, &warp_types, x, y, 0, 0,
+ MV_PRECISION_Q3, x, y, xd, can_use_previous);
+ k++;
+ }
+ }
+
+ // U and V predictors
if (num_planes > 1) {
- av1_build_inter_predictor(
- u_mb_ptr, uv_stride, &pred[256], uv_block_width, &mv, scale,
- uv_block_width, uv_block_height, &conv_params, interp_filters,
- &warp_types, x, y, 1, 0, mv_precision_uv, x, y, xd, can_use_previous);
-
- av1_build_inter_predictor(
- v_mb_ptr, uv_stride, &pred[512], uv_block_width, &mv, scale,
- uv_block_width, uv_block_height, &conv_params, interp_filters,
- &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd, can_use_previous);
+ ys = (uv_block_height >> 1);
+ xs = (uv_block_width >> 1);
+ k = 0;
+
+ for (i = 0; i < uv_block_height; i += ys) {
+ for (j = 0; j < uv_block_width; j += xs) {
+ const MV mv = blk_mvs[k];
+ const int uv_offset = i * uv_stride + j;
+ const int p_offset = i * uv_block_width + j;
+
+ av1_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride,
+ &pred[BLK_PELS + p_offset], uv_block_width,
+ &mv, scale, xs, ys, &conv_params,
+ interp_filters, &warp_types, x, y, 1, 0,
+ mv_precision_uv, x, y, xd, can_use_previous);
+ av1_build_inter_predictor(
+ v_mb_ptr + uv_offset, uv_stride, &pred[(BLK_PELS << 1) + p_offset],
+ uv_block_width, &mv, scale, xs, ys, &conv_params, interp_filters,
+ &warp_types, x, y, 2, 0, mv_precision_uv, x, y, xd,
+ can_use_previous);
+ k++;
+ }
+ }
}
}
-static INLINE int64_t mod_index(int64_t sum_dist, int index, int rounding,
- int strength, int filter_weight) {
- int64_t mod = (sum_dist * 3) / index;
+static void apply_temporal_filter_self(const uint8_t *pred, int buf_stride,
+ unsigned int block_width,
+ unsigned int block_height,
+ int filter_weight, uint32_t *accumulator,
+ uint16_t *count) {
+ const int modifier = filter_weight * 16;
+ unsigned int i, j, k = 0;
+ assert(filter_weight == 2);
+
+ for (i = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++) {
+ const int pixel_value = pred[i * buf_stride + j];
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+ ++k;
+ }
+ }
+}
+
+static void highbd_apply_temporal_filter_self(
+ const uint8_t *pred8, int buf_stride, unsigned int block_width,
+ unsigned int block_height, int filter_weight, uint32_t *accumulator,
+ uint16_t *count) {
+ const int modifier = filter_weight * 16;
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ unsigned int i, j, k = 0;
+ assert(filter_weight == 2);
+
+ for (i = 0; i < block_height; i++) {
+ for (j = 0; j < block_width; j++) {
+ const int pixel_value = pred[i * buf_stride + j];
+ count[k] += modifier;
+ accumulator[k] += modifier * pixel_value;
+ ++k;
+ }
+ }
+}
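
Both self-filter helpers above credit the alt-ref block itself with the maximum weight (filter_weight 2, i.e. a modifier of 32 per pixel) before any other frame is blended in. Once every frame has been accumulated, the filtered pixel is presumably the rounded weighted mean; a sketch of that final step (an assumption: the actual normalization happens later in temporal_filter_iterate_c and is not shown in this hunk):

/* Hypothetical normalization once all frames are accumulated. */
dst[k] = (uint8_t)((accumulator[k] + (count[k] >> 1)) / count[k]);
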
+
+static INLINE int mod_index(int sum_dist, int index, int rounding, int strength,
+ int filter_weight) {
+ assert(index >= 0 && index <= 13);
+ assert(index_mult[index] != 0);
+
+ int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+ mod += rounding;
+ mod >>= strength;
+
+ mod = AOMMIN(16, mod);
+
+ mod = 16 - mod;
+ mod *= filter_weight;
+
+ return mod;
+}
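
Worked example for mod_index(): with sum_dist = 3000, index = 6, strength = 6 (so rounding = 32), and filter_weight = 2, the fixed-point multiply gives (3000 * 32768) >> 16 = 1500, identical to the old (3000 * 3) / 6; adding the rounding and shifting by strength yields (1500 + 32) >> 6 = 23, which clamps to 16, so the result is (16 - 16) * 2 = 0 and the pixel contributes nothing. A closer match, sum_dist = 300, gives (150 + 32) >> 6 = 2 and a weight of (16 - 2) * 2 = 28: smaller distortion, larger blend weight.
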
+
+static INLINE int highbd_mod_index(int64_t sum_dist, int index, int rounding,
+ int strength, int filter_weight) {
+ assert(index >= 0 && index <= 13);
+ assert(highbd_index_mult[index] != 0);
+
+ int mod =
+ (int)((AOMMIN(sum_dist, INT32_MAX) * highbd_index_mult[index]) >> 32);
mod += rounding;
mod >>= strength;
@@ -106,12 +228,35 @@ static INLINE void calculate_squared_errors(const uint8_t *s, int s_stride,
}
}
-static void apply_temporal_filter(
+static INLINE int get_filter_weight(unsigned int i, unsigned int j,
+ unsigned int block_height,
+ unsigned int block_width, const int *blk_fw,
+ int use_32x32) {
+ if (use_32x32)
+ // blk_fw[0] ~ blk_fw[3] are the same.
+ return blk_fw[0];
+
+ int filter_weight = 0;
+ if (i < block_height / 2) {
+ if (j < block_width / 2)
+ filter_weight = blk_fw[0];
+ else
+ filter_weight = blk_fw[1];
+ } else {
+ if (j < block_width / 2)
+ filter_weight = blk_fw[2];
+ else
+ filter_weight = blk_fw[3];
+ }
+ return filter_weight;
+}
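
get_filter_weight() maps a pixel (i, j) to one of four quadrant weights, with blk_fw laid out as {top-left, top-right, bottom-left, bottom-right}. A usage sketch (hypothetical values, assuming a 32x32 block):

const int blk_fw[4] = { 2, 1, 0, 2 }; /* TL, TR, BL, BR */
/* i = 20 >= 32/2 and j = 5 < 32/2 select the bottom-left quadrant: */
int w = get_filter_weight(20, 5, /*block_height=*/32, /*block_width=*/32,
                          blk_fw, /*use_32x32=*/0); /* w == blk_fw[2] == 0 */
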
+
+void av1_apply_temporal_filter_c(
const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred,
int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1,
int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred,
int uv_buf_stride, unsigned int block_width, unsigned int block_height,
- int ss_x, int ss_y, int strength, int filter_weight,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) {
unsigned int i, j, k, m;
@@ -119,20 +264,17 @@ static void apply_temporal_filter(
const int rounding = (1 << strength) >> 1;
const unsigned int uv_block_width = block_width >> ss_x;
const unsigned int uv_block_height = block_height >> ss_y;
- DECLARE_ALIGNED(16, uint16_t, y_diff_sse[256]);
- DECLARE_ALIGNED(16, uint16_t, u_diff_sse[256]);
- DECLARE_ALIGNED(16, uint16_t, v_diff_sse[256]);
+ DECLARE_ALIGNED(16, uint16_t, y_diff_sse[BLK_PELS]);
+ DECLARE_ALIGNED(16, uint16_t, u_diff_sse[BLK_PELS]);
+ DECLARE_ALIGNED(16, uint16_t, v_diff_sse[BLK_PELS]);
int idx = 0, idy;
- assert(filter_weight >= 0);
- assert(filter_weight <= 2);
-
- memset(y_diff_sse, 0, 256 * sizeof(uint16_t));
- memset(u_diff_sse, 0, 256 * sizeof(uint16_t));
- memset(v_diff_sse, 0, 256 * sizeof(uint16_t));
+ memset(y_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
+ memset(u_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
+ memset(v_diff_sse, 0, BLK_PELS * sizeof(uint16_t));
- // Calculate diff^2 for each pixel of the 16x16 block.
+ // Calculate diff^2 for each pixel of the block.
// TODO(yunqing): the following code needs to be optimized.
calculate_squared_errors(y_frame1, y_stride, y_pred, y_buf_stride, y_diff_sse,
block_width, block_height);
@@ -144,6 +286,8 @@ static void apply_temporal_filter(
for (i = 0, k = 0, m = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++) {
const int pixel_value = y_pred[i * y_buf_stride + j];
+ int filter_weight =
+ get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
// non-local mean approach
int y_index = 0;
@@ -249,22 +393,22 @@ static INLINE void highbd_calculate_squared_errors(
}
}
-static void highbd_apply_temporal_filter(
+void av1_highbd_apply_temporal_filter_c(
const uint8_t *yf, int y_stride, const uint8_t *yp, int y_buf_stride,
const uint8_t *uf, const uint8_t *vf, int uv_stride, const uint8_t *up,
const uint8_t *vp, int uv_buf_stride, unsigned int block_width,
unsigned int block_height, int ss_x, int ss_y, int strength,
- int filter_weight, uint32_t *y_accumulator, uint16_t *y_count,
- uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator,
- uint16_t *v_count) {
+ const int *blk_fw, int use_32x32, uint32_t *y_accumulator,
+ uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count,
+ uint32_t *v_accumulator, uint16_t *v_count) {
unsigned int i, j, k, m;
int64_t modifier;
const int rounding = (1 << strength) >> 1;
const unsigned int uv_block_width = block_width >> ss_x;
const unsigned int uv_block_height = block_height >> ss_y;
- DECLARE_ALIGNED(16, uint32_t, y_diff_sse[256]);
- DECLARE_ALIGNED(16, uint32_t, u_diff_sse[256]);
- DECLARE_ALIGNED(16, uint32_t, v_diff_sse[256]);
+ DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]);
+ DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]);
+ DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]);
const uint16_t *y_frame1 = CONVERT_TO_SHORTPTR(yf);
const uint16_t *u_frame1 = CONVERT_TO_SHORTPTR(uf);
@@ -274,14 +418,11 @@ static void highbd_apply_temporal_filter(
const uint16_t *v_pred = CONVERT_TO_SHORTPTR(vp);
int idx = 0, idy;
- assert(filter_weight >= 0);
- assert(filter_weight <= 2);
-
- memset(y_diff_sse, 0, 256 * sizeof(uint32_t));
- memset(u_diff_sse, 0, 256 * sizeof(uint32_t));
- memset(v_diff_sse, 0, 256 * sizeof(uint32_t));
+ memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+ memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
+ memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t));
- // Calculate diff^2 for each pixel of the 16x16 block.
+ // Calculate diff^2 for each pixel of the block.
// TODO(yunqing): the following code needs to be optimized.
highbd_calculate_squared_errors(y_frame1, y_stride, y_pred, y_buf_stride,
y_diff_sse, block_width, block_height);
@@ -293,6 +434,8 @@ static void highbd_apply_temporal_filter(
for (i = 0, k = 0, m = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++) {
const int pixel_value = y_pred[i * y_buf_stride + j];
+ int filter_weight =
+ get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
// non-local mean approach
int y_index = 0;
@@ -321,11 +464,11 @@ static void highbd_apply_temporal_filter(
y_index += 2;
- modifier =
- mod_index(modifier, y_index, rounding, strength, filter_weight);
+ const int final_y_mod = highbd_mod_index(modifier, y_index, rounding,
+ strength, filter_weight);
- y_count[k] += modifier;
- y_accumulator[k] += modifier * pixel_value;
+ y_count[k] += final_y_mod;
+ y_accumulator[k] += final_y_mod * pixel_value;
++k;
@@ -367,13 +510,15 @@ static void highbd_apply_temporal_filter(
u_mod += y_diff;
v_mod += y_diff;
- u_mod = mod_index(u_mod, cr_index, rounding, strength, filter_weight);
- v_mod = mod_index(v_mod, cr_index, rounding, strength, filter_weight);
+ const int final_u_mod = highbd_mod_index(u_mod, cr_index, rounding,
+ strength, filter_weight);
+ const int final_v_mod = highbd_mod_index(v_mod, cr_index, rounding,
+ strength, filter_weight);
- u_count[m] += u_mod;
- u_accumulator[m] += u_mod * u_pixel_value;
- v_count[m] += v_mod;
- v_accumulator[m] += v_mod * v_pixel_value;
+ u_count[m] += final_u_mod;
+ u_accumulator[m] += final_u_mod * u_pixel_value;
+ v_count[m] += final_v_mod;
+ v_accumulator[m] += final_v_mod * v_pixel_value;
++m;
} // Complete YUV pixel
@@ -385,8 +530,8 @@ static void highbd_apply_temporal_filter(
void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
uint8_t *frame2, unsigned int block_width,
unsigned int block_height, int strength,
- int filter_weight, unsigned int *accumulator,
- uint16_t *count) {
+ const int *blk_fw, int use_32x32,
+ unsigned int *accumulator, uint16_t *count) {
unsigned int i, j, k;
int modifier;
int byte = 0;
@@ -395,6 +540,8 @@ void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
for (i = 0, k = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++, k++) {
int pixel_value = *frame2;
+ int filter_weight =
+ get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
// non-local mean approach
int diff_sse[9] = { 0 };
@@ -447,7 +594,7 @@ void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
void av1_highbd_temporal_filter_apply_c(
uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8,
unsigned int block_width, unsigned int block_height, int strength,
- int filter_weight, unsigned int *accumulator, uint16_t *count) {
+ int *blk_fw, int use_32x32, unsigned int *accumulator, uint16_t *count) {
uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
unsigned int i, j, k;
@@ -458,6 +605,8 @@ void av1_highbd_temporal_filter_apply_c(
for (i = 0, k = 0; i < block_height; i++) {
for (j = 0; j < block_width; j++, k++) {
int pixel_value = *frame2;
+ int filter_weight =
+ get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32);
// non-local mean approach
int diff_sse[9] = { 0 };
@@ -509,8 +658,8 @@ void av1_highbd_temporal_filter_apply_c(
static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
uint8_t *arf_frame_buf,
uint8_t *frame_ptr_buf,
- int stride, int x_pos,
- int y_pos) {
+ int stride, int x_pos, int y_pos,
+ MV *blk_mvs, int *blk_bestsme) {
MACROBLOCK *const x = &cpi->td.mb;
MACROBLOCKD *const xd = &x->e_mbd;
const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
@@ -543,9 +692,12 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
- av1_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
- NSTEP, 1, sadpb, cond_cost_list(cpi, cost_list),
- &best_ref_mv1, 0, 0, x_pos, y_pos, 0);
+ // av1_full_pixel_search() parameters: best_ref_mv1_full is the start mv, and
+ // best_ref_mv1 is for mv rate calculation. The search result is stored in
+ // x->best_mv.
+ av1_full_pixel_search(cpi, x, TF_BLOCK, &best_ref_mv1_full, step_param, NSTEP,
+ 1, sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1,
+ 0, 0, x_pos, y_pos, 0, &cpi->ss_cfg[SS_CFG_LOOKAHEAD]);
x->mv_limits = tmp_mv_limits;
// Ignore mv costing by sending NULL pointer instead of cost array
@@ -559,19 +711,64 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
x->best_mv.as_mv.row *= 8;
x->best_mv.as_mv.col *= 8;
- bestsme = cpi->fn_ptr[BLOCK_16X16].vf(y + offset, y_stride, src_address,
- src_stride, &sse);
- } else {
- bestsme = cpi->find_fractional_mv_step(
- x, &cpi->common, 0, 0, &best_ref_mv1,
- cpi->common.allow_high_precision_mv, x->errorperbit,
- &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step,
- cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
- NULL, 0, 0, 16, 16, USE_8_TAPS, 1);
+ bestsme = cpi->fn_ptr[TF_BLOCK].vf(y + offset, y_stride, src_address,
+ src_stride, &sse);
+
+ x->e_mbd.mi[0]->mv[0] = x->best_mv;
+
+ // Restore input state
+ x->plane[0].src = src;
+ xd->plane[0].pre[0] = pre;
+
+ return bestsme;
}
+ // find_fractional_mv_step parameters: best_ref_mv1 is for mv rate cost
+ // calculation. The start full mv and the search result are stored in
+ // x->best_mv. mi_row and mi_col are only needed for "av1_is_scaled(sf)=1"
+ // case.
+ bestsme = cpi->find_fractional_mv_step(
+ x, &cpi->common, 0, 0, &best_ref_mv1, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[TF_BLOCK], 0, mv_sf->subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL,
+ 0, 0, BW, BH, USE_8_TAPS, 1);
+
x->e_mbd.mi[0]->mv[0] = x->best_mv;
+ // Do motion search on the four 16x16 sub-blocks.

+ int i, j, k = 0;
+ best_ref_mv1.row = x->e_mbd.mi[0]->mv[0].as_mv.row;
+ best_ref_mv1.col = x->e_mbd.mi[0]->mv[0].as_mv.col;
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ for (i = 0; i < BH; i += SUB_BH) {
+ for (j = 0; j < BW; j += SUB_BW) {
+ // Setup frame pointers
+ x->plane[0].src.buf = arf_frame_buf + i * stride + j;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = frame_ptr_buf + i * stride + j;
+ xd->plane[0].pre[0].stride = stride;
+
+ av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+ av1_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full,
+ step_param, NSTEP, 1, sadpb,
+ cond_cost_list(cpi, cost_list), &best_ref_mv1, 0, 0,
+ x_pos, y_pos, 0, &cpi->ss_cfg[SS_CFG_LOOKAHEAD]);
+ x->mv_limits = tmp_mv_limits;
+
+ blk_bestsme[k] = cpi->find_fractional_mv_step(
+ x, &cpi->common, 0, 0, &best_ref_mv1,
+ cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[TF_SUB_BLOCK], 0, mv_sf->subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
+ NULL, 0, 0, SUB_BW, SUB_BH, USE_8_TAPS, 1);
+
+ blk_mvs[k] = x->best_mv.as_mv;
+ k++;
+ }
+ }
+
// Restore input state
x->plane[0].src = src;
xd->plane[0].pre[0] = pre;
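
The restructured search above is two-stage: one 32x32 full-pel plus sub-pel search whose result then seeds four 16x16 sub-block searches, with the 32x32 MV converted back to full-pel units for each start point (the best_ref_mv1_full assignments). A sketch of just the seeding relation (assuming BW = BH = 32 and SUB_BW = SUB_BH = 16, as the names suggest):

/* Stage 1 result, in 1/8-pel units: */
MV seed = x->best_mv.as_mv;
/* Stage 2 start point for each 16x16 sub-block, in full-pel units: */
MV start = { seed.row >> 3, seed.col >> 3 };
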
@@ -582,39 +779,42 @@ static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
static void temporal_filter_iterate_c(AV1_COMP *cpi,
YV12_BUFFER_CONFIG **frames,
int frame_count, int alt_ref_index,
- int strength, RefBuffer *ref_buf) {
+ int strength,
+ struct scale_factors *ref_scale_factors) {
const AV1_COMMON *cm = &cpi->common;
const int num_planes = av1_num_planes(cm);
int byte;
int frame;
int mb_col, mb_row;
- unsigned int filter_weight;
- int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;
- int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
+ int mb_cols = (frames[alt_ref_index]->y_crop_width + BW - 1) >> BW_LOG2;
+ int mb_rows = (frames[alt_ref_index]->y_crop_height + BH - 1) >> BH_LOG2;
int mb_y_offset = 0;
+ int mb_y_src_offset = 0;
int mb_uv_offset = 0;
- DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
- DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
+ int mb_uv_src_offset = 0;
+ DECLARE_ALIGNED(16, unsigned int, accumulator[BLK_PELS * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]);
MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
uint8_t *dst1, *dst2;
- DECLARE_ALIGNED(32, uint16_t, predictor16[16 * 16 * 3]);
- DECLARE_ALIGNED(32, uint8_t, predictor8[16 * 16 * 3]);
+ DECLARE_ALIGNED(32, uint16_t, predictor16[BLK_PELS * 3]);
+ DECLARE_ALIGNED(32, uint8_t, predictor8[BLK_PELS * 3]);
uint8_t *predictor;
- const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
- const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
+ const int mb_uv_height = BH >> mbd->plane[1].subsampling_y;
+ const int mb_uv_width = BW >> mbd->plane[1].subsampling_x;
// Save input state
uint8_t *input_buffer[MAX_MB_PLANE];
int i;
- if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ const int is_hbd = is_cur_buf_hbd(mbd);
+ if (is_hbd) {
predictor = CONVERT_TO_BYTEPTR(predictor16);
} else {
predictor = predictor8;
}
- mbd->block_refs[0] = ref_buf;
- mbd->block_refs[1] = ref_buf;
+ mbd->block_ref_scale_factors[0] = ref_scale_factors;
+ mbd->block_ref_scale_factors[1] = ref_scale_factors;
for (i = 0; i < num_planes; i++) input_buffer[i] = mbd->plane[i].pre[0].buf;
@@ -631,108 +831,173 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
// To keep the mv in play for both Y and UV planes the max that it
// can be on a border is therefore 16 - (2*AOM_INTERP_EXTEND+1).
cpi->td.mb.mv_limits.row_min =
- -((mb_row * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+ -((mb_row * BH) + (17 - 2 * AOM_INTERP_EXTEND));
cpi->td.mb.mv_limits.row_max =
- ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+ ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * AOM_INTERP_EXTEND);
for (mb_col = 0; mb_col < mb_cols; mb_col++) {
int j, k;
int stride;
- memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
- memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+ memset(accumulator, 0, BLK_PELS * 3 * sizeof(accumulator[0]));
+ memset(count, 0, BLK_PELS * 3 * sizeof(count[0]));
cpi->td.mb.mv_limits.col_min =
- -((mb_col * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+ -((mb_col * BW) + (17 - 2 * AOM_INTERP_EXTEND));
cpi->td.mb.mv_limits.col_max =
- ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+ ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * AOM_INTERP_EXTEND);
for (frame = 0; frame < frame_count; frame++) {
- const int thresh_low = 10000;
- const int thresh_high = 20000;
+ // MVs for the four 16x16 sub-blocks.
+ MV blk_mvs[4];
+ // Filter weights for the four 16x16 sub-blocks.
+ int blk_fw[4] = { 0, 0, 0, 0 };
+ int use_32x32 = 0;
if (frames[frame] == NULL) continue;
mbd->mi[0]->mv[0].as_mv.row = 0;
mbd->mi[0]->mv[0].as_mv.col = 0;
mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+ blk_mvs[0] = kZeroMv;
+ blk_mvs[1] = kZeroMv;
+ blk_mvs[2] = kZeroMv;
+ blk_mvs[3] = kZeroMv;
if (frame == alt_ref_index) {
- filter_weight = 2;
+ blk_fw[0] = blk_fw[1] = blk_fw[2] = blk_fw[3] = 2;
+ use_32x32 = 1;
} else {
+ int thresh_low = 10000;
+ int thresh_high = 20000;
+ int blk_bestsme[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+
// Find best match in this frame by MC
int err = temporal_filter_find_matching_mb_c(
- cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
- frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride,
- mb_col * 16, mb_row * 16);
-
- // Assign higher weight to matching MB if it's error
- // score is lower. If not applying MC default behavior
- // is to weight all MBs equal.
- filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0;
+ cpi, frames[alt_ref_index]->y_buffer + mb_y_src_offset,
+ frames[frame]->y_buffer + mb_y_src_offset,
+ frames[frame]->y_stride, mb_col * BW, mb_row * BH, blk_mvs,
+ blk_bestsme);
+
+ int err16 =
+ blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3];
+ int max_err = INT_MIN, min_err = INT_MAX;
+ for (k = 0; k < 4; k++) {
+ if (min_err > blk_bestsme[k]) min_err = blk_bestsme[k];
+ if (max_err < blk_bestsme[k]) max_err = blk_bestsme[k];
+ }
+
+ if (((err * 15 < (err16 << 4)) && max_err - min_err < 12000) ||
+ ((err * 14 < (err16 << 4)) && max_err - min_err < 6000)) {
+ use_32x32 = 1;
+ // Assign a higher weight to the matching MB if its error score is
+ // lower. If MC is not applied, the default behavior is to weight
+ // all MBs equally.
+ blk_fw[0] = err < (thresh_low << THR_SHIFT)
+ ? 2
+ : err < (thresh_high << THR_SHIFT) ? 1 : 0;
+ blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0];
+ } else {
+ use_32x32 = 0;
+ for (k = 0; k < 4; k++)
+ blk_fw[k] = blk_bestsme[k] < thresh_low
+ ? 2
+ : blk_bestsme[k] < thresh_high ? 1 : 0;
+ }
}
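
For example, if the 32x32 search returns err = 9000 and the four 16x16 searches return blk_bestsme = {2000, 2200, 2100, 2300} (err16 = 8600), then err * 15 = 135000 < (err16 << 4) = 137600 and max_err - min_err = 300 < 12000, so use_32x32 = 1 and a single weight, judged against the thresholds scaled by THR_SHIFT, is applied to the whole block. Had one sub-block been a clear outlier, the per-sub-block weights derived from blk_bestsme would be used instead.
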
- if (filter_weight != 0) {
+ if (blk_fw[0] || blk_fw[1] || blk_fw[2] || blk_fw[3]) {
// Construct the predictors
temporal_filter_predictors_mb_c(
- mbd, frames[frame]->y_buffer + mb_y_offset,
- frames[frame]->u_buffer + mb_uv_offset,
- frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
- mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row,
- mbd->mi[0]->mv[0].as_mv.col, predictor, &ref_buf->sf, mb_col * 16,
- mb_row * 16, cm->allow_warped_motion, num_planes);
+ mbd, frames[frame]->y_buffer + mb_y_src_offset,
+ frames[frame]->u_buffer + mb_uv_src_offset,
+ frames[frame]->v_buffer + mb_uv_src_offset,
+ frames[frame]->y_stride, mb_uv_width, mb_uv_height,
+ mbd->mi[0]->mv[0].as_mv.row, mbd->mi[0]->mv[0].as_mv.col,
+ predictor, ref_scale_factors, mb_col * BW, mb_row * BH,
+ cm->allow_warped_motion, num_planes, blk_mvs, use_32x32);
// Apply the filter (YUV)
- if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- int adj_strength = strength + 2 * (mbd->bd - 8);
-
- if (num_planes <= 1) {
- // Single plane case
- av1_highbd_temporal_filter_apply_c(
- f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
- adj_strength, filter_weight, accumulator, count);
- } else {
- // Process 3 planes together.
- highbd_apply_temporal_filter(
- f->y_buffer + mb_y_offset, f->y_stride, predictor, 16,
- f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
- f->uv_stride, predictor + 256, predictor + 512, mb_uv_width,
- 16, 16, mbd->plane[1].subsampling_x,
- mbd->plane[1].subsampling_y, adj_strength, filter_weight,
- accumulator, count, accumulator + 256, count + 256,
- accumulator + 512, count + 512);
+ if (frame == alt_ref_index) {
+ uint8_t *pred = predictor;
+ uint32_t *accum = accumulator;
+ uint16_t *cnt = count;
+ int plane;
+
+ // All 4 blk_fws are equal to 2.
+ for (plane = 0; plane < num_planes; ++plane) {
+ const int pred_stride = plane ? mb_uv_width : BW;
+ const unsigned int w = plane ? mb_uv_width : BW;
+ const unsigned int h = plane ? mb_uv_height : BH;
+
+ if (is_hbd) {
+ highbd_apply_temporal_filter_self(pred, pred_stride, w, h,
+ blk_fw[0], accum, cnt);
+ } else {
+ apply_temporal_filter_self(pred, pred_stride, w, h, blk_fw[0],
+ accum, cnt);
+ }
+
+ pred += BLK_PELS;
+ accum += BLK_PELS;
+ cnt += BLK_PELS;
}
} else {
- if (num_planes <= 1) {
- // Single plane case
- av1_temporal_filter_apply_c(
- f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
- strength, filter_weight, accumulator, count);
+ if (is_hbd) {
+ const int adj_strength = strength + 2 * (mbd->bd - 8);
+
+ if (num_planes <= 1) {
+ // Single plane case
+ av1_highbd_temporal_filter_apply_c(
+ f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
+ BH, adj_strength, blk_fw, use_32x32, accumulator, count);
+ } else {
+ // Process 3 planes together.
+ av1_highbd_apply_temporal_filter(
+ f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
+ f->u_buffer + mb_uv_src_offset,
+ f->v_buffer + mb_uv_src_offset, f->uv_stride,
+ predictor + BLK_PELS, predictor + (BLK_PELS << 1),
+ mb_uv_width, BW, BH, mbd->plane[1].subsampling_x,
+ mbd->plane[1].subsampling_y, adj_strength, blk_fw,
+ use_32x32, accumulator, count, accumulator + BLK_PELS,
+ count + BLK_PELS, accumulator + (BLK_PELS << 1),
+ count + (BLK_PELS << 1));
+ }
} else {
- // Process 3 planes together.
- apply_temporal_filter(
- f->y_buffer + mb_y_offset, f->y_stride, predictor, 16,
- f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset,
- f->uv_stride, predictor + 256, predictor + 512, mb_uv_width,
- 16, 16, mbd->plane[1].subsampling_x,
- mbd->plane[1].subsampling_y, strength, filter_weight,
- accumulator, count, accumulator + 256, count + 256,
- accumulator + 512, count + 512);
+ if (num_planes <= 1) {
+ // Single plane case
+ av1_temporal_filter_apply_c(
+ f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
+ BH, strength, blk_fw, use_32x32, accumulator, count);
+ } else {
+ // Process 3 planes together.
+ av1_apply_temporal_filter(
+ f->y_buffer + mb_y_src_offset, f->y_stride, predictor, BW,
+ f->u_buffer + mb_uv_src_offset,
+ f->v_buffer + mb_uv_src_offset, f->uv_stride,
+ predictor + BLK_PELS, predictor + (BLK_PELS << 1),
+ mb_uv_width, BW, BH, mbd->plane[1].subsampling_x,
+ mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32,
+ accumulator, count, accumulator + BLK_PELS,
+ count + BLK_PELS, accumulator + (BLK_PELS << 1),
+ count + (BLK_PELS << 1));
+ }
}
}
}
}
// Normalize filter output to produce AltRef frame
- if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_hbd) {
uint16_t *dst1_16;
uint16_t *dst2_16;
dst1 = cpi->alt_ref_buffer.y_buffer;
dst1_16 = CONVERT_TO_SHORTPTR(dst1);
stride = cpi->alt_ref_buffer.y_stride;
byte = mb_y_offset;
- for (i = 0, k = 0; i < 16; i++) {
- for (j = 0; j < 16; j++, k++) {
+ for (i = 0, k = 0; i < BH; i++) {
+ for (j = 0; j < BW; j++, k++) {
dst1_16[byte] =
(uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
@@ -740,7 +1005,7 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
byte++;
}
- byte += stride - 16;
+ byte += stride - BW;
}
if (num_planes > 1) {
dst1 = cpi->alt_ref_buffer.u_buffer;
@@ -749,9 +1014,9 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
dst2_16 = CONVERT_TO_SHORTPTR(dst2);
stride = cpi->alt_ref_buffer.uv_stride;
byte = mb_uv_offset;
- for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
for (j = 0; j < mb_uv_width; j++, k++) {
- int m = k + 256;
+ int m = k + BLK_PELS;
// U
dst1_16[byte] =
(uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
@@ -768,24 +1033,24 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
dst1 = cpi->alt_ref_buffer.y_buffer;
stride = cpi->alt_ref_buffer.y_stride;
byte = mb_y_offset;
- for (i = 0, k = 0; i < 16; i++) {
- for (j = 0; j < 16; j++, k++) {
+ for (i = 0, k = 0; i < BH; i++) {
+ for (j = 0; j < BW; j++, k++) {
dst1[byte] =
(uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
// move to next pixel
byte++;
}
- byte += stride - 16;
+ byte += stride - BW;
}
if (num_planes > 1) {
dst1 = cpi->alt_ref_buffer.u_buffer;
dst2 = cpi->alt_ref_buffer.v_buffer;
stride = cpi->alt_ref_buffer.uv_stride;
byte = mb_uv_offset;
- for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) {
for (j = 0; j < mb_uv_width; j++, k++) {
- int m = k + 256;
+ int m = k + BLK_PELS;
// U
dst1[byte] =
(uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
@@ -799,11 +1064,16 @@ static void temporal_filter_iterate_c(AV1_COMP *cpi,
}
}
}
- mb_y_offset += 16;
+ mb_y_offset += BW;
+ mb_y_src_offset += BW;
mb_uv_offset += mb_uv_width;
+ mb_uv_src_offset += mb_uv_width;
}
- mb_y_offset += 16 * (f->y_stride - mb_cols);
- mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
+ mb_y_offset += BH * cpi->alt_ref_buffer.y_stride - BW * mb_cols;
+ mb_y_src_offset += BH * f->y_stride - BW * mb_cols;
+ mb_uv_src_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
+ mb_uv_offset +=
+ mb_uv_height * cpi->alt_ref_buffer.uv_stride - mb_uv_width * mb_cols;
}
// Restore input state
@@ -920,7 +1190,7 @@ static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
struct lookahead_entry *buf = av1_lookahead_peek(cpi->lookahead, distance);
double noiselevel;
- if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (is_cur_buf_hbd(mbd)) {
noiselevel = highbd_estimate_noise(
buf->img.y_buffer, buf->img.y_crop_width, buf->img.y_crop_height,
buf->img.y_stride, mbd->bd, EDGE_THRESHOLD);
@@ -974,8 +1244,7 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) {
int strength;
int frames_to_blur_backward;
int frames_to_blur_forward;
- RefBuffer ref_buf;
- ref_buf.buf = NULL;
+ struct scale_factors sf;
YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
@@ -984,9 +1253,8 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) {
// Apply context specific adjustments to the arnr filter parameters.
if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
// TODO(weitinglin): Currently, we enforce the filtering strength on
- // extra ARFs' to be zeros. We should investigate in which
- // case it is more beneficial to use non-zero strength
- // filtering.
+  // internal ARFs to be zero. We should investigate in which cases it is
+  // more beneficial to use non-zero strength filtering.
strength = 0;
frames_to_blur = 1;
} else {
@@ -1020,7 +1288,7 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) {
// supported.
// ARF is produced at the native frame size and resized when coded.
av1_setup_scale_factors_for_frame(
- &ref_buf.sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
+ &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
frames[0]->y_crop_width, frames[0]->y_crop_height);
}
@@ -1031,5 +1299,5 @@ void av1_temporal_filter(AV1_COMP *cpi, int distance) {
av1_initialize_cost_tables(&cpi->common, &cpi->td.mb);
temporal_filter_iterate_c(cpi, frames, frames_to_blur,
- frames_to_blur_backward, strength, &ref_buf);
+ frames_to_blur_backward, strength, &sf);
}
diff --git a/libaom/av1/encoder/temporal_filter.h b/libaom/av1/encoder/temporal_filter.h
index 1ff1162..bb26c36 100644
--- a/libaom/av1/encoder/temporal_filter.h
+++ b/libaom/av1/encoder/temporal_filter.h
@@ -18,6 +18,18 @@ extern "C" {
#define ARNR_FILT_QINDEX 128
+// Block size used in temporal filtering
+#define TF_BLOCK BLOCK_32X32
+#define BH 32
+#define BH_LOG2 5
+#define BW 32
+#define BW_LOG2 5
+#define BLK_PELS 1024 // Pixels in the block
+#define THR_SHIFT 2
+#define TF_SUB_BLOCK BLOCK_16X16
+#define SUB_BH 16
+#define SUB_BW 16
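+
+// BLK_PELS also serves as the per-plane offset into the packed Y/U/V
+// scratch buffers (predictor, accumulator, count): the U plane lives at
+// +BLK_PELS and the V plane at +(BLK_PELS << 1).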
+
void av1_temporal_filter(AV1_COMP *cpi, int distance);
#ifdef __cplusplus
diff --git a/libaom/av1/encoder/tokenize.h b/libaom/av1/encoder/tokenize.h
index 63b505f..c80af7b 100644
--- a/libaom/av1/encoder/tokenize.h
+++ b/libaom/av1/encoder/tokenize.h
@@ -38,11 +38,11 @@ struct tokenize_b_args {
uint8_t allow_update_cdf;
};
-typedef enum {
+enum {
OUTPUT_ENABLED = 0,
DRY_RUN_NORMAL,
DRY_RUN_COSTCOEFFS,
-} RUN_TYPE;
+} UENUM1BYTE(RUN_TYPE);
// Note in all the tokenize functions rate if non NULL is incremented
// with the coefficient token cost only if dry_run = DRY_RUN_COSTCOEFS,
diff --git a/libaom/av1/encoder/tpl_model.c b/libaom/av1/encoder/tpl_model.c
new file mode 100644
index 0000000..79afb6d
--- /dev/null
+++ b/libaom/av1/encoder/tpl_model.c
@@ -0,0 +1,592 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_codec.h"
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/reconinter_enc.h"
+
+typedef struct GF_PICTURE {
+ YV12_BUFFER_CONFIG *frame;
+ int ref_frame[7];
+} GF_PICTURE;
+
+static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ TX_SIZE tx_size, int64_t *recon_error,
+ int64_t *sse) {
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const SCAN_ORDER *const scan_order = &av1_default_scan_orders[tx_size];
+ uint16_t eob;
+ int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
+ const int shift = tx_size == TX_32X32 ? 0 : 2;
+
+ av1_quantize_fp_32x32(coeff, pix_num, p->zbin_QTX, p->round_fp_QTX,
+ p->quant_fp_QTX, p->quant_shift_QTX, qcoeff, dqcoeff,
+ p->dequant_QTX, &eob, scan_order->scan,
+ scan_order->iscan);
+
+ *recon_error = av1_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
+ *recon_error = AOMMAX(*recon_error, 1);
+
+ *sse = (*sse) >> shift;
+ *sse = AOMMAX(*sse, 1);
+}
+
+static void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+ TX_SIZE tx_size) {
+ switch (tx_size) {
+ case TX_8X8: aom_hadamard_8x8(src_diff, bw, coeff); break;
+ case TX_16X16: aom_hadamard_16x16(src_diff, bw, coeff); break;
+ case TX_32X32: aom_hadamard_32x32(src_diff, bw, coeff); break;
+ default: assert(0);
+ }
+}
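+// The Hadamard transform above is a cheap stand-in for the real forward
+// transform in this model: aom_satd() of its output provides the SATD-style
+// cost estimates used for both the intra and inter searches below.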
+
+static uint32_t motion_compensated_prediction(AV1_COMP *cpi, ThreadData *td,
+ uint8_t *cur_frame_buf,
+ uint8_t *ref_frame_buf,
+ int stride, BLOCK_SIZE bsize,
+ int mi_row, int mi_col) {
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ const SEARCH_METHODS search_method = NSTEP;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ uint32_t bestsme = UINT_MAX;
+ int distortion;
+ uint32_t sse;
+ int cost_list[5];
+ const MvLimits tmp_mv_limits = x->mv_limits;
+
+ MV best_ref_mv1 = { 0, 0 };
+ MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ // Setup frame pointers
+ x->plane[0].src.buf = cur_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = ref_frame_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ step_param = mv_sf->reduce_first_step_size;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+ av1_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
+ search_method, 0, sadpb, cond_cost_list(cpi, cost_list),
+ &best_ref_mv1, INT_MAX, 0, (MI_SIZE * mi_col),
+ (MI_SIZE * mi_row), 0, &cpi->ss_cfg[SS_CFG_SRC]);
+
+ /* restore UMV window */
+ x->mv_limits = tmp_mv_limits;
+
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ bestsme = cpi->find_fractional_mv_step(
+ x, cm, mi_row, mi_col, &best_ref_mv1, cpi->common.allow_high_precision_mv,
+ x->errorperbit, &cpi->fn_ptr[bsize], 0, mv_sf->subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, NULL,
+ 0, 0, pw, ph, 1, 1);
+
+ return bestsme;
+}
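+// The refined motion vector is left in x->best_mv by the sub-pel search;
+// mode_estimation() below reads it from there, while the return value is
+// only the sub-pel matching error (bestsme).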
+
+static void mode_estimation(AV1_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ struct scale_factors *sf, GF_PICTURE *gf_picture,
+ int frame_idx, int16_t *src_diff, tran_low_t *coeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
+ TplDepStats *tpl_stats) {
+ AV1_COMMON *cm = &cpi->common;
+ ThreadData *td = &cpi->td;
+
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+ const int pix_num = bw * bh;
+ int best_rf_idx = -1;
+ int_mv best_mv;
+ int64_t best_inter_cost = INT64_MAX;
+ int64_t inter_cost;
+ int rf_idx;
+ const InterpFilters kernel =
+ av1_make_interp_filters(EIGHTTAP_REGULAR, EIGHTTAP_REGULAR);
+
+ int64_t best_intra_cost = INT64_MAX;
+ int64_t intra_cost;
+ PREDICTION_MODE mode;
+ int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
+ MB_MODE_INFO mi_above, mi_left;
+
+ memset(tpl_stats, 0, sizeof(*tpl_stats));
+
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
+ xd->above_mbmi = (mi_row > 0) ? &mi_above : NULL;
+ xd->left_mbmi = (mi_col > 0) ? &mi_left : NULL;
+
+ // Intra prediction search
+ for (mode = DC_PRED; mode <= PAETH_PRED; ++mode) {
+ uint8_t *src, *dst;
+ int src_stride, dst_stride;
+
+ src = xd->cur_buf->y_buffer + mb_y_offset;
+ src_stride = xd->cur_buf->y_stride;
+
+ dst = &predictor[0];
+ dst_stride = bw;
+
+ xd->mi[0]->sb_type = bsize;
+ xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+
+ av1_predict_intra_block(
+ cm, xd, block_size_wide[bsize], block_size_high[bsize], tx_size, mode,
+ 0, 0, FILTER_INTRA_MODES, src, src_stride, dst, dst_stride, 0, 0, 0);
+
+ if (is_cur_buf_hbd(xd)) {
+ aom_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+ dst_stride, xd->bd);
+ } else {
+ aom_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+ dst_stride);
+ }
+
+ wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+
+ intra_cost = aom_satd(coeff, pix_num);
+
+ if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
+ }
+
+ // Motion compensated prediction
+ best_mv.as_int = 0;
+
+ (void)mb_y_offset;
+ // Motion estimation column boundary
+ x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
+ x->mv_limits.col_max =
+ ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND);
+
+ for (rf_idx = 0; rf_idx < 7; ++rf_idx) {
+ if (ref_frame[rf_idx] == NULL) continue;
+
+ motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
+ ref_frame[rf_idx]->y_buffer + mb_y_offset,
+ xd->cur_buf->y_stride, bsize, mi_row, mi_col);
+
+    // TODO(jingning): High bit-depth is not yet supported in the next
+    // three steps.
+ ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
+ WarpTypesAllowed warp_types;
+ memset(&warp_types, 0, sizeof(WarpTypesAllowed));
+
+ av1_build_inter_predictor(
+ ref_frame[rf_idx]->y_buffer + mb_y_offset, ref_frame[rf_idx]->y_stride,
+ &predictor[0], bw, &x->best_mv.as_mv, sf, bw, bh, &conv_params, kernel,
+ &warp_types, mi_col * MI_SIZE, mi_row * MI_SIZE, 0, 0, MV_PRECISION_Q3,
+ mi_col * MI_SIZE, mi_row * MI_SIZE, xd, 0);
+ if (is_cur_buf_hbd(xd)) {
+ aom_highbd_subtract_block(
+ bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
+ xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
+ } else {
+ aom_subtract_block(bh, bw, src_diff, bw,
+ xd->cur_buf->y_buffer + mb_y_offset,
+ xd->cur_buf->y_stride, &predictor[0], bw);
+ }
+ wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+
+ inter_cost = aom_satd(coeff, pix_num);
+ if (inter_cost < best_inter_cost) {
+ int64_t recon_error, sse;
+
+ best_rf_idx = rf_idx;
+ best_inter_cost = inter_cost;
+ best_mv.as_int = x->best_mv.as_int;
+ get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, &recon_error,
+ &sse);
+ }
+ }
+ best_intra_cost = AOMMAX(best_intra_cost, 1);
+ best_inter_cost = AOMMIN(best_intra_cost, best_inter_cost);
+ tpl_stats->inter_cost = best_inter_cost << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->intra_cost = best_intra_cost << TPL_DEP_COST_SCALE_LOG2;
+ tpl_stats->mc_dep_cost = tpl_stats->intra_cost + tpl_stats->mc_flow;
+
+ tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
+ tpl_stats->mv.as_int = best_mv.as_int;
+}
+
+static int round_floor(int ref_pos, int bsize_pix) {
+ int round;
+ if (ref_pos < 0)
+ round = -(1 + (-ref_pos - 1) / bsize_pix);
+ else
+ round = ref_pos / bsize_pix;
+
+ return round;
+}
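+// round_floor() divides rounding toward negative infinity, e.g.
+// round_floor(-1, 32) == -1 while round_floor(33, 32) == 1, so negative
+// reference positions map onto the grid block that actually covers them.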
+
+static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
+ int ref_pos_col, int block, BLOCK_SIZE bsize) {
+ int width = 0, height = 0;
+ int bw = 4 << mi_size_wide_log2[bsize];
+ int bh = 4 << mi_size_high_log2[bsize];
+
+ switch (block) {
+ case 0:
+ width = grid_pos_col + bw - ref_pos_col;
+ height = grid_pos_row + bh - ref_pos_row;
+ break;
+ case 1:
+ width = ref_pos_col + bw - grid_pos_col;
+ height = grid_pos_row + bh - ref_pos_row;
+ break;
+ case 2:
+ width = grid_pos_col + bw - ref_pos_col;
+ height = ref_pos_row + bh - grid_pos_row;
+ break;
+ case 3:
+ width = ref_pos_col + bw - grid_pos_col;
+ height = ref_pos_row + bh - grid_pos_row;
+ break;
+ default: assert(0);
+ }
+
+ return width * height;
+}
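+// The case index mirrors the (block >> 1, block & 0x01) traversal in
+// tpl_model_update_b(): bit 1 selects the lower row and bit 0 the right
+// column of the 2x2 neighborhood of grid-aligned blocks that the
+// motion-compensated block straddles.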
+
+static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+ int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+ TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
+ TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
+ MV mv = tpl_stats->mv.as_mv;
+ int mv_row = mv.row >> 3;
+ int mv_col = mv.col >> 3;
+
+ int ref_pos_row = mi_row * MI_SIZE + mv_row;
+ int ref_pos_col = mi_col * MI_SIZE + mv_col;
+
+ const int bw = 4 << mi_size_wide_log2[bsize];
+ const int bh = 4 << mi_size_high_log2[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int mi_width = mi_size_wide[bsize];
+ const int pix_num = bw * bh;
+
+ // top-left on grid block location in pixel
+ int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
+ int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
+ int block;
+
+ for (block = 0; block < 4; ++block) {
+ int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
+ int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
+
+ if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
+ grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
+ int overlap_area = get_overlap_area(
+ grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
+ int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
+ int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
+
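+      // Propagate the share of this block's dependency cost that inter
+      // prediction fails to capture:
+      //   mc_flow = mc_dep_cost * (1 - inter_cost / intra_cost),
+      // apportioned to each reference block by overlap area below.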
+ int64_t mc_flow = tpl_stats->mc_dep_cost -
+ (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
+ tpl_stats->intra_cost;
+
+ int idx, idy;
+
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ TplDepStats *des_stats =
+ &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
+ (ref_mi_col + idx)];
+
+ des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
+ assert(overlap_area >= 0);
+ }
+ }
+ }
+ }
+}
+
+static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
+ int mi_row, int mi_col, const BLOCK_SIZE bsize) {
+ int idx, idy;
+ const int mi_height = mi_size_high[bsize];
+ const int mi_width = mi_size_wide[bsize];
+
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ TplDepStats *tpl_ptr =
+ &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
+ tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
+ BLOCK_4X4);
+ }
+ }
+}
+
+static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int stride,
+ const TplDepStats *src_stats) {
+ const int mi_height = mi_size_high[bsize];
+ const int mi_width = mi_size_wide[bsize];
+ int idx, idy;
+
+ int64_t intra_cost = src_stats->intra_cost / (mi_height * mi_width);
+ int64_t inter_cost = src_stats->inter_cost / (mi_height * mi_width);
+
+ TplDepStats *tpl_ptr;
+
+ intra_cost = AOMMAX(1, intra_cost);
+ inter_cost = AOMMAX(1, inter_cost);
+
+ for (idy = 0; idy < mi_height; ++idy) {
+ tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col];
+ for (idx = 0; idx < mi_width; ++idx) {
+ tpl_ptr->intra_cost = intra_cost;
+ tpl_ptr->inter_cost = inter_cost;
+ tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
+ tpl_ptr->ref_frame_index = src_stats->ref_frame_index;
+ tpl_ptr->mv.as_int = src_stats->mv.as_int;
+ ++tpl_ptr;
+ }
+ }
+}
+
+static void mc_flow_dispenser(AV1_COMP *cpi, GF_PICTURE *gf_picture,
+ int frame_idx) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
+ YV12_BUFFER_CONFIG *ref_frame[7] = {
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL
+ };
+
+ AV1_COMMON *cm = &cpi->common;
+ struct scale_factors sf;
+ int rdmult, idx;
+ ThreadData *td = &cpi->td;
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *xd = &x->e_mbd;
+ int mi_row, mi_col;
+
+ DECLARE_ALIGNED(32, uint16_t, predictor16[32 * 32 * 3]);
+ DECLARE_ALIGNED(32, uint8_t, predictor8[32 * 32 * 3]);
+ uint8_t *predictor;
+ DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]);
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]);
+
+ const BLOCK_SIZE bsize = BLOCK_32X32;
+ const TX_SIZE tx_size = max_txsize_lookup[bsize];
+ const int mi_height = mi_size_high[bsize];
+ const int mi_width = mi_size_wide[bsize];
+
+ // Setup scaling factor
+ av1_setup_scale_factors_for_frame(
+ &sf, this_frame->y_crop_width, this_frame->y_crop_height,
+ this_frame->y_crop_width, this_frame->y_crop_height);
+
+ if (is_cur_buf_hbd(xd))
+ predictor = CONVERT_TO_BYTEPTR(predictor16);
+ else
+ predictor = predictor8;
+
+  // Prepare reference frame pointers. If any reference frame slot is
+  // unavailable, the pointer will be set to NULL.
+ for (idx = 0; idx < 7; ++idx) {
+ int rf_idx = gf_picture[frame_idx].ref_frame[idx];
+ if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
+ }
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+ xd->cur_buf = this_frame;
+
+ // Get rd multiplier set up.
+ rdmult = (int)av1_compute_rd_mult(cpi, tpl_frame->base_qindex);
+ if (rdmult < 1) rdmult = 1;
+ set_error_per_bit(x, rdmult);
+ av1_initialize_me_consts(cpi, x, tpl_frame->base_qindex);
+
+ tpl_frame->is_valid = 1;
+
+ cm->base_qindex = tpl_frame->base_qindex;
+ av1_frame_init_quantizer(cpi);
+
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
+ // Motion estimation row boundary
+ x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * AOM_INTERP_EXTEND));
+ x->mv_limits.row_max =
+ (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * AOM_INTERP_EXTEND);
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
+ TplDepStats tpl_stats;
+ mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, src_diff, coeff,
+ qcoeff, dqcoeff, mi_row, mi_col, bsize, tx_size,
+ ref_frame, predictor, &tpl_stats);
+
+ // Motion flow dependency dispenser.
+ tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
+ tpl_frame->stride, &tpl_stats);
+
+ tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
+ bsize);
+ }
+ }
+}
+
+static void init_gop_frames(AV1_COMP *cpi, GF_PICTURE *gf_picture,
+ const GF_GROUP *gf_group, int *tpl_group_frames,
+ const EncodeFrameInput *const frame_input) {
+ AV1_COMMON *cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ int frame_idx = 0;
+ int i;
+ int gld_index = -1;
+ int alt_index = -1;
+ int lst_index = -1;
+ int extend_frame_count = 0;
+ int pframe_qindex = cpi->tpl_stats[2].base_qindex;
+
+ RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
+ int recon_frame_index[INTER_REFS_PER_FRAME + 1] = { -1, -1, -1, -1,
+ -1, -1, -1, -1 };
+
+ // TODO(jingning): To be used later for gf frame type parsing.
+ (void)gf_group;
+
+ for (i = 0; i < FRAME_BUFFERS && frame_idx < INTER_REFS_PER_FRAME + 1; ++i) {
+ if (frame_bufs[i].ref_count == 0) {
+ alloc_frame_mvs(cm, &frame_bufs[i]);
+ if (aom_realloc_frame_buffer(
+ &frame_bufs[i].buf, cm->width, cm->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels,
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+
+ recon_frame_index[frame_idx] = i;
+ ++frame_idx;
+ }
+ }
+
+ for (i = 0; i < INTER_REFS_PER_FRAME + 1; ++i) {
+ assert(recon_frame_index[i] >= 0);
+ cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
+ }
+
+ *tpl_group_frames = 0;
+
+ // Initialize Golden reference frame.
+ gf_picture[0].frame = NULL;
+ RefCntBuffer *ref_buf = get_ref_frame_buf(cm, GOLDEN_FRAME);
+ if (ref_buf) gf_picture[0].frame = &ref_buf->buf;
+ for (i = 0; i < 7; ++i) gf_picture[0].ref_frame[i] = -1;
+ gld_index = 0;
+ ++*tpl_group_frames;
+
+ // Initialize ARF frame
+ gf_picture[1].frame = frame_input->source;
+ gf_picture[1].ref_frame[0] = gld_index;
+ gf_picture[1].ref_frame[1] = lst_index;
+ gf_picture[1].ref_frame[2] = alt_index;
+  // TODO(yuec): Need to figure out the full AV1 reference model.
+ for (i = 3; i < 7; ++i) gf_picture[1].ref_frame[i] = -1;
+ alt_index = 1;
+ ++*tpl_group_frames;
+
+ // Initialize P frames
+ for (frame_idx = 2; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
+ struct lookahead_entry *buf =
+ av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
+
+ if (buf == NULL) break;
+
+ gf_picture[frame_idx].frame = &buf->img;
+ gf_picture[frame_idx].ref_frame[0] = gld_index;
+ gf_picture[frame_idx].ref_frame[1] = lst_index;
+ gf_picture[frame_idx].ref_frame[2] = alt_index;
+ for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
+
+ ++*tpl_group_frames;
+ lst_index = frame_idx;
+
+ if (frame_idx == cpi->rc.baseline_gf_interval + 1) break;
+ }
+
+ gld_index = frame_idx;
+ lst_index = AOMMAX(0, frame_idx - 1);
+ alt_index = -1;
+ ++frame_idx;
+
+ // Extend two frames outside the current gf group.
+ for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
+ struct lookahead_entry *buf =
+ av1_lookahead_peek(cpi->lookahead, frame_idx - 2);
+
+ if (buf == NULL) break;
+
+ cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
+
+ gf_picture[frame_idx].frame = &buf->img;
+ gf_picture[frame_idx].ref_frame[0] = gld_index;
+ gf_picture[frame_idx].ref_frame[1] = lst_index;
+ gf_picture[frame_idx].ref_frame[2] = alt_index;
+ for (i = 3; i < 7; ++i) gf_picture[frame_idx].ref_frame[i] = -1;
+ lst_index = frame_idx;
+ ++*tpl_group_frames;
+ ++extend_frame_count;
+ }
+}
+
+static void init_tpl_stats(AV1_COMP *cpi) {
+ int frame_idx;
+ for (frame_idx = 0; frame_idx < MAX_LAG_BUFFERS; ++frame_idx) {
+ TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
+ memset(tpl_frame->tpl_stats_ptr, 0,
+ tpl_frame->height * tpl_frame->width *
+ sizeof(*tpl_frame->tpl_stats_ptr));
+ tpl_frame->is_valid = 0;
+ }
+}
+
+void av1_tpl_setup_stats(AV1_COMP *cpi,
+ const EncodeFrameInput *const frame_input) {
+ GF_PICTURE gf_picture[MAX_LAG_BUFFERS];
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ int tpl_group_frames = 0;
+ int frame_idx;
+
+ init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames, frame_input);
+
+ init_tpl_stats(cpi);
+
+ // Backward propagation from tpl_group_frames to 1.
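+  // Processing frames in reverse display order means that, by the time a
+  // frame is dispensed, every frame that references it has already
+  // deposited its mc_flow onto that frame's blocks, so its mc_dep_cost is
+  // complete before being propagated further back.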
+ for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx)
+ mc_flow_dispenser(cpi, gf_picture, frame_idx);
+}
diff --git a/libaom/av1/encoder/tpl_model.h b/libaom/av1/encoder/tpl_model.h
new file mode 100644
index 0000000..f6b33b0
--- /dev/null
+++ b/libaom/av1/encoder/tpl_model.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TPL_MODEL_H_
+#define AOM_AV1_ENCODER_TPL_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_tpl_setup_stats(AV1_COMP *cpi,
+ const EncodeFrameInput *const frame_input);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TPL_MODEL_H_
diff --git a/libaom/av1/encoder/var_based_part.c b/libaom/av1/encoder/var_based_part.c
new file mode 100644
index 0000000..3cead91
--- /dev/null
+++ b/libaom/av1/encoder/var_based_part.c
@@ -0,0 +1,778 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/var_based_part.h"
+#include "av1/encoder/reconinter_enc.h"
+
+extern const uint8_t AV1_VAR_OFFS[];
+
+typedef struct {
+  // TODO(kyslov): consider changing to 64-bit
+
+  // This struct is used for computing variance in choose_partitioning(),
+  // where the max number of samples within a superblock is 32x32 (with 4x4
+  // averaging). With 8-bit bit depth, uint32_t is enough for
+  // sum_square_error (2^8 * 2^8 * 32 * 32 = 2^26). For high bit depth we
+  // need to consider changing this to 64 bits.
+ uint32_t sum_square_error;
+ int32_t sum_error;
+ int log2_count;
+ int variance;
+} var;
+
+typedef struct {
+ var none;
+ var horz[2];
+ var vert[2];
+} partition_variance;
+
+typedef struct {
+ partition_variance part_variances;
+ var split[4];
+} v4x4;
+
+typedef struct {
+ partition_variance part_variances;
+ v4x4 split[4];
+} v8x8;
+
+typedef struct {
+ partition_variance part_variances;
+ v8x8 split[4];
+} v16x16;
+
+typedef struct {
+ partition_variance part_variances;
+ v16x16 split[4];
+} v32x32;
+
+typedef struct {
+ partition_variance part_variances;
+ v32x32 split[4];
+} v64x64;
+
+typedef struct {
+ partition_variance part_variances;
+ v64x64 split[4];
+} v128x128;
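+
+// The nested types above form a variance tree mirroring the partition
+// hierarchy from 128x128 down to 4x4; fill_variance_tree() sums the four
+// children of each node into its horz/vert/none aggregates.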
+
+typedef struct {
+ partition_variance *part_variances;
+ var *split[4];
+} variance_node;
+
+static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
+ int i;
+ node->part_variances = NULL;
+ switch (bsize) {
+ case BLOCK_128X128: {
+ v128x128 *vt = (v128x128 *)data;
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].part_variances.none;
+ break;
+ }
+ case BLOCK_64X64: {
+ v64x64 *vt = (v64x64 *)data;
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].part_variances.none;
+ break;
+ }
+ case BLOCK_32X32: {
+ v32x32 *vt = (v32x32 *)data;
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].part_variances.none;
+ break;
+ }
+ case BLOCK_16X16: {
+ v16x16 *vt = (v16x16 *)data;
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].part_variances.none;
+ break;
+ }
+ case BLOCK_8X8: {
+ v8x8 *vt = (v8x8 *)data;
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].part_variances.none;
+ break;
+ }
+ default: {
+ v4x4 *vt = (v4x4 *)data;
+ assert(bsize == BLOCK_4X4);
+ node->part_variances = &vt->part_variances;
+ for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
+ break;
+ }
+ }
+}
+
+// Set variance values given sum square error, sum error, count.
+static void fill_variance(uint32_t s2, int32_t s, int c, var *v) {
+ v->sum_square_error = s2;
+ v->sum_error = s;
+ v->log2_count = c;
+}
+
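+// Fixed-point variance of the accumulated samples: with count = 2^log2_count
+// this computes 256 * (sum_square_error - sum_error^2 / count) / count,
+// i.e. 256 * (E[x^2] - E[x]^2).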
+static void get_variance(var *v) {
+ v->variance =
+ (int)(256 * (v->sum_square_error -
+ (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
+ v->log2_count)) >>
+ v->log2_count);
+}
+
+static void sum_2_variances(const var *a, const var *b, var *r) {
+ assert(a->log2_count == b->log2_count);
+ fill_variance(a->sum_square_error + b->sum_square_error,
+ a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
+static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
+ variance_node node;
+ memset(&node, 0, sizeof(node));
+ tree_to_node(data, bsize, &node);
+ sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
+ sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
+ sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
+ sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
+ sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
+ &node.part_variances->none);
+}
+
+static void set_block_size(AV1_COMP *const cpi, MACROBLOCK *const x,
+ MACROBLOCKD *const xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ if (cpi->common.mi_cols > mi_col && cpi->common.mi_rows > mi_row) {
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+ xd->mi[0]->sb_type = bsize;
+ }
+}
+
+static int set_vt_partitioning(AV1_COMP *cpi, MACROBLOCK *const x,
+ MACROBLOCKD *const xd,
+ const TileInfo *const tile, void *data,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int64_t threshold, BLOCK_SIZE bsize_min,
+ int force_split) {
+ AV1_COMMON *const cm = &cpi->common;
+ variance_node vt;
+ const int block_width = mi_size_wide[bsize];
+ const int block_height = mi_size_high[bsize];
+
+ assert(block_height == block_width);
+ tree_to_node(data, bsize, &vt);
+
+ if (force_split == 1) return 0;
+
+ if (mi_col + block_width > tile->mi_col_end ||
+ mi_row + block_height > tile->mi_row_end)
+ return 0;
+
+ // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
+ // variance is below threshold, otherwise split will be selected.
+ // No check for vert/horiz split as too few samples for variance.
+ if (bsize == bsize_min) {
+ // Variance already computed to set the force_split.
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+ if (mi_col + block_width / 2 < cm->mi_cols &&
+ mi_row + block_height / 2 < cm->mi_rows &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+ return 1;
+ }
+ return 0;
+ } else if (bsize > bsize_min) {
+ // Variance already computed to set the force_split.
+ if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
+ // For key frame: take split for bsize above 32X32 or very high variance.
+ if (frame_is_intra_only(cm) &&
+ (bsize > BLOCK_32X32 ||
+ vt.part_variances->none.variance > (threshold << 4))) {
+ return 0;
+ }
+ // If variance is low, take the bsize (no split).
+ if (mi_col + block_width / 2 < cm->mi_cols &&
+ mi_row + block_height / 2 < cm->mi_rows &&
+ vt.part_variances->none.variance < threshold) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+ return 1;
+ }
+
+ // Check vertical split.
+ if (mi_row + block_height / 2 < cm->mi_rows) {
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_VERT);
+ get_variance(&vt.part_variances->vert[0]);
+ get_variance(&vt.part_variances->vert[1]);
+ if (vt.part_variances->vert[0].variance < threshold &&
+ vt.part_variances->vert[1].variance < threshold &&
+ get_plane_block_size(subsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y) < BLOCK_INVALID) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
+ return 1;
+ }
+ }
+ // Check horizontal split.
+ if (mi_col + block_width / 2 < cm->mi_cols) {
+ BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+ get_variance(&vt.part_variances->horz[0]);
+ get_variance(&vt.part_variances->horz[1]);
+ if (vt.part_variances->horz[0].variance < threshold &&
+ vt.part_variances->horz[1].variance < threshold &&
+ get_plane_block_size(subsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y) < BLOCK_INVALID) {
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
+ return 1;
+ }
+ }
+
+ return 0;
+ }
+ return 0;
+}
+
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x16_idx, int y16_idx, v16x16 *vst,
+ int pixels_wide, int pixels_high,
+ int is_key_frame) {
+ int k;
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ int s_avg;
+ int d_avg = 128;
+ s_avg = aom_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame) d_avg = aom_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+
+ sum = s_avg - d_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+ }
+}
+
+static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x16_idx, int y16_idx, int pixels_wide,
+ int pixels_high) {
+ int k;
+ int minmax_max = 0;
+ int minmax_min = 255;
+ // Loop over the 4 8x8 subblocks.
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ int min = 0;
+ int max = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ aom_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp,
+ &min, &max);
+ if ((max - min) > minmax_max) minmax_max = (max - min);
+ if ((max - min) < minmax_min) minmax_min = (max - min);
+ }
+ }
+ return (minmax_max - minmax_min);
+}
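+// The spread between the largest and smallest per-8x8 (max - min) values is
+// used as a cheap texture/edge detector: a large spread inside a 16x16
+// block forces a split even when its average variance alone would not.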
+
+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x8_idx, int y8_idx, v8x8 *vst,
+ int pixels_wide, int pixels_high,
+ int is_key_frame) {
+ int k;
+ for (k = 0; k < 4; k++) {
+ int x4_idx = x8_idx + ((k & 1) << 2);
+ int y4_idx = y8_idx + ((k >> 1) << 2);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+ int s_avg;
+ int d_avg = 128;
+ s_avg = aom_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame) d_avg = aom_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ sum = s_avg - d_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+ }
+}
+
+static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
+ int width, int height,
+ int content_state) {
+ if (speed >= 8) {
+ if (width <= 640 && height <= 480)
+ return (5 * threshold_base) >> 2;
+ else if ((content_state == kLowSadLowSumdiff) ||
+ (content_state == kHighSadLowSumdiff) ||
+ (content_state == kLowVarHighSumdiff))
+ return (5 * threshold_base) >> 2;
+ } else if (speed == 7) {
+ if ((content_state == kLowSadLowSumdiff) ||
+ (content_state == kHighSadLowSumdiff) ||
+ (content_state == kLowVarHighSumdiff)) {
+ return (5 * threshold_base) >> 2;
+ }
+ }
+ return threshold_base;
+}
+
+// Set the variance split thresholds for the following block sizes:
+// 0 - threshold_128x128, 1 - threshold_64x64, 2 - threshold_32x32,
+// 3 - vbp_threshold_16x16. 4 - vbp_threshold_8x8 (to split to a 4x4
+// partition) is currently only used on key frames.
+static void set_vbp_thresholds(AV1_COMP *cpi, int64_t thresholds[], int q,
+ int content_state) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int is_key_frame = frame_is_intra_only(cm);
+ const int threshold_multiplier = is_key_frame ? 40 : 1;
+ int64_t threshold_base =
+ (int64_t)(threshold_multiplier * cpi->dequants.y_dequant_QTX[q][1]);
+
+ if (is_key_frame) {
+ thresholds[0] = threshold_base;
+ thresholds[1] = threshold_base;
+ thresholds[2] = threshold_base >> 2;
+ thresholds[3] = threshold_base >> 2;
+ thresholds[4] = threshold_base << 2;
+ } else {
+ // Increase base variance threshold based on content_state/sum_diff level.
+ threshold_base = scale_part_thresh_sumdiff(
+ threshold_base, cpi->oxcf.speed, cm->width, cm->height, content_state);
+
+ thresholds[1] = threshold_base;
+ thresholds[3] = threshold_base << cpi->oxcf.speed;
+ if (cm->width >= 1280 && cm->height >= 720)
+ thresholds[3] = thresholds[3] << 1;
+ if (cm->width <= 352 && cm->height <= 288) {
+ thresholds[1] = threshold_base >> 3;
+ thresholds[2] = threshold_base >> 1;
+ thresholds[3] = threshold_base << 3;
+ } else if (cm->width < 1280 && cm->height < 720) {
+ thresholds[2] = (5 * threshold_base) >> 2;
+ } else if (cm->width < 1920 && cm->height < 1080) {
+ thresholds[2] = threshold_base << 1;
+ thresholds[3] <<= 2;
+ } else {
+ thresholds[2] = (5 * threshold_base) >> 1;
+ }
+ }
+}
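+
+// The thresholds scale with the luma dequant step at the given q, so the
+// partitioner tolerates more variance at coarser quantizers; key frames use
+// a much larger base multiplier (40x) and enable the 8x8-to-4x4 threshold.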
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+ int content_state) {
+ AV1_COMMON *const cm = &cpi->common;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const int is_key_frame = frame_is_intra_only(cm);
+ if (sf->partition_search_type != VAR_BASED_PARTITION) {
+ return;
+ } else {
+ set_vbp_thresholds(cpi, cpi->vbp_thresholds, q, content_state);
+ // The thresholds below are not changed locally.
+ if (is_key_frame) {
+ cpi->vbp_threshold_sad = 0;
+ cpi->vbp_threshold_copy = 0;
+ cpi->vbp_bsize_min = BLOCK_8X8;
+ } else {
+ if (cm->width <= 352 && cm->height <= 288)
+ cpi->vbp_threshold_sad = 10;
+ else
+ cpi->vbp_threshold_sad = (cpi->dequants.y_dequant_QTX[q][1] << 1) > 1000
+ ? (cpi->dequants.y_dequant_QTX[q][1] << 1)
+ : 1000;
+ cpi->vbp_bsize_min = BLOCK_16X16;
+ if (cm->width <= 352 && cm->height <= 288)
+ cpi->vbp_threshold_copy = 4000;
+ else if (cm->width <= 640 && cm->height <= 360)
+ cpi->vbp_threshold_copy = 8000;
+ else
+ cpi->vbp_threshold_copy =
+ (cpi->dequants.y_dequant_QTX[q][1] << 3) > 8000
+ ? (cpi->dequants.y_dequant_QTX[q][1] << 3)
+ : 8000;
+ }
+ cpi->vbp_threshold_minmax = 15 + (q >> 3);
+ }
+}
+
+// This function chooses partitioning based on the variance between the
+// source and the reconstructed last frame, where variance is computed for
+// down-sampled inputs.
+// TODO(kyslov): lots of things. Bring back noise estimation, brush up
+// partition selection and, most of all, retune the thresholds.
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCK *x, int mi_row, int mi_col) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ int i, j, k, m;
+ v128x128 *vt;
+ v16x16 *vt2 = NULL;
+ unsigned char force_split[85];
+ int avg_32x32;
+ int max_var_32x32 = 0;
+ int min_var_32x32 = INT_MAX;
+ int var_32x32;
+ int var_64x64;
+ int min_var_64x64 = INT_MAX;
+ int max_var_64x64 = 0;
+ int avg_16x16[4];
+ int maxvar_16x16[4];
+ int minvar_16x16[4];
+ int64_t threshold_4x4avg;
+ int content_state = 0;
+ uint8_t *s;
+ const uint8_t *d;
+ int sp;
+ int dp;
+ int compute_minmax_variance = 1;
+ int is_key_frame = frame_is_intra_only(cm);
+ int pixels_wide = 128, pixels_high = 128;
+ assert(cm->seq_params.sb_size == BLOCK_64X64 ||
+ cm->seq_params.sb_size == BLOCK_128X128);
+ const int is_small_sb = (cm->seq_params.sb_size == BLOCK_64X64);
+ const int num_64x64_blocks = is_small_sb ? 1 : 4;
+
+ CHECK_MEM_ERROR(cm, vt, aom_calloc(1, sizeof(*vt)));
+
+ int64_t thresholds[5] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+ cpi->vbp_thresholds[2], cpi->vbp_thresholds[3],
+ cpi->vbp_thresholds[4] };
+
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
+ int variance4x4downsample[64];
+ int segment_id;
+ const int num_planes = av1_num_planes(cm);
+
+ segment_id = xd->mi[0]->segment_id;
+
+ set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state);
+
+ if (is_small_sb) {
+ pixels_wide = 64;
+ pixels_high = 64;
+ }
+
+  // For non-key frames, disable 4x4 average for low resolution when speed = 8
+ threshold_4x4avg = INT64_MAX;
+
+ if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3);
+ if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3);
+
+ s = x->plane[0].src.buf;
+ sp = x->plane[0].src.stride;
+
+  // Index for force_split: 0 for 128x128, 1-4 for the 64x64 blocks,
+  // 5-20 for the 32x32 blocks, 21-84 for the 16x16 blocks.
+ force_split[0] = 0;
+
+ if (!is_key_frame) {
+ // TODO(kyslov): we are assuming that the ref is LAST_FRAME! Check if it
+ // is!!
+ MB_MODE_INFO *mi = xd->mi[0];
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_yv12_buf(cm, LAST_FRAME);
+
+ assert(yv12 != NULL);
+
+ av1_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
+ get_ref_scale_factors(cm, LAST_FRAME), num_planes);
+ mi->ref_frame[0] = LAST_FRAME;
+ mi->ref_frame[1] = NONE_FRAME;
+ mi->sb_type = cm->seq_params.sb_size;
+ mi->mv[0].as_int = 0;
+ mi->interp_filters = av1_make_interp_filters(BILINEAR, BILINEAR);
+ if (xd->mb_to_right_edge >= 0 && xd->mb_to_bottom_edge >= 0) {
+ const MV dummy_mv = { 0, 0 };
+ av1_int_pro_motion_estimation(cpi, x, cm->seq_params.sb_size, mi_row,
+ mi_col, &dummy_mv);
+ }
+
+// TODO(kyslov): bring the small SAD functionality back
+#if 0
+ y_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].pre[0].buf,
+ xd->plane[0].pre[0].stride);
+#endif
+ x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv;
+
+ set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
+ av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL,
+ cm->seq_params.sb_size, AOM_PLANE_Y,
+ AOM_PLANE_Y);
+
+ d = xd->plane[0].dst.buf;
+ dp = xd->plane[0].dst.stride;
+
+ // If the y_sad is very small, take 64x64 as partition and exit.
+ // Don't check on boosted segment for now, as 64x64 is suppressed there.
+#if 0
+    if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) {
+      const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+      const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
+      if (mi_col + block_width / 2 < cm->mi_cols &&
+          mi_row + block_height / 2 < cm->mi_rows) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_128X128);
+        x->variance_low[0] = 1;
+        return 0;
+      }
+    }
+#endif
+ } else {
+ d = AV1_VAR_OFFS;
+ dp = 0;
+ }
+
+ if (low_res && threshold_4x4avg < INT64_MAX)
+ CHECK_MEM_ERROR(cm, vt2, aom_calloc(64, sizeof(*vt2)));
+ // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
+ // for splits.
+ for (m = 0; m < num_64x64_blocks; m++) {
+ const int x64_idx = ((m & 1) << 6);
+ const int y64_idx = ((m >> 1) << 6);
+ const int m2 = m << 2;
+ force_split[m + 1] = 0;
+ for (i = 0; i < 4; i++) {
+ const int x32_idx = x64_idx + ((i & 1) << 5);
+ const int y32_idx = y64_idx + ((i >> 1) << 5);
+ const int i2 = (m2 + i) << 2;
+ force_split[5 + m2 + i] = 0;
+ avg_16x16[i] = 0;
+ maxvar_16x16[i] = 0;
+ minvar_16x16[i] = INT_MAX;
+ for (j = 0; j < 4; j++) {
+ const int x16_idx = x32_idx + ((j & 1) << 4);
+ const int y16_idx = y32_idx + ((j >> 1) << 4);
+ const int split_index = 21 + i2 + j;
+ v16x16 *vst = &vt->split[m].split[i].split[j];
+ force_split[split_index] = 0;
+ variance4x4downsample[i2 + j] = 0;
+ if (!is_key_frame) {
+ fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst, pixels_wide,
+ pixels_high, is_key_frame);
+ fill_variance_tree(&vt->split[m].split[i].split[j], BLOCK_16X16);
+ get_variance(&vt->split[m].split[i].split[j].part_variances.none);
+ avg_16x16[i] +=
+ vt->split[m].split[i].split[j].part_variances.none.variance;
+ if (vt->split[m].split[i].split[j].part_variances.none.variance <
+ minvar_16x16[i])
+ minvar_16x16[i] =
+ vt->split[m].split[i].split[j].part_variances.none.variance;
+ if (vt->split[m].split[i].split[j].part_variances.none.variance >
+ maxvar_16x16[i])
+ maxvar_16x16[i] =
+ vt->split[m].split[i].split[j].part_variances.none.variance;
+ if (vt->split[m].split[i].split[j].part_variances.none.variance >
+ thresholds[3]) {
+ // 16X16 variance is above threshold for split, so force split to
+ // 8x8 for this 16x16 block (this also forces splits for upper
+ // levels).
+ force_split[split_index] = 1;
+ force_split[5 + m2 + i] = 1;
+ force_split[m + 1] = 1;
+ force_split[0] = 1;
+ } else if (compute_minmax_variance &&
+ vt->split[m]
+ .split[i]
+ .split[j]
+ .part_variances.none.variance > thresholds[2] &&
+ !cyclic_refresh_segment_id_boosted(segment_id)) {
+ // We have some nominal amount of 16x16 variance (based on average),
+ // compute the minmax over the 8x8 sub-blocks, and if above
+ // threshold, force split to 8x8 block for this 16x16 block.
+ int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+ pixels_wide, pixels_high);
+ int thresh_minmax = (int)cpi->vbp_threshold_minmax;
+ if (minmax > thresh_minmax) {
+ force_split[split_index] = 1;
+ force_split[5 + m2 + i] = 1;
+ force_split[m + 1] = 1;
+ force_split[0] = 1;
+ }
+ }
+ }
+ if (is_key_frame) {
+ force_split[split_index] = 0;
+ // Go down to 4x4 down-sampling for variance.
+ variance4x4downsample[i2 + j] = 1;
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ v8x8 *vst2 = is_key_frame ? &vst->split[k] : &vt2[i2 + j].split[k];
+ fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
+ pixels_wide, pixels_high, is_key_frame);
+ }
+ }
+ }
+ }
+ }
+
+ // Fill the rest of the variance tree by summing split partition values.
+ for (m = 0; m < num_64x64_blocks; ++m) {
+ avg_32x32 = 0;
+ const int m2 = m << 2;
+ for (i = 0; i < 4; i++) {
+ const int i2 = (m2 + i) << 2;
+ for (j = 0; j < 4; j++) {
+ const int split_index = 21 + i2 + j;
+ if (variance4x4downsample[i2 + j] == 1) {
+ v16x16 *vtemp =
+ (!is_key_frame) ? &vt2[i2 + j] : &vt->split[m].split[i].split[j];
+ for (k = 0; k < 4; k++)
+ fill_variance_tree(&vtemp->split[k], BLOCK_8X8);
+ fill_variance_tree(vtemp, BLOCK_16X16);
+ // If variance of this 16x16 block is above the threshold, force block
+ // to split. This also forces a split on the upper levels.
+ get_variance(&vtemp->part_variances.none);
+ if (vtemp->part_variances.none.variance > thresholds[3]) {
+ force_split[split_index] = 1;
+ force_split[5 + m2 + i] = 1;
+ force_split[m + 1] = 1;
+ force_split[0] = 1;
+ }
+ }
+ }
+ fill_variance_tree(&vt->split[m].split[i], BLOCK_32X32);
+      // If variance of this 32x32 block is above the threshold, or if it's above
+ // (some threshold of) the average variance over the sub-16x16 blocks,
+ // then force this block to split. This also forces a split on the upper
+ // (64x64) level.
+ if (!force_split[5 + m2 + i]) {
+ get_variance(&vt->split[m].split[i].part_variances.none);
+ var_32x32 = vt->split[m].split[i].part_variances.none.variance;
+ max_var_32x32 = AOMMAX(var_32x32, max_var_32x32);
+ min_var_32x32 = AOMMIN(var_32x32, min_var_32x32);
+ if (vt->split[m].split[i].part_variances.none.variance >
+ thresholds[2] ||
+ (!is_key_frame &&
+ vt->split[m].split[i].part_variances.none.variance >
+ (thresholds[2] >> 1) &&
+ vt->split[m].split[i].part_variances.none.variance >
+ (avg_16x16[i] >> 1))) {
+ force_split[5 + m2 + i] = 1;
+ force_split[m + 1] = 1;
+ force_split[0] = 1;
+ } else if (!is_key_frame && cm->height <= 360 &&
+ (maxvar_16x16[i] - minvar_16x16[i]) > (thresholds[2] >> 1) &&
+ maxvar_16x16[i] > thresholds[2]) {
+ force_split[5 + m2 + i] = 1;
+ force_split[m + 1] = 1;
+ force_split[0] = 1;
+ }
+ avg_32x32 += var_32x32;
+ }
+ }
+ if (!force_split[1 + m]) {
+ fill_variance_tree(&vt->split[m], BLOCK_64X64);
+ get_variance(&vt->split[m].part_variances.none);
+ var_64x64 = vt->split[m].part_variances.none.variance;
+ max_var_64x64 = AOMMAX(var_64x64, max_var_64x64);
+ min_var_64x64 = AOMMIN(var_64x64, min_var_64x64);
+      // If the spread of variance over the sub-32x32 blocks is large and the
+      // largest such variance is above (half of) the 64x64 threshold, then
+      // force this block to split. Noise-level gating is not hooked up yet.
+
+ if (!is_key_frame &&
+ (max_var_32x32 - min_var_32x32) > 3 * (thresholds[1] >> 3) &&
+ max_var_32x32 > thresholds[1] >> 1)
+ force_split[1 + m] = 1;
+ }
+ if (is_small_sb) force_split[0] = 1;
+ }
+
+ if (!force_split[0]) {
+ fill_variance_tree(vt, BLOCK_128X128);
+ get_variance(&vt->part_variances.none);
+ if (!is_key_frame &&
+ (max_var_64x64 - min_var_64x64) > 3 * (thresholds[0] >> 3) &&
+ max_var_64x64 > thresholds[0] >> 1)
+ force_split[0] = 1;
+ }
+
+ if (!set_vt_partitioning(cpi, x, xd, tile, vt, BLOCK_128X128, mi_row, mi_col,
+ thresholds[0], BLOCK_16X16, force_split[0])) {
+ for (m = 0; m < num_64x64_blocks; ++m) {
+ const int x64_idx = ((m & 1) << 4);
+ const int y64_idx = ((m >> 1) << 4);
+ const int m2 = m << 2;
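+      // Offsets here are in mi (4-pixel) units: (1 << 4) mi == 64 pixels,
+      // unlike the pixel-domain (1 << 6) offsets used when filling the tree.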
+
+ // Now go through the entire structure, splitting every block size until
+ // we get to one that's got a variance lower than our threshold.
+ if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m], BLOCK_64X64,
+ mi_row + y64_idx, mi_col + x64_idx,
+ thresholds[1], BLOCK_16X16,
+ force_split[1 + m])) {
+ for (i = 0; i < 4; ++i) {
+ const int x32_idx = ((i & 1) << 3);
+ const int y32_idx = ((i >> 1) << 3);
+ const int i2 = (m2 + i) << 2;
+ if (!set_vt_partitioning(cpi, x, xd, tile, &vt->split[m].split[i],
+ BLOCK_32X32, (mi_row + y64_idx + y32_idx),
+ (mi_col + x64_idx + x32_idx), thresholds[2],
+ BLOCK_16X16, force_split[5 + m2 + i])) {
+ for (j = 0; j < 4; ++j) {
+ const int x16_idx = ((j & 1) << 2);
+ const int y16_idx = ((j >> 1) << 2);
+ const int split_index = 21 + i2 + j;
+ // For inter frames: if variance4x4downsample[] == 1 for this
+ // 16x16 block, then the variance is based on 4x4 down-sampling,
+              // so use vt2 in set_vt_partitioning(), otherwise use vt.
+ v16x16 *vtemp =
+ (!is_key_frame && variance4x4downsample[i2 + j] == 1)
+ ? &vt2[i2 + j]
+ : &vt->split[m].split[i].split[j];
+ if (!set_vt_partitioning(cpi, x, xd, tile, vtemp, BLOCK_16X16,
+ mi_row + y64_idx + y32_idx + y16_idx,
+ mi_col + x64_idx + x32_idx + x16_idx,
+ thresholds[3], BLOCK_8X8,
+ force_split[split_index])) {
+ for (k = 0; k < 4; ++k) {
+ const int x8_idx = (k & 1) << 1;
+ const int y8_idx = (k >> 1) << 1;
+ set_block_size(
+ cpi, x, xd,
+ (mi_row + y64_idx + y32_idx + y16_idx + y8_idx),
+ (mi_col + x64_idx + x32_idx + x16_idx + x8_idx),
+ BLOCK_8X8);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (vt2) aom_free(vt2);
+ if (vt) aom_free(vt);
+ return 0;
+}
diff --git a/libaom/av1/encoder/var_based_part.h b/libaom/av1/encoder/var_based_part.h
new file mode 100644
index 0000000..c355224
--- /dev/null
+++ b/libaom/av1/encoder/var_based_part.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_VAR_BASED_PART_H_
+#define AOM_AV1_ENCODER_VAR_BASED_PART_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_set_variance_partition_thresholds(AV1_COMP *cpi, int q,
+ int content_state);
+
+int av1_choose_var_based_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCK *x, int mi_row, int mi_col);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_VAR_BASED_PART_H_
diff --git a/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
index 13982cc..9483063 100644
--- a/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
+++ b/libaom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -1408,12 +1408,6 @@ static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
output[15] = x1[0];
}
-static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
- const __m256i scale__r = pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
- const __m256i b = _mm256_madd_epi16(a, scale__r);
- return _mm256_srai_epi32(b, NewSqrt2Bits);
-}
-
static INLINE void fidentity16x16_new_avx2(const __m256i *input,
__m256i *output, int8_t cos_bit) {
(void)cos_bit;
@@ -1997,6 +1991,794 @@ static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
}
}
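+
+// btf_16_avx2: a 16-lane butterfly. For each 16-bit lane pair (a, b), with
+// r = 1 << (cos_bit - 1) and the weights packed as epi16 pairs
+// w0 = (w0a, w0b), w1 = (w1a, w1b), it computes
+//   u = (a * w0a + b * w0b + r) >> cos_bit
+//   v = (a * w1a + b * w1b + r) >> cos_bit
+// and saturating-packs the results back to 16 bits, returning the u halves
+// in out0/out2 and the v halves in out1/out3.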
+static INLINE void btf_16_avx2(__m256i *w0, __m256i *w1, __m256i *in0,
+ __m256i *in1, __m128i *out0, __m128i *out1,
+ __m128i *out2, __m128i *out3,
+ const __m256i *__rounding, int8_t *cos_bit) {
+ __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
+ __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
+ __m256i u0 = _mm256_madd_epi16(t0, *w0);
+ __m256i u1 = _mm256_madd_epi16(t1, *w0);
+ __m256i v0 = _mm256_madd_epi16(t0, *w1);
+ __m256i v1 = _mm256_madd_epi16(t1, *w1);
+
+ __m256i a0 = _mm256_add_epi32(u0, *__rounding);
+ __m256i a1 = _mm256_add_epi32(u1, *__rounding);
+ __m256i b0 = _mm256_add_epi32(v0, *__rounding);
+ __m256i b1 = _mm256_add_epi32(v1, *__rounding);
+
+ __m256i c0 = _mm256_srai_epi32(a0, *cos_bit);
+ __m256i c1 = _mm256_srai_epi32(a1, *cos_bit);
+ __m256i d0 = _mm256_srai_epi32(b0, *cos_bit);
+ __m256i d1 = _mm256_srai_epi32(b1, *cos_bit);
+
+ __m256i temp0 = _mm256_packs_epi32(c0, c1);
+ __m256i temp1 = _mm256_packs_epi32(d0, d1);
+
+ *out0 = _mm256_castsi256_si128(temp0);
+ *out1 = _mm256_castsi256_si128(temp1);
+ *out2 = _mm256_extracti128_si256(temp0, 0x01);
+ *out3 = _mm256_extracti128_si256(temp1, 0x01);
+}
+
+static INLINE void fdct8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = _mm256_adds_epi16(input[0], input[7]);
+ x1[7] = _mm256_subs_epi16(input[0], input[7]);
+ x1[1] = _mm256_adds_epi16(input[1], input[6]);
+ x1[6] = _mm256_subs_epi16(input[1], input[6]);
+ x1[2] = _mm256_adds_epi16(input[2], input[5]);
+ x1[5] = _mm256_subs_epi16(input[2], input[5]);
+ x1[3] = _mm256_adds_epi16(input[3], input[4]);
+ x1[4] = _mm256_subs_epi16(input[3], input[4]);
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm256_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm256_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], __rounding,
+ cos_bit);
+ x2[5] = x1[5];
+ x2[6] = x1[6];
+ x2[7] = x1[7];
+
+ // stage 3
+ __m256i x3[8];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x2[0], &x2[1], __rounding,
+ cos_bit);
+ x3[0] = x2[0];
+ x3[1] = x2[1];
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x2[2], &x2[3], __rounding,
+ cos_bit);
+ x3[2] = x2[2];
+ x3[3] = x2[3];
+ x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm256_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm256_adds_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x3[4], &x3[7], __rounding,
+ cos_bit);
+ x4[4] = x3[4];
+ x4[7] = x3[7];
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x3[5], &x3[6], __rounding,
+ cos_bit);
+ x4[5] = x3[5];
+ x4[6] = x3[6];
+ // stage 5
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
+static INLINE void fadst8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ __m256i cospi_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+ __m256i cospi_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+ __m256i cospi_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+ __m256i cospi_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+ __m256i cospi_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+ __m256i cospi_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+ __m256i cospi_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+ __m256i cospi_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm256_subs_epi16(__zero, input[7]);
+ x1[2] = _mm256_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm256_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm256_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], __rounding,
+ cos_bit);
+ x2[2] = x1[2];
+ x2[3] = x1[3];
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], __rounding,
+ cos_bit);
+ x2[6] = x1[6];
+ x2[7] = x1[7];
+
+ // stage 3
+ __m256i x3[8];
+ x3[0] = _mm256_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm256_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm256_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm256_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm256_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm256_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm256_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm256_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x3[4], &x3[5], __rounding,
+ cos_bit);
+ x4[4] = x3[4];
+ x4[5] = x3[5];
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x3[6], &x3[7], __rounding,
+ cos_bit);
+ x4[6] = x3[6];
+ x4[7] = x3[7];
+
+ // stage 5
+ __m256i x5[8];
+ x5[0] = _mm256_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm256_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm256_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm256_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm256_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm256_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm256_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm256_subs_epi16(x4[3], x4[7]);
+
+ // stage 6
+ __m256i x6[8];
+ btf_16_w16_avx2(cospi_p04_p60, cospi_p60_m04, &x5[0], &x5[1], __rounding,
+ cos_bit);
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ btf_16_w16_avx2(cospi_p20_p44, cospi_p44_m20, &x5[2], &x5[3], __rounding,
+ cos_bit);
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ btf_16_w16_avx2(cospi_p36_p28, cospi_p28_m36, &x5[4], &x5[5], __rounding,
+ cos_bit);
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ btf_16_w16_avx2(cospi_p52_p12, cospi_p12_m52, &x5[6], &x5[7], __rounding,
+ cos_bit);
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+
+ // stage 7
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
+static INLINE void fidentity8x8_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+
+ output[0] = _mm256_adds_epi16(input[0], input[0]);
+ output[1] = _mm256_adds_epi16(input[1], input[1]);
+ output[2] = _mm256_adds_epi16(input[2], input[2]);
+ output[3] = _mm256_adds_epi16(input[3], input[3]);
+ output[4] = _mm256_adds_epi16(input[4], input[4]);
+ output[5] = _mm256_adds_epi16(input[5], input[5]);
+ output[6] = _mm256_adds_epi16(input[6], input[6]);
+ output[7] = _mm256_adds_epi16(input[7], input[7]);
+}
+
+static INLINE void fdct8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ __m128i temp0, temp1, temp2, temp3;
+ __m256i in0, in1;
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+ __m256i cospi_arr[12];
+
+ cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m32_p32),
+ cospi_m32_p32, 0x1);
+ cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);
+ cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p48_p16, 0x1);
+ cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_m16_p48, 0x1);
+ cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m16_p48),
+ cospi_m48_m16, 0x1);
+ cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_p16),
+ cospi_m16_p48, 0x1);
+ cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_p08),
+ cospi_p24_p40, 0x1);
+ cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m08_p56),
+ cospi_m40_p24, 0x1);
+ cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p60_p04),
+ cospi_p28_p36, 0x1);
+ cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m04_p60),
+ cospi_m36_p28, 0x1);
+ cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p44_p20),
+ cospi_p12_p52, 0x1);
+ cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m20_p44),
+ cospi_m52_p12, 0x1);
+
+ __m256i x[8];
+ x[0] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[1], 0x1);
+ x[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[15]), input[14],
+ 0x1);
+ x[2] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[3], 0x1);
+ x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[13]), input[12],
+ 0x1);
+ x[4] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[4], 0x1);
+ x[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[11],
+ 0x1);
+ x[6] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[6], 0x1);
+ x[7] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[9], 0x1);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = _mm256_adds_epi16(x[0], x[1]);
+ x1[7] = _mm256_subs_epi16(x[0], x[1]);
+ x1[1] = _mm256_adds_epi16(x[2], x[3]);
+ x1[6] = _mm256_subs_epi16(x[2], x[3]);
+ x1[2] = _mm256_adds_epi16(x[4], x[5]);
+ x1[5] = _mm256_subs_epi16(x[4], x[5]);
+ x1[3] = _mm256_adds_epi16(x[6], x[7]);
+ x1[4] = _mm256_subs_epi16(x[6], x[7]);
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = _mm256_adds_epi16(x1[0], x1[3]);
+ x2[7] = _mm256_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm256_adds_epi16(x1[1], x1[2]);
+ x2[6] = _mm256_subs_epi16(x1[1], x1[2]);
+ x2[2] = x1[4];
+ x2[3] = x1[7];
+ btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &x1[5], &x1[6], &temp0, &temp1,
+ &temp2, &temp3, &__rounding_256, &cos_bit);
+ x2[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp0, 0x1);
+ x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
+
+ // stage 3
+ __m256i x3[8];
+ x2[1] = _mm256_permute4x64_epi64(x2[1], 0x4e);
+ x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+ x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+ x3[2] = _mm256_blend_epi32(x2[7], x2[6], 0xf0);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, _mm256_castsi256_si128(x2[6]),
+ _mm256_extractf128_si256(x2[7], 0x01), temp0, temp1);
+ x3[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp1), temp0, 0x1);
+ x3[3] = _mm256_adds_epi16(x2[2], x2[4]);
+ x3[4] = _mm256_subs_epi16(x2[2], x2[4]);
+ x3[5] = _mm256_adds_epi16(x2[3], x2[5]);
+ x3[6] = _mm256_subs_epi16(x2[3], x2[5]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = _mm256_blend_epi32(x3[0], x3[1], 0xf0);
+ x4[1] = _mm256_permute2f128_si256(x3[0], x3[1], 0x21);
+ btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &x4[0], &x4[1], &output[0],
+ &output[8], &output[4], &output[12], &__rounding_256, &cos_bit);
+ x4[2] = _mm256_adds_epi16(x3[2], x3[7]);
+ x4[3] = _mm256_subs_epi16(x3[2], x3[7]);
+ x4[4] = _mm256_permute2f128_si256(x3[3], x3[4], 0x20);
+ x4[5] = _mm256_permute2f128_si256(x3[6], x3[5], 0x20);
+ in0 = _mm256_permute2f128_si256(x3[3], x3[4], 0x31);
+ in1 = _mm256_permute2f128_si256(x3[5], x3[6], 0x31);
+ btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+
+ x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp2, 0x1);
+ x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp3), temp1, 0x1);
+
+ // stage 5
+ __m256i x5[4];
+ in0 = _mm256_permute2f128_si256(x4[2], x4[3], 0x31);
+ in1 = _mm256_permute2f128_si256(x4[2], x4[3], 0x20);
+ btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &output[2], &output[14],
+ &output[10], &output[6], &__rounding_256, &cos_bit);
+ x5[0] = _mm256_adds_epi16(x4[4], x4[6]);
+ x5[1] = _mm256_subs_epi16(x4[4], x4[6]);
+ x5[2] = _mm256_adds_epi16(x4[5], x4[7]);
+ x5[3] = _mm256_subs_epi16(x4[5], x4[7]);
+
+ // stage 6
+ in0 = _mm256_permute2f128_si256(x5[0], x5[1], 0x20);
+ in1 = _mm256_permute2f128_si256(x5[2], x5[3], 0x31);
+ btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &output[1], &output[15],
+ &output[9], &output[7], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x5[1], x5[0], 0x31);
+ in1 = _mm256_permute2f128_si256(x5[3], x5[2], 0x20);
+ btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &output[5],
+ &output[11], &output[13], &output[3], &__rounding_256, &cos_bit);
+}
+
+static INLINE void fadst8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i __rounding_256 = _mm256_set1_epi32(1 << (cos_bit - 1));
+ __m256i in0, in1;
+ __m128i temp0, temp1, temp2, temp3;
+
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ __m256i cospi_arr[20];
+
+ cospi_arr[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);
+ cospi_arr[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_p32_m32, 0x1);
+ cospi_arr[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_p32),
+ cospi_p32_p32, 0x1);
+ cospi_arr[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p32_m32),
+ cospi_p32_m32, 0x1);
+ cospi_arr[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+ cospi_m48_p16, 0x1);
+ cospi_arr[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+ cospi_p16_p48, 0x1);
+ cospi_arr[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p16_p48),
+ cospi_m48_p16, 0x1);
+ cospi_arr[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p48_m16),
+ cospi_p16_p48, 0x1);
+ cospi_arr[8] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+ cospi_p40_p24, 0x1);
+ cospi_arr[9] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p56_m08),
+ cospi_p24_m40, 0x1);
+ cospi_arr[10] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_m56_p08),
+ cospi_m24_p40, 0x1);
+ cospi_arr[11] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p08_p56),
+ cospi_p40_p24, 0x1);
+ cospi_arr[12] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p02_p62),
+ cospi_p10_p54, 0x1);
+ cospi_arr[13] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p62_m02),
+ cospi_p54_m10, 0x1);
+ cospi_arr[14] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p18_p46),
+ cospi_p26_p38, 0x1);
+ cospi_arr[15] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p46_m18),
+ cospi_p38_m26, 0x1);
+ cospi_arr[16] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p34_p30),
+ cospi_p42_p22, 0x1);
+ cospi_arr[17] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p30_m34),
+ cospi_p22_m42, 0x1);
+ cospi_arr[18] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p50_p14),
+ cospi_p58_p06, 0x1);
+ cospi_arr[19] = _mm256_insertf128_si256(_mm256_castsi128_si256(cospi_p14_m50),
+ cospi_p06_m58, 0x1);
+
+ __m256i x[8];
+ x[0] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[0]), input[4], 0x1);
+ x[1] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[2]), input[6], 0x1);
+ x[2] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[8]), input[12], 0x1);
+ x[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(input[10]), input[14],
+ 0x1);
+ x[4] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[1]), input[9], 0x1);
+ x[5] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[3]), input[11], 0x1);
+ x[6] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[5]), input[13], 0x1);
+ x[7] =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(input[7]), input[15], 0x1);
+
+ // stage 1
+ __m256i x1[8];
+ x1[0] = x[0];
+ x1[1] = _mm256_subs_epi16(__zero, x[7]);
+ x1[2] = x[2];
+ x1[3] = _mm256_subs_epi16(__zero, x[5]);
+ x1[4] = _mm256_subs_epi16(__zero, x[4]);
+ x1[5] = x[3];
+ x1[6] = _mm256_subs_epi16(__zero, x[6]);
+ x1[7] = x[1];
+
+ // stage 2
+ __m256i x2[8];
+ x2[0] = _mm256_blend_epi32(x1[0], x1[1], 0xf0);
+ x2[3] = _mm256_blend_epi32(x1[3], x1[2], 0xf0);
+ x2[4] = _mm256_blend_epi32(x1[4], x1[5], 0xf0);
+ x2[7] = _mm256_blend_epi32(x1[7], x1[6], 0xf0);
+ in0 = _mm256_blend_epi32(x1[1], x1[0], 0xf0);
+ in1 = _mm256_blend_epi32(x1[2], x1[3], 0xf0);
+ btf_16_avx2(&cospi_arr[0], &cospi_arr[1], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x2[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x2[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+ in0 = _mm256_permute2f128_si256(x1[7], x1[6], 0x21);
+ in1 = _mm256_permute2f128_si256(x1[4], x1[5], 0x21);
+ btf_16_avx2(&cospi_arr[2], &cospi_arr[3], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x2[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x2[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 3
+ __m256i x3[8];
+ x3[0] = _mm256_adds_epi16(x2[0], x2[1]);
+ x3[1] = _mm256_subs_epi16(x2[0], x2[1]);
+ x3[2] = _mm256_adds_epi16(x2[3], x2[2]);
+ x3[3] = _mm256_subs_epi16(x2[3], x2[2]);
+ x3[4] = _mm256_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm256_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm256_adds_epi16(x2[7], x2[6]);
+ x3[7] = _mm256_subs_epi16(x2[7], x2[6]);
+
+ // stage 4
+ __m256i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[4] = x3[4];
+ x4[5] = x3[5];
+ in0 = _mm256_permute2f128_si256(x3[2], x3[3], 0x20);
+ in1 = _mm256_permute2f128_si256(x3[2], x3[3], 0x31);
+ btf_16_avx2(&cospi_arr[4], &cospi_arr[5], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x4[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x4[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+ in0 = _mm256_permute2f128_si256(x3[6], x3[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x3[6], x3[7], 0x31);
+ btf_16_avx2(&cospi_arr[6], &cospi_arr[7], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x4[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x4[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 5
+ __m256i x5[8];
+ x5[0] = _mm256_adds_epi16(x4[0], x4[2]);
+ x5[1] = _mm256_subs_epi16(x4[0], x4[2]);
+ x5[2] = _mm256_adds_epi16(x4[1], x4[3]);
+ x5[3] = _mm256_subs_epi16(x4[1], x4[3]);
+ x5[4] = _mm256_adds_epi16(x4[4], x4[6]);
+ x5[5] = _mm256_subs_epi16(x4[4], x4[6]);
+ x5[6] = _mm256_adds_epi16(x4[5], x4[7]);
+ x5[7] = _mm256_subs_epi16(x4[5], x4[7]);
+
+ // stage 6
+ __m256i x6[8];
+ x6[0] = x5[0];
+ x6[1] = x5[2];
+ x6[2] = x5[1];
+ x6[3] = x5[3];
+ in0 = _mm256_permute2f128_si256(x5[4], x5[6], 0x20);
+ in1 = _mm256_permute2f128_si256(x5[4], x5[6], 0x31);
+ btf_16_avx2(&cospi_arr[8], &cospi_arr[9], &in0, &in1, &temp0, &temp1, &temp2,
+ &temp3, &__rounding_256, &cos_bit);
+ x6[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x6[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+ in0 = _mm256_permute2f128_si256(x5[5], x5[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x5[5], x5[7], 0x31);
+ btf_16_avx2(&cospi_arr[10], &cospi_arr[11], &in0, &in1, &temp0, &temp1,
+ &temp2, &temp3, &__rounding_256, &cos_bit);
+ x6[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp0), temp1, 0x1);
+ x6[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(temp2), temp3, 0x1);
+
+ // stage 7
+ __m256i x7[8];
+ x7[0] = _mm256_adds_epi16(x6[0], x6[4]);
+ x7[1] = _mm256_subs_epi16(x6[0], x6[4]);
+ x7[2] = _mm256_adds_epi16(x6[1], x6[5]);
+ x7[3] = _mm256_subs_epi16(x6[1], x6[5]);
+ x7[4] = _mm256_adds_epi16(x6[2], x6[6]);
+ x7[5] = _mm256_subs_epi16(x6[2], x6[6]);
+ x7[6] = _mm256_adds_epi16(x6[3], x6[7]);
+ x7[7] = _mm256_subs_epi16(x6[3], x6[7]);
+
+ // stage 8
+ in0 = _mm256_permute2f128_si256(x7[0], x7[2], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[0], x7[2], 0x31);
+ btf_16_avx2(&cospi_arr[12], &cospi_arr[13], &in0, &in1, &output[15],
+ &output[0], &output[13], &output[2], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[4], x7[6], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[4], x7[6], 0x31);
+ btf_16_avx2(&cospi_arr[14], &cospi_arr[15], &in0, &in1, &output[11],
+ &output[4], &output[9], &output[6], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[1], x7[3], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[1], x7[3], 0x31);
+ btf_16_avx2(&cospi_arr[16], &cospi_arr[17], &in0, &in1, &output[7],
+ &output[8], &output[5], &output[10], &__rounding_256, &cos_bit);
+ in0 = _mm256_permute2f128_si256(x7[5], x7[7], 0x20);
+ in1 = _mm256_permute2f128_si256(x7[5], x7[7], 0x31);
+ btf_16_avx2(&cospi_arr[18], &cospi_arr[19], &in0, &in1, &output[3],
+ &output[12], &output[1], &output[14], &__rounding_256, &cos_bit);
+}
+
+static INLINE void fidentity8x16_new_avx2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i temp;
+ for (int i = 0; i < 16; i += 2) {
+ temp = _mm256_insertf128_si256(_mm256_castsi128_si256(input[i]),
+ input[i + 1], 0x1);
+ const __m256i a_lo = _mm256_unpacklo_epi16(temp, one);
+ const __m256i a_hi = _mm256_unpackhi_epi16(temp, one);
+ const __m256i b_lo = scale_round_avx2(a_lo, 2 * NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, 2 * NewSqrt2);
+ temp = _mm256_packs_epi32(b_lo, b_hi);
+ output[i] = _mm256_castsi256_si128(temp);
+ output[i + 1] = _mm256_extractf128_si256(temp, 0x1);
+ }
+}
+
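+// Dispatch tables: select the 1D column and row kernels per TX_TYPE; the
+// identity kernels implement IDTX and the V_*/H_* hybrid transforms.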
+static const transform_1d_avx2 row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x8_new_avx2, // DCT_DCT
+ fdct8x8_new_avx2, // ADST_DCT
+ fadst8x8_new_avx2, // DCT_ADST
+ fadst8x8_new_avx2, // ADST_ADST
+ fdct8x8_new_avx2, // FLIPADST_DCT
+ fadst8x8_new_avx2, // DCT_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_FLIPADST
+ fadst8x8_new_avx2, // ADST_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_ADST
+ fidentity8x8_new_avx2, // IDTX
+ fidentity8x8_new_avx2, // V_DCT
+ fdct8x8_new_avx2, // H_DCT
+ fidentity8x8_new_avx2, // V_ADST
+ fadst8x8_new_avx2, // H_ADST
+ fidentity8x8_new_avx2, // V_FLIPADST
+ fadst8x8_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_avx2, // DCT_DCT
+ fadst8x16_new_avx2, // ADST_DCT
+ fdct8x16_new_avx2, // DCT_ADST
+ fadst8x16_new_avx2, // ADST_ADST
+ fadst8x16_new_avx2, // FLIPADST_DCT
+ fdct8x16_new_avx2, // DCT_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_FLIPADST
+ fadst8x16_new_avx2, // ADST_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_ADST
+ fidentity8x16_new_avx2, // IDTX
+ fdct8x16_new_avx2, // V_DCT
+ fidentity8x16_new_avx2, // H_DCT
+ fadst8x16_new_avx2, // V_ADST
+ fidentity8x16_new_avx2, // H_ADST
+ fadst8x16_new_avx2, // V_FLIPADST
+ fidentity8x16_new_avx2 // H_FLIPADST
+};
+
+static const transform_1d_avx2 col_txfm16x8_arr[TX_TYPES] = {
+ fdct8x8_new_avx2, // DCT_DCT
+ fadst8x8_new_avx2, // ADST_DCT
+ fdct8x8_new_avx2, // DCT_ADST
+ fadst8x8_new_avx2, // ADST_ADST
+ fadst8x8_new_avx2, // FLIPADST_DCT
+ fdct8x8_new_avx2, // DCT_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_FLIPADST
+ fadst8x8_new_avx2, // ADST_FLIPADST
+ fadst8x8_new_avx2, // FLIPADST_ADST
+ fidentity8x8_new_avx2, // IDTX
+ fdct8x8_new_avx2, // V_DCT
+ fidentity8x8_new_avx2, // H_DCT
+ fadst8x8_new_avx2, // V_ADST
+ fidentity8x8_new_avx2, // H_ADST
+ fadst8x8_new_avx2, // V_FLIPADST
+ fidentity8x8_new_avx2, // H_FLIPADST
+};
+
+static const transform_1d_sse2 row_txfm16x8_arr[TX_TYPES] = {
+ fdct8x16_new_avx2, // DCT_DCT
+ fdct8x16_new_avx2, // ADST_DCT
+ fadst8x16_new_avx2, // DCT_ADST
+ fadst8x16_new_avx2, // ADST_ADST
+ fdct8x16_new_avx2, // FLIPADST_DCT
+ fadst8x16_new_avx2, // DCT_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_FLIPADST
+ fadst8x16_new_avx2, // ADST_FLIPADST
+ fadst8x16_new_avx2, // FLIPADST_ADST
+ fidentity8x16_new_avx2, // IDTX
+ fidentity8x16_new_avx2, // V_DCT
+ fdct8x16_new_avx2, // H_DCT
+ fidentity8x16_new_avx2, // V_ADST
+ fadst8x16_new_avx2, // H_ADST
+ fidentity8x16_new_avx2, // V_FLIPADST
+ fadst8x16_new_avx2 // H_FLIPADST
+};
+
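+// 8x16 lowbd forward 2D transform: load the columns (with optional up/down
+// or left/right flip), round-shift, column transform, transpose, row
+// transform on one packed 16-lane register, then round-shift and store the
+// rectangular output as 32-bit.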
+static void lowbd_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ __m256i buf2[8];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 8;
+ const int height = 16;
+ const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm8x16_arr[tx_type];
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
+ } else {
+ load_buffer_16bit_to_16bit(input, stride, buf0, height);
+ }
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ transpose_16bit_8x8(buf0, buf1);
+ transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+ __m128i *bufl, *bufu;
+ if (lr_flip) {
+ bufl = buf0;
+ bufu = buf0 + 8;
+ flip_buf_sse2(buf1 + width * 0, bufl, width);
+ flip_buf_sse2(buf1 + width * 1, bufu, width);
+ } else {
+ bufl = buf1 + width * 0;
+ bufu = buf1 + width * 1;
+ }
+ pack_reg(bufl, bufu, buf2);
+ row_txfm(buf2, buf2, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf2, width, shift[2]);
+ transpose_16bit_16x8_avx2(buf2, buf2);
+ store_rect_buffer_16bit_to_32bit_w8_avx2(buf2, output, width, 8);
+}
+
+static void lowbd_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf0[16], buf1[16];
+ __m256i buf2[8];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 16;
+ const int height = 8;
+ const transform_1d_avx2 col_txfm = col_txfm16x8_arr[tx_type];
+ const transform_1d_sse2 row_txfm = row_txfm16x8_arr[tx_type];
+ __m128i *buf;
+ int ud_flip, lr_flip;
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip(input + 8 * 0, stride, buf0, height);
+ load_buffer_16bit_to_16bit_flip(input + 8 * 1, stride, &buf0[8], height);
+ } else {
+ load_buffer_16bit_to_16bit(input + 8 * 0, stride, buf0, height);
+ load_buffer_16bit_to_16bit(input + 8 * 1, stride, &buf0[8], height);
+ }
+ pack_reg(buf0, &buf0[8], buf2);
+ round_shift_16bit_w16_avx2(buf2, height, shift[0]);
+ col_txfm(buf2, buf2, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf2, height, shift[1]);
+ transpose_16bit_16x8_avx2(buf2, buf2);
+ extract_reg(buf2, buf1);
+
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_sse2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, width, shift[2]);
+ transpose_16bit_8x8(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w8(buf, output, width, height);
+ transpose_16bit_8x8(buf + 8, buf + 8);
+ store_rect_buffer_16bit_to_32bit_w8(buf + 8, output + 8, width, height);
+}
+
static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
@@ -2005,8 +2787,8 @@ static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform
av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
- av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
- av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ lowbd_fwd_txfm2d_8x16_avx2, // 8x16 transform
+ lowbd_fwd_txfm2d_16x8_avx2, // 16x8 transform
lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform
lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform
lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform
diff --git a/libaom/av1/encoder/x86/corner_match_avx2.c b/libaom/av1/encoder/x86/corner_match_avx2.c
new file mode 100644
index 0000000..7a3b999
--- /dev/null
+++ b/libaom/av1/encoder/x86/corner_match_avx2.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <immintrin.h>
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/encoder/corner_match.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, byte_mask[16]) = {
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0
+};
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_avx2.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the
+ * correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ
+ * windows of each image, centered at (x1, y1) and (x2, y2) respectively. */
+double compute_cross_correlation_avx2(unsigned char *im1, int stride1, int x1,
+ int y1, unsigned char *im2, int stride2,
+ int x2, int y2) {
+ int i, stride1_i = 0, stride2_i = 0;
+ __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1;
+ const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
+ const __m256i zero = _mm256_setzero_si256();
+ __m128i v1, v2;
+
+ sum_vec = zero;
+ sumsq2_vec = zero;
+ cross_vec = zero;
+
+ im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+ im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+ for (i = 0; i < MATCH_SZ; ++i) {
+ v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[stride1_i]), mask);
+ v1_1 = _mm256_cvtepu8_epi16(v1);
+ v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[stride2_i]), mask);
+ v2_1 = _mm256_cvtepu8_epi16(v2);
+
+ v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1);
+ sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1));
+
+ sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero));
+ cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1));
+ stride1_i += stride1;
+ stride2_i += stride2;
+ }
+ __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8);
+ sum_vec = _mm256_add_epi32(sum_vec, sum_vec1);
+ int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec));
+ int sum2_acc = _mm256_extract_epi32(sum_vec, 4);
+
+ __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec);
+ __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec);
+ temp1 = _mm256_add_epi32(unp_low, unp_hig);
+
+ __m128i low_sumsq = _mm256_castsi256_si128(temp1);
+ low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1));
+ low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32));
+ int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq);
+ int cross_acc = _mm_extract_epi32(low_sumsq, 2);
+
+ int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc;
+ int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc;
+ return cov / sqrt((double)var2);
+}
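The kernel above vectorizes a straightforward double loop over the two
MATCH_SZ x MATCH_SZ windows. A scalar reference of the same quantity, handy
for cross-checking the SIMD path in tests (a sketch assuming the MATCH_SZ,
MATCH_SZ_BY2 and MATCH_SZ_SQ macros from av1/encoder/corner_match.h):

    // Scalar reference: corr(im1, im2) * MATCH_SZ * stddev(im1).
    static double cross_correlation_ref(const unsigned char *im1, int stride1,
                                        int x1, int y1,
                                        const unsigned char *im2, int stride2,
                                        int x2, int y2) {
      int sum1 = 0, sum2 = 0, sumsq2 = 0, cross = 0;
      for (int i = 0; i < MATCH_SZ; ++i) {
        for (int j = 0; j < MATCH_SZ; ++j) {
          const int v1 =
              im1[(y1 - MATCH_SZ_BY2 + i) * stride1 + (x1 - MATCH_SZ_BY2 + j)];
          const int v2 =
              im2[(y2 - MATCH_SZ_BY2 + i) * stride2 + (x2 - MATCH_SZ_BY2 + j)];
          sum1 += v1;
          sum2 += v2;
          sumsq2 += v2 * v2;
          cross += v1 * v2;
        }
      }
      const int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
      const int cov = cross * MATCH_SZ_SQ - sum1 * sum2;
      return cov / sqrt((double)var2);
    }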
diff --git a/libaom/av1/encoder/x86/encodetxb_avx2.c b/libaom/av1/encoder/x86/encodetxb_avx2.c
index 7642f57..2621301 100644
--- a/libaom/av1/encoder/x86/encodetxb_avx2.c
+++ b/libaom/av1/encoder/x86/encodetxb_avx2.c
@@ -26,14 +26,6 @@ void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,
const int stride = width + TX_PAD_HOR;
const __m256i y_zeros = _mm256_setzero_si256();
- const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
- uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
- uint8_t *pre_buf_end = pre_buf + pre_len;
- do {
- yy_storeu_256(pre_buf, y_zeros);
- pre_buf += 32;
- } while (pre_buf < pre_buf_end);
-
const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));
diff --git a/libaom/av1/encoder/x86/encodetxb_sse4.c b/libaom/av1/encoder/x86/encodetxb_sse4.c
index 5e0687c..34c9e4f 100644
--- a/libaom/av1/encoder/x86/encodetxb_sse4.c
+++ b/libaom/av1/encoder/x86/encodetxb_sse4.c
@@ -23,14 +23,6 @@ void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
const int stride = width + TX_PAD_HOR;
const __m128i zeros = _mm_setzero_si128();
- const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
- uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
- uint8_t *pre_buf_end = pre_buf + pre_len;
- do {
- _mm_storeu_si128((__m128i *)(pre_buf), zeros);
- pre_buf += 16;
- } while (pre_buf < pre_buf_end);
-
const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
uint8_t *bottom_buf = levels + stride * height;
uint8_t *bottom_buf_end = bottom_buf + bottom_len;
diff --git a/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c b/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
new file mode 100644
index 0000000..719734c
--- /dev/null
+++ b/libaom/av1/encoder/x86/highbd_block_error_intrin_avx2.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <stdio.h>
+#include "aom/aom_integer.h"
+#include "av1/common/common.h"
+
+int64_t av1_highbd_block_error_avx2(tran_low_t *coeff, tran_low_t *dqcoeff,
+ intptr_t block_size, int64_t *ssz,
+ int bps) {
+ int i;
+ int64_t temp1[8];
+ int64_t error = 0, sqcoeff = 0;
+ const int shift = 2 * (bps - 8);
+ const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+
+ for (i = 0; i < block_size; i += 16) {
+ __m256i mm256_coeff = _mm256_loadu_si256((__m256i *)(coeff + i));
+ __m256i mm256_coeff2 = _mm256_loadu_si256((__m256i *)(coeff + i + 8));
+ __m256i mm256_dqcoeff = _mm256_loadu_si256((__m256i *)(dqcoeff + i));
+ __m256i mm256_dqcoeff2 = _mm256_loadu_si256((__m256i *)(dqcoeff + i + 8));
+
+ __m256i diff1 = _mm256_sub_epi32(mm256_coeff, mm256_dqcoeff);
+ __m256i diff2 = _mm256_sub_epi32(mm256_coeff2, mm256_dqcoeff2);
+ __m256i diff1h = _mm256_srli_epi64(diff1, 32);
+ __m256i diff2h = _mm256_srli_epi64(diff2, 32);
+ __m256i res = _mm256_mul_epi32(diff1, diff1);
+ __m256i res1 = _mm256_mul_epi32(diff1h, diff1h);
+ __m256i res2 = _mm256_mul_epi32(diff2, diff2);
+ __m256i res3 = _mm256_mul_epi32(diff2h, diff2h);
+ __m256i res_diff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+ _mm256_add_epi64(res2, res3));
+ __m256i mm256_coeffh = _mm256_srli_epi64(mm256_coeff, 32);
+ __m256i mm256_coeffh2 = _mm256_srli_epi64(mm256_coeff2, 32);
+ res = _mm256_mul_epi32(mm256_coeff, mm256_coeff);
+ res1 = _mm256_mul_epi32(mm256_coeffh, mm256_coeffh);
+ res2 = _mm256_mul_epi32(mm256_coeff2, mm256_coeff2);
+ res3 = _mm256_mul_epi32(mm256_coeffh2, mm256_coeffh2);
+ __m256i res_sqcoeff = _mm256_add_epi64(_mm256_add_epi64(res, res1),
+ _mm256_add_epi64(res2, res3));
+ _mm256_storeu_si256((__m256i *)temp1, res_diff);
+ _mm256_storeu_si256((__m256i *)temp1 + 1, res_sqcoeff);
+
+ error += temp1[0] + temp1[1] + temp1[2] + temp1[3];
+ sqcoeff += temp1[4] + temp1[5] + temp1[6] + temp1[7];
+ }
+ assert(error >= 0 && sqcoeff >= 0);
+ error = (error + rounding) >> shift;
+ sqcoeff = (sqcoeff + rounding) >> shift;
+
+ *ssz = sqcoeff;
+ return error;
+}
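Semantically the AVX2 kernel above matches this scalar form: accumulate the
squared coefficient error and the squared source energy over the block, then
round-shift both by 2 * (bps - 8) so values are comparable across bit depths
(a sketch; tran_low_t as declared in aom/aom_integer.h):

    // Scalar equivalent of av1_highbd_block_error_avx2.
    static int64_t highbd_block_error_ref(const tran_low_t *coeff,
                                          const tran_low_t *dqcoeff,
                                          intptr_t block_size, int64_t *ssz,
                                          int bps) {
      int64_t error = 0, sqcoeff = 0;
      const int shift = 2 * (bps - 8);
      const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
      for (intptr_t i = 0; i < block_size; ++i) {
        const int64_t diff = coeff[i] - dqcoeff[i];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i] * coeff[i];
      }
      *ssz = (sqcoeff + rounding) >> shift;
      return (error + rounding) >> shift;
    }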
diff --git a/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c b/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
new file mode 100644
index 0000000..24c513f
--- /dev/null
+++ b/libaom/av1/encoder/x86/highbd_fwd_txfm_avx2.c
@@ -0,0 +1,3170 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h> /*AVX2*/
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void av1_load_buffer_8x8_avx2(const int16_t *input, __m256i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i out1[8];
+ if (!flipud) {
+ out1[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ out1[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ out1[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ out1[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ out1[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ out1[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ out1[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ out1[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ } else {
+ out1[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ out1[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ out1[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ out1[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ out1[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ out1[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ out1[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ out1[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ }
+ if (!fliplr) {
+ out[0] = _mm256_cvtepi16_epi32(out1[0]);
+ out[1] = _mm256_cvtepi16_epi32(out1[1]);
+ out[2] = _mm256_cvtepi16_epi32(out1[2]);
+ out[3] = _mm256_cvtepi16_epi32(out1[3]);
+ out[4] = _mm256_cvtepi16_epi32(out1[4]);
+ out[5] = _mm256_cvtepi16_epi32(out1[5]);
+ out[6] = _mm256_cvtepi16_epi32(out1[6]);
+ out[7] = _mm256_cvtepi16_epi32(out1[7]);
+
+ } else {
+ out[0] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[0]));
+ out[1] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[1]));
+ out[2] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[2]));
+ out[3] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[3]));
+ out[4] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[4]));
+ out[5] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[5]));
+ out[6] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[6]));
+ out[7] = _mm256_cvtepi16_epi32(mm_reverse_epi16(out1[7]));
+ }
+ out[0] = _mm256_slli_epi32(out[0], shift);
+ out[1] = _mm256_slli_epi32(out[1], shift);
+ out[2] = _mm256_slli_epi32(out[2], shift);
+ out[3] = _mm256_slli_epi32(out[3], shift);
+ out[4] = _mm256_slli_epi32(out[4], shift);
+ out[5] = _mm256_slli_epi32(out[5], shift);
+ out[6] = _mm256_slli_epi32(out[6], shift);
+ out[7] = _mm256_slli_epi32(out[7], shift);
+}
+static INLINE void col_txfm_8x8_rounding(__m256i *in, int shift) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+
+ in[0] = _mm256_add_epi32(in[0], rounding);
+ in[1] = _mm256_add_epi32(in[1], rounding);
+ in[2] = _mm256_add_epi32(in[2], rounding);
+ in[3] = _mm256_add_epi32(in[3], rounding);
+ in[4] = _mm256_add_epi32(in[4], rounding);
+ in[5] = _mm256_add_epi32(in[5], rounding);
+ in[6] = _mm256_add_epi32(in[6], rounding);
+ in[7] = _mm256_add_epi32(in[7], rounding);
+
+ in[0] = _mm256_srai_epi32(in[0], shift);
+ in[1] = _mm256_srai_epi32(in[1], shift);
+ in[2] = _mm256_srai_epi32(in[2], shift);
+ in[3] = _mm256_srai_epi32(in[3], shift);
+ in[4] = _mm256_srai_epi32(in[4], shift);
+ in[5] = _mm256_srai_epi32(in[5], shift);
+ in[6] = _mm256_srai_epi32(in[6], shift);
+ in[7] = _mm256_srai_epi32(in[7], shift);
+}
+static INLINE void av1_load_buffer_8x16_avx2(const int16_t *input, __m256i *out,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ const int16_t *topL = input;
+ const int16_t *botL = input + 8 * stride;
+
+ const int16_t *tmp;
+
+ if (flipud) {
+ tmp = topL;
+ topL = botL;
+ botL = tmp;
+ }
+ av1_load_buffer_8x8_avx2(topL, out, stride, flipud, fliplr, shift);
+ av1_load_buffer_8x8_avx2(botL, out + 8, stride, flipud, fliplr, shift);
+}
+static INLINE void av1_load_buffer_16xn_avx2(const int16_t *input, __m256i *out,
+ int stride, int height,
+ int outstride, int flipud,
+ int fliplr) {
+ __m256i out1[64];
+ if (!flipud) {
+ for (int i = 0; i < height; i++) {
+ out1[i] = _mm256_loadu_si256((const __m256i *)(input + i * stride));
+ }
+ } else {
+ for (int i = 0; i < height; i++) {
+ out1[(height - 1) - i] =
+ _mm256_loadu_si256((const __m256i *)(input + i * stride));
+ }
+ }
+ if (!fliplr) {
+ for (int i = 0; i < height; i++) {
+ out[i * outstride] =
+ _mm256_cvtepi16_epi32(_mm256_castsi256_si128(out1[i]));
+ out[i * outstride + 1] =
+ _mm256_cvtepi16_epi32(_mm256_extractf128_si256(out1[i], 1));
+ }
+ } else {
+ for (int i = 0; i < height; i++) {
+ out[i * outstride + 1] = _mm256_cvtepi16_epi32(
+ mm_reverse_epi16(_mm256_castsi256_si128(out1[i])));
+ out[i * outstride + 0] = _mm256_cvtepi16_epi32(
+ mm_reverse_epi16(_mm256_extractf128_si256(out1[i], 1)));
+ }
+ }
+}
+
+static void av1_fwd_txfm_transpose_8x8_avx2(const __m256i *in, __m256i *out,
+ const int instride,
+ const int outstride) {
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i x0, x1;
+
+ u0 = _mm256_unpacklo_epi32(in[0 * instride], in[1 * instride]);
+ u1 = _mm256_unpackhi_epi32(in[0 * instride], in[1 * instride]);
+
+ u2 = _mm256_unpacklo_epi32(in[2 * instride], in[3 * instride]);
+ u3 = _mm256_unpackhi_epi32(in[2 * instride], in[3 * instride]);
+
+ u4 = _mm256_unpacklo_epi32(in[4 * instride], in[5 * instride]);
+ u5 = _mm256_unpackhi_epi32(in[4 * instride], in[5 * instride]);
+
+ u6 = _mm256_unpacklo_epi32(in[6 * instride], in[7 * instride]);
+ u7 = _mm256_unpackhi_epi32(in[6 * instride], in[7 * instride]);
+
+ x0 = _mm256_unpacklo_epi64(u0, u2);
+ x1 = _mm256_unpacklo_epi64(u4, u6);
+ out[0 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[4 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u0, u2);
+ x1 = _mm256_unpackhi_epi64(u4, u6);
+ out[1 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[5 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpacklo_epi64(u1, u3);
+ x1 = _mm256_unpacklo_epi64(u5, u7);
+ out[2 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[6 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u1, u3);
+ x1 = _mm256_unpackhi_epi64(u5, u7);
+ out[3 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[7 * outstride] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
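+// av1_round_shift_32_8xn_avx2: a negative bit applies a rounded right shift
+// by -bit (adding 1 << (-bit - 1) first); a positive bit applies a plain
+// left shift.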
+static INLINE void av1_round_shift_32_8xn_avx2(__m256i *in, int size, int bit,
+ int stride) {
+ if (bit < 0) {
+ bit = -bit;
+ __m256i round = _mm256_set1_epi32(1 << (bit - 1));
+ for (int i = 0; i < size; ++i) {
+ in[stride * i] = _mm256_add_epi32(in[stride * i], round);
+ in[stride * i] = _mm256_srai_epi32(in[stride * i], bit);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[stride * i] = _mm256_slli_epi32(in[stride * i], bit);
+ }
+ }
+}
+static INLINE void av1_store_buffer_avx2(const __m256i *const in, int32_t *out,
+ const int stride, const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ _mm256_store_si256((__m256i *)(out), in[i]);
+ out += stride;
+ }
+}
+static INLINE void av1_fwd_txfm_transpose_16x16_avx2(const __m256i *in,
+ __m256i *out) {
+ av1_fwd_txfm_transpose_8x8_avx2(&in[0], &out[0], 2, 2);
+ av1_fwd_txfm_transpose_8x8_avx2(&in[1], &out[16], 2, 2);
+ av1_fwd_txfm_transpose_8x8_avx2(&in[16], &out[1], 2, 2);
+ av1_fwd_txfm_transpose_8x8_avx2(&in[17], &out[17], 2, 2);
+}
+
+static INLINE __m256i av1_half_btf_avx2(const __m256i *w0, const __m256i *n0,
+ const __m256i *w1, const __m256i *n1,
+ const __m256i *rounding, int bit) {
+ __m256i x, y;
+
+ x = _mm256_mullo_epi32(*w0, *n0);
+ y = _mm256_mullo_epi32(*w1, *n1);
+ x = _mm256_add_epi32(x, y);
+ x = _mm256_add_epi32(x, *rounding);
+ x = _mm256_srai_epi32(x, bit);
+ return x;
+}
+#define btf_32_avx2_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ const __m256i ww0 = _mm256_set1_epi32(w0); \
+ const __m256i ww1 = _mm256_set1_epi32(w1); \
+ const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
+ const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \
+ out0 = _mm256_add_epi32(in0_w0, in1_w1); \
+ av1_round_shift_32_8xn_avx2(&out0, 1, -bit, 1); \
+ const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \
+ const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \
+ out1 = _mm256_sub_epi32(in0_w1, in1_w0); \
+ av1_round_shift_32_8xn_avx2(&out1, 1, -bit, 1); \
+ } while (0)
+
+#define btf_32_type0_avx2_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ const __m256i in0_w0 = _mm256_mullo_epi32(in0, ww0); \
+ const __m256i in1_w1 = _mm256_mullo_epi32(in1, ww1); \
+ out0 = _mm256_add_epi32(in0_w0, in1_w1); \
+ out0 = _mm256_add_epi32(out0, r); \
+ out0 = _mm256_srai_epi32(out0, bit); \
+ const __m256i in0_w1 = _mm256_mullo_epi32(in0, ww1); \
+ const __m256i in1_w0 = _mm256_mullo_epi32(in1, ww0); \
+ out1 = _mm256_sub_epi32(in0_w1, in1_w0); \
+ out1 = _mm256_add_epi32(out1, r); \
+ out1 = _mm256_srai_epi32(out1, bit); \
+ } while (0)
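+
+// Both macros above implement the same type-0 butterfly per 32-bit lane,
+// with r = 1 << (bit - 1):
+//   out0 = (in0 * w0 + in1 * w1 + r) >> bit
+//   out1 = (in0 * w1 - in1 * w0 + r) >> bit
+// btf_32_avx2_type0 broadcasts the scalar weights itself; the _new variant
+// takes pre-broadcast weight and rounding vectors.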
+
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out,
+ const int8_t cos_bit, int instride,
+ int outstride);
+static void av1_fdct8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ const int col_num, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ __m256i u[8], v[8];
+ for (int col = 0; col < col_num; ++col) {
+ u[0] = _mm256_add_epi32(in[0 * col_num + col], in[7 * col_num + col]);
+ v[7] = _mm256_sub_epi32(in[0 * col_num + col], in[7 * col_num + col]);
+ u[1] = _mm256_add_epi32(in[1 * col_num + col], in[6 * col_num + col]);
+ u[6] = _mm256_sub_epi32(in[1 * col_num + col], in[6 * col_num + col]);
+ u[2] = _mm256_add_epi32(in[2 * col_num + col], in[5 * col_num + col]);
+ u[5] = _mm256_sub_epi32(in[2 * col_num + col], in[5 * col_num + col]);
+ u[3] = _mm256_add_epi32(in[3 * col_num + col], in[4 * col_num + col]);
+ v[4] = _mm256_sub_epi32(in[3 * col_num + col], in[4 * col_num + col]);
+ v[0] = _mm256_add_epi32(u[0], u[3]);
+ v[3] = _mm256_sub_epi32(u[0], u[3]);
+ v[1] = _mm256_add_epi32(u[1], u[2]);
+ v[2] = _mm256_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm256_mullo_epi32(u[5], cospim32);
+ v[6] = _mm256_mullo_epi32(u[6], cospi32);
+ v[5] = _mm256_add_epi32(v[5], v[6]);
+ v[5] = _mm256_add_epi32(v[5], rnding);
+ v[5] = _mm256_srai_epi32(v[5], bit);
+
+ u[0] = _mm256_mullo_epi32(u[5], cospi32);
+ v[6] = _mm256_mullo_epi32(u[6], cospim32);
+ v[6] = _mm256_sub_epi32(u[0], v[6]);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm256_mullo_epi32(v[0], cospi32);
+ v[1] = _mm256_mullo_epi32(v[1], cospi32);
+ u[0] = _mm256_add_epi32(v[0], v[1]);
+ u[0] = _mm256_add_epi32(u[0], rnding);
+ u[0] = _mm256_srai_epi32(u[0], bit);
+
+ u[1] = _mm256_sub_epi32(v[0], v[1]);
+ u[1] = _mm256_add_epi32(u[1], rnding);
+ u[1] = _mm256_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm256_mullo_epi32(v[2], cospi48);
+ v[1] = _mm256_mullo_epi32(v[3], cospi16);
+ u[2] = _mm256_add_epi32(v[0], v[1]);
+ u[2] = _mm256_add_epi32(u[2], rnding);
+ u[2] = _mm256_srai_epi32(u[2], bit);
+
+ v[0] = _mm256_mullo_epi32(v[2], cospi16);
+ v[1] = _mm256_mullo_epi32(v[3], cospi48);
+ u[3] = _mm256_sub_epi32(v[1], v[0]);
+ u[3] = _mm256_add_epi32(u[3], rnding);
+ u[3] = _mm256_srai_epi32(u[3], bit);
+
+ u[4] = _mm256_add_epi32(v[4], v[5]);
+ u[5] = _mm256_sub_epi32(v[4], v[5]);
+ u[6] = _mm256_sub_epi32(v[7], v[6]);
+ u[7] = _mm256_add_epi32(v[7], v[6]);
+
+    // stages 4 and 5
+ v[0] = _mm256_mullo_epi32(u[4], cospi56);
+ v[1] = _mm256_mullo_epi32(u[7], cospi8);
+ v[0] = _mm256_add_epi32(v[0], v[1]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[1 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm256_mullo_epi32(u[4], cospi8);
+ v[1] = _mm256_mullo_epi32(u[7], cospi56);
+ v[0] = _mm256_sub_epi32(v[1], v[0]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[7 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm256_mullo_epi32(u[5], cospi24);
+ v[1] = _mm256_mullo_epi32(u[6], cospi40);
+ v[0] = _mm256_add_epi32(v[0], v[1]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[5 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm256_mullo_epi32(u[5], cospi40);
+ v[1] = _mm256_mullo_epi32(u[6], cospi24);
+ v[0] = _mm256_sub_epi32(v[1], v[0]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ out[3 * outstride + col] = _mm256_srai_epi32(v[0], bit); // buf0[6]
+
+ out[0 * outstride + col] = u[0]; // buf0[0]
+ out[4 * outstride + col] = u[1]; // buf0[1]
+ out[2 * outstride + col] = u[2]; // buf0[2]
+ out[6 * outstride + col] = u[3]; // buf0[3]
+ }
+}
+static void av1_fadst8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+                            const int col_num, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m256i x, y;
+ for (int col = 0; col < col_num; ++col) {
+ u0 = in[0 * col_num + col];
+ u1 = _mm256_sub_epi32(zero, in[7 * col_num + col]);
+ u2 = _mm256_sub_epi32(zero, in[3 * col_num + col]);
+ u3 = in[4 * col_num + col];
+ u4 = _mm256_sub_epi32(zero, in[1 * col_num + col]);
+ u5 = in[6 * col_num + col];
+ u6 = in[2 * col_num + col];
+ u7 = _mm256_sub_epi32(zero, in[5 * col_num + col]);
+
+ // stage 2
+ v0 = u0;
+ v1 = u1;
+
+ x = _mm256_mullo_epi32(u2, cospi32);
+ y = _mm256_mullo_epi32(u3, cospi32);
+ v2 = _mm256_add_epi32(x, y);
+ v2 = _mm256_add_epi32(v2, rnding);
+ v2 = _mm256_srai_epi32(v2, bit);
+
+ v3 = _mm256_sub_epi32(x, y);
+ v3 = _mm256_add_epi32(v3, rnding);
+ v3 = _mm256_srai_epi32(v3, bit);
+
+ v4 = u4;
+ v5 = u5;
+
+ x = _mm256_mullo_epi32(u6, cospi32);
+ y = _mm256_mullo_epi32(u7, cospi32);
+ v6 = _mm256_add_epi32(x, y);
+ v6 = _mm256_add_epi32(v6, rnding);
+ v6 = _mm256_srai_epi32(v6, bit);
+
+ v7 = _mm256_sub_epi32(x, y);
+ v7 = _mm256_add_epi32(v7, rnding);
+ v7 = _mm256_srai_epi32(v7, bit);
+
+ // stage 3
+ u0 = _mm256_add_epi32(v0, v2);
+ u1 = _mm256_add_epi32(v1, v3);
+ u2 = _mm256_sub_epi32(v0, v2);
+ u3 = _mm256_sub_epi32(v1, v3);
+ u4 = _mm256_add_epi32(v4, v6);
+ u5 = _mm256_add_epi32(v5, v7);
+ u6 = _mm256_sub_epi32(v4, v6);
+ u7 = _mm256_sub_epi32(v5, v7);
+
+ // stage 4
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm256_mullo_epi32(u4, cospi16);
+ y = _mm256_mullo_epi32(u5, cospi48);
+ v4 = _mm256_add_epi32(x, y);
+ v4 = _mm256_add_epi32(v4, rnding);
+ v4 = _mm256_srai_epi32(v4, bit);
+
+ x = _mm256_mullo_epi32(u4, cospi48);
+ y = _mm256_mullo_epi32(u5, cospim16);
+ v5 = _mm256_add_epi32(x, y);
+ v5 = _mm256_add_epi32(v5, rnding);
+ v5 = _mm256_srai_epi32(v5, bit);
+
+ x = _mm256_mullo_epi32(u6, cospim48);
+ y = _mm256_mullo_epi32(u7, cospi16);
+ v6 = _mm256_add_epi32(x, y);
+ v6 = _mm256_add_epi32(v6, rnding);
+ v6 = _mm256_srai_epi32(v6, bit);
+
+ x = _mm256_mullo_epi32(u6, cospi16);
+ y = _mm256_mullo_epi32(u7, cospi48);
+ v7 = _mm256_add_epi32(x, y);
+ v7 = _mm256_add_epi32(v7, rnding);
+ v7 = _mm256_srai_epi32(v7, bit);
+
+ // stage 5
+ u0 = _mm256_add_epi32(v0, v4);
+ u1 = _mm256_add_epi32(v1, v5);
+ u2 = _mm256_add_epi32(v2, v6);
+ u3 = _mm256_add_epi32(v3, v7);
+ u4 = _mm256_sub_epi32(v0, v4);
+ u5 = _mm256_sub_epi32(v1, v5);
+ u6 = _mm256_sub_epi32(v2, v6);
+ u7 = _mm256_sub_epi32(v3, v7);
+
+ // stage 6
+ x = _mm256_mullo_epi32(u0, cospi4);
+ y = _mm256_mullo_epi32(u1, cospi60);
+ v0 = _mm256_add_epi32(x, y);
+ v0 = _mm256_add_epi32(v0, rnding);
+ v0 = _mm256_srai_epi32(v0, bit);
+
+ x = _mm256_mullo_epi32(u0, cospi60);
+ y = _mm256_mullo_epi32(u1, cospim4);
+ v1 = _mm256_add_epi32(x, y);
+ v1 = _mm256_add_epi32(v1, rnding);
+ v1 = _mm256_srai_epi32(v1, bit);
+
+ x = _mm256_mullo_epi32(u2, cospi20);
+ y = _mm256_mullo_epi32(u3, cospi44);
+ v2 = _mm256_add_epi32(x, y);
+ v2 = _mm256_add_epi32(v2, rnding);
+ v2 = _mm256_srai_epi32(v2, bit);
+
+ x = _mm256_mullo_epi32(u2, cospi44);
+ y = _mm256_mullo_epi32(u3, cospim20);
+ v3 = _mm256_add_epi32(x, y);
+ v3 = _mm256_add_epi32(v3, rnding);
+ v3 = _mm256_srai_epi32(v3, bit);
+
+ x = _mm256_mullo_epi32(u4, cospi36);
+ y = _mm256_mullo_epi32(u5, cospi28);
+ v4 = _mm256_add_epi32(x, y);
+ v4 = _mm256_add_epi32(v4, rnding);
+ v4 = _mm256_srai_epi32(v4, bit);
+
+ x = _mm256_mullo_epi32(u4, cospi28);
+ y = _mm256_mullo_epi32(u5, cospim36);
+ v5 = _mm256_add_epi32(x, y);
+ v5 = _mm256_add_epi32(v5, rnding);
+ v5 = _mm256_srai_epi32(v5, bit);
+
+ x = _mm256_mullo_epi32(u6, cospi52);
+ y = _mm256_mullo_epi32(u7, cospi12);
+ v6 = _mm256_add_epi32(x, y);
+ v6 = _mm256_add_epi32(v6, rnding);
+ v6 = _mm256_srai_epi32(v6, bit);
+
+ x = _mm256_mullo_epi32(u6, cospi12);
+ y = _mm256_mullo_epi32(u7, cospim52);
+ v7 = _mm256_add_epi32(x, y);
+ v7 = _mm256_add_epi32(v7, rnding);
+ v7 = _mm256_srai_epi32(v7, bit);
+
+ // stage 7
+    out[0 * outstride + col] = v1;
+    out[1 * outstride + col] = v6;
+    out[2 * outstride + col] = v3;
+    out[3 * outstride + col] = v4;
+    out[4 * outstride + col] = v5;
+    out[5 * outstride + col] = v2;
+    out[6 * outstride + col] = v7;
+    out[7 * outstride + col] = v0;
+ }
+}
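+// The ADST-8 lattice negates half of its inputs in stage 1 and folds all
+// sign handling there; after the alternating rotation (stages 2/4/6) and
+// butterfly (stages 3/5) passes, stage 7 is a pure permutation, writing
+// v1, v6, v3, v4, v5, v2, v7, v0 to output rows 0..7.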
+static void av1_idtx8_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ int col_num, int outstride) {
+ (void)bit;
+ (void)outstride;
+ int num_iters = 8 * col_num;
+ for (int i = 0; i < num_iters; i += 8) {
+ out[i] = _mm256_add_epi32(in[i], in[i]);
+ out[i + 1] = _mm256_add_epi32(in[i + 1], in[i + 1]);
+ out[i + 2] = _mm256_add_epi32(in[i + 2], in[i + 2]);
+ out[i + 3] = _mm256_add_epi32(in[i + 3], in[i + 3]);
+ out[i + 4] = _mm256_add_epi32(in[i + 4], in[i + 4]);
+ out[i + 5] = _mm256_add_epi32(in[i + 5], in[i + 5]);
+ out[i + 6] = _mm256_add_epi32(in[i + 6], in[i + 6]);
+ out[i + 7] = _mm256_add_epi32(in[i + 7], in[i + 7]);
+ }
+}
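+// The 8-point identity transform only rescales: out = 2 * in (via in + in),
+// the x2 gain AV1 assigns to the 8-point identity kernel, so no rounding is
+// required and `bit` goes unused.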
+void av1_fwd_txfm2d_8x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[8], out[8];
+ const TX_SIZE tx_size = TX_8X8;
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int width = tx_size_wide[tx_size];
+ const int width_div8 = (width >> 3);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case ADST_DCT:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case DCT_ADST:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case ADST_ADST:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case FLIPADST_DCT:
+ av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fdct8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case DCT_FLIPADST:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+ av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case FLIPADST_FLIPADST:
+ av1_load_buffer_8x8_avx2(input, in, stride, 1, 1, shift[0]);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case ADST_FLIPADST:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case FLIPADST_ADST:
+ av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case IDTX:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case V_DCT:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case H_DCT:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fdct8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case V_ADST:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case H_ADST:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 0, shift[0]);
+ av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case V_FLIPADST:
+ av1_load_buffer_8x8_avx2(input, in, stride, 1, 0, shift[0]);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_idtx8_avx2(out, in, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ case H_FLIPADST:
+ av1_load_buffer_8x8_avx2(input, in, stride, 0, 1, shift[0]);
+ av1_idtx8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_fadst8_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, width_div8, width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 8);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
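+// Every case above is the same five-step 2-D pipeline with different 1-D
+// kernels: load (with up/down and left/right flips for the FLIPADST
+// variants), column transform, mid-pass rounding by shift[1], transpose,
+// row transform, and a final transpose before the store. Cases whose row
+// pass is the identity (IDTX, V_DCT, V_ADST, V_FLIPADST) skip both
+// transposes, since av1_idtx8_avx2 treats the buffer as a flat array.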
+
+static void av1_fdct16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ const int col_num, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ __m256i u[16], v[16], x;
+ int col;
+
+  // Process the columns, one 8-lane register group at a time (col_num groups)
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = _mm256_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm256_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm256_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm256_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm256_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm256_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm256_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm256_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm256_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm256_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm256_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm256_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm256_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm256_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm256_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm256_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ // stage 2
+ v[0] = _mm256_add_epi32(u[0], u[7]);
+ v[7] = _mm256_sub_epi32(u[0], u[7]);
+ v[1] = _mm256_add_epi32(u[1], u[6]);
+ v[6] = _mm256_sub_epi32(u[1], u[6]);
+ v[2] = _mm256_add_epi32(u[2], u[5]);
+ v[5] = _mm256_sub_epi32(u[2], u[5]);
+ v[3] = _mm256_add_epi32(u[3], u[4]);
+ v[4] = _mm256_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm256_mullo_epi32(u[10], cospim32);
+ x = _mm256_mullo_epi32(u[13], cospi32);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospi32);
+ x = _mm256_mullo_epi32(u[13], cospim32);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = _mm256_mullo_epi32(u[11], cospim32);
+ x = _mm256_mullo_epi32(u[12], cospi32);
+ v[11] = _mm256_add_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(u[11], cospi32);
+ x = _mm256_mullo_epi32(u[12], cospim32);
+ v[12] = _mm256_sub_epi32(v[12], x);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3
+ u[0] = _mm256_add_epi32(v[0], v[3]);
+ u[3] = _mm256_sub_epi32(v[0], v[3]);
+ u[1] = _mm256_add_epi32(v[1], v[2]);
+ u[2] = _mm256_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm256_mullo_epi32(v[5], cospim32);
+ x = _mm256_mullo_epi32(v[6], cospi32);
+ u[5] = _mm256_add_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(v[5], cospi32);
+ x = _mm256_mullo_epi32(v[6], cospim32);
+ u[6] = _mm256_sub_epi32(u[6], x);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm256_add_epi32(v[8], v[11]);
+ u[11] = _mm256_sub_epi32(v[8], v[11]);
+ u[9] = _mm256_add_epi32(v[9], v[10]);
+ u[10] = _mm256_sub_epi32(v[9], v[10]);
+ u[12] = _mm256_sub_epi32(v[15], v[12]);
+ u[15] = _mm256_add_epi32(v[15], v[12]);
+ u[13] = _mm256_sub_epi32(v[14], v[13]);
+ u[14] = _mm256_add_epi32(v[14], v[13]);
+
+ // stage 4
+ u[0] = _mm256_mullo_epi32(u[0], cospi32);
+ u[1] = _mm256_mullo_epi32(u[1], cospi32);
+ v[0] = _mm256_add_epi32(u[0], u[1]);
+ v[0] = _mm256_add_epi32(v[0], rnding);
+ v[0] = _mm256_srai_epi32(v[0], bit);
+
+ v[1] = _mm256_sub_epi32(u[0], u[1]);
+ v[1] = _mm256_add_epi32(v[1], rnding);
+ v[1] = _mm256_srai_epi32(v[1], bit);
+
+ v[2] = _mm256_mullo_epi32(u[2], cospi48);
+ x = _mm256_mullo_epi32(u[3], cospi16);
+ v[2] = _mm256_add_epi32(v[2], x);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_mullo_epi32(u[2], cospi16);
+ x = _mm256_mullo_epi32(u[3], cospi48);
+ v[3] = _mm256_sub_epi32(x, v[3]);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = _mm256_add_epi32(u[4], u[5]);
+ v[5] = _mm256_sub_epi32(u[4], u[5]);
+ v[6] = _mm256_sub_epi32(u[7], u[6]);
+ v[7] = _mm256_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm256_mullo_epi32(u[9], cospim16);
+ x = _mm256_mullo_epi32(u[14], cospi48);
+ v[9] = _mm256_add_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[14] = _mm256_mullo_epi32(u[9], cospi48);
+ x = _mm256_mullo_epi32(u[14], cospim16);
+ v[14] = _mm256_sub_epi32(v[14], x);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[10] = _mm256_mullo_epi32(u[10], cospim48);
+ x = _mm256_mullo_epi32(u[13], cospim16);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospim16);
+ x = _mm256_mullo_epi32(u[13], cospim48);
+ v[13] = _mm256_sub_epi32(v[13], x);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm256_mullo_epi32(v[4], cospi56);
+ x = _mm256_mullo_epi32(v[7], cospi8);
+ u[4] = _mm256_add_epi32(u[4], x);
+ u[4] = _mm256_add_epi32(u[4], rnding);
+ u[4] = _mm256_srai_epi32(u[4], bit);
+
+ u[7] = _mm256_mullo_epi32(v[4], cospi8);
+ x = _mm256_mullo_epi32(v[7], cospi56);
+ u[7] = _mm256_sub_epi32(x, u[7]);
+ u[7] = _mm256_add_epi32(u[7], rnding);
+ u[7] = _mm256_srai_epi32(u[7], bit);
+
+ u[5] = _mm256_mullo_epi32(v[5], cospi24);
+ x = _mm256_mullo_epi32(v[6], cospi40);
+ u[5] = _mm256_add_epi32(u[5], x);
+ u[5] = _mm256_add_epi32(u[5], rnding);
+ u[5] = _mm256_srai_epi32(u[5], bit);
+
+ u[6] = _mm256_mullo_epi32(v[5], cospi40);
+ x = _mm256_mullo_epi32(v[6], cospi24);
+ u[6] = _mm256_sub_epi32(x, u[6]);
+ u[6] = _mm256_add_epi32(u[6], rnding);
+ u[6] = _mm256_srai_epi32(u[6], bit);
+
+ u[8] = _mm256_add_epi32(v[8], v[9]);
+ u[9] = _mm256_sub_epi32(v[8], v[9]);
+ u[10] = _mm256_sub_epi32(v[11], v[10]);
+ u[11] = _mm256_add_epi32(v[11], v[10]);
+ u[12] = _mm256_add_epi32(v[12], v[13]);
+ u[13] = _mm256_sub_epi32(v[12], v[13]);
+ u[14] = _mm256_sub_epi32(v[15], v[14]);
+ u[15] = _mm256_add_epi32(v[15], v[14]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm256_mullo_epi32(u[8], cospi60);
+ x = _mm256_mullo_epi32(u[15], cospi4);
+ v[8] = _mm256_add_epi32(v[8], x);
+ v[8] = _mm256_add_epi32(v[8], rnding);
+ v[8] = _mm256_srai_epi32(v[8], bit);
+
+ v[15] = _mm256_mullo_epi32(u[8], cospi4);
+ x = _mm256_mullo_epi32(u[15], cospi60);
+ v[15] = _mm256_sub_epi32(x, v[15]);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ v[9] = _mm256_mullo_epi32(u[9], cospi28);
+ x = _mm256_mullo_epi32(u[14], cospi36);
+ v[9] = _mm256_add_epi32(v[9], x);
+ v[9] = _mm256_add_epi32(v[9], rnding);
+ v[9] = _mm256_srai_epi32(v[9], bit);
+
+ v[14] = _mm256_mullo_epi32(u[9], cospi36);
+ x = _mm256_mullo_epi32(u[14], cospi28);
+ v[14] = _mm256_sub_epi32(x, v[14]);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[10] = _mm256_mullo_epi32(u[10], cospi44);
+ x = _mm256_mullo_epi32(u[13], cospi20);
+ v[10] = _mm256_add_epi32(v[10], x);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[13] = _mm256_mullo_epi32(u[10], cospi20);
+ x = _mm256_mullo_epi32(u[13], cospi44);
+ v[13] = _mm256_sub_epi32(x, v[13]);
+ v[13] = _mm256_add_epi32(v[13], rnding);
+ v[13] = _mm256_srai_epi32(v[13], bit);
+
+ v[11] = _mm256_mullo_epi32(u[11], cospi12);
+ x = _mm256_mullo_epi32(u[12], cospi52);
+ v[11] = _mm256_add_epi32(v[11], x);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = _mm256_mullo_epi32(u[11], cospi52);
+ x = _mm256_mullo_epi32(u[12], cospi12);
+ v[12] = _mm256_sub_epi32(x, v[12]);
+ v[12] = _mm256_add_epi32(v[12], rnding);
+ v[12] = _mm256_srai_epi32(v[12], bit);
+
+ out[0 * outstride + col] = v[0];
+ out[1 * outstride + col] = v[8];
+ out[2 * outstride + col] = v[4];
+ out[3 * outstride + col] = v[12];
+ out[4 * outstride + col] = v[2];
+ out[5 * outstride + col] = v[10];
+ out[6 * outstride + col] = v[6];
+ out[7 * outstride + col] = v[14];
+ out[8 * outstride + col] = v[1];
+ out[9 * outstride + col] = v[9];
+ out[10 * outstride + col] = v[5];
+ out[11 * outstride + col] = v[13];
+ out[12 * outstride + col] = v[3];
+ out[13 * outstride + col] = v[11];
+ out[14 * outstride + col] = v[7];
+ out[15 * outstride + col] = v[15];
+ }
+}
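+// av1_fdct16_avx2 writes its outputs in bit-reversed index order
+// (0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15), as is typical
+// for a decimation-in-frequency DCT, folding the final reorder into the
+// output writes.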
+static void av1_fadst16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ const int num_cols, const int outstride) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i rnding = _mm256_set1_epi32(1 << (bit - 1));
+ const __m256i zero = _mm256_setzero_si256();
+
+ __m256i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < num_cols; ++col) {
+ // stage 0
+ // stage 1
+ u[0] = in[0 * num_cols + col];
+ u[1] = _mm256_sub_epi32(zero, in[15 * num_cols + col]);
+ u[2] = _mm256_sub_epi32(zero, in[7 * num_cols + col]);
+ u[3] = in[8 * num_cols + col];
+ u[4] = _mm256_sub_epi32(zero, in[3 * num_cols + col]);
+ u[5] = in[12 * num_cols + col];
+ u[6] = in[4 * num_cols + col];
+ u[7] = _mm256_sub_epi32(zero, in[11 * num_cols + col]);
+ u[8] = _mm256_sub_epi32(zero, in[1 * num_cols + col]);
+ u[9] = in[14 * num_cols + col];
+ u[10] = in[6 * num_cols + col];
+ u[11] = _mm256_sub_epi32(zero, in[9 * num_cols + col]);
+ u[12] = in[2 * num_cols + col];
+ u[13] = _mm256_sub_epi32(zero, in[13 * num_cols + col]);
+ u[14] = _mm256_sub_epi32(zero, in[5 * num_cols + col]);
+ u[15] = in[10 * num_cols + col];
+
+ // stage 2
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm256_mullo_epi32(u[2], cospi32);
+ y = _mm256_mullo_epi32(u[3], cospi32);
+ v[2] = _mm256_add_epi32(x, y);
+ v[2] = _mm256_add_epi32(v[2], rnding);
+ v[2] = _mm256_srai_epi32(v[2], bit);
+
+ v[3] = _mm256_sub_epi32(x, y);
+ v[3] = _mm256_add_epi32(v[3], rnding);
+ v[3] = _mm256_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm256_mullo_epi32(u[6], cospi32);
+ y = _mm256_mullo_epi32(u[7], cospi32);
+ v[6] = _mm256_add_epi32(x, y);
+ v[6] = _mm256_add_epi32(v[6], rnding);
+ v[6] = _mm256_srai_epi32(v[6], bit);
+
+ v[7] = _mm256_sub_epi32(x, y);
+ v[7] = _mm256_add_epi32(v[7], rnding);
+ v[7] = _mm256_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm256_mullo_epi32(u[10], cospi32);
+ y = _mm256_mullo_epi32(u[11], cospi32);
+ v[10] = _mm256_add_epi32(x, y);
+ v[10] = _mm256_add_epi32(v[10], rnding);
+ v[10] = _mm256_srai_epi32(v[10], bit);
+
+ v[11] = _mm256_sub_epi32(x, y);
+ v[11] = _mm256_add_epi32(v[11], rnding);
+ v[11] = _mm256_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm256_mullo_epi32(u[14], cospi32);
+ y = _mm256_mullo_epi32(u[15], cospi32);
+ v[14] = _mm256_add_epi32(x, y);
+ v[14] = _mm256_add_epi32(v[14], rnding);
+ v[14] = _mm256_srai_epi32(v[14], bit);
+
+ v[15] = _mm256_sub_epi32(x, y);
+ v[15] = _mm256_add_epi32(v[15], rnding);
+ v[15] = _mm256_srai_epi32(v[15], bit);
+
+ // stage 3
+ u[0] = _mm256_add_epi32(v[0], v[2]);
+ u[1] = _mm256_add_epi32(v[1], v[3]);
+ u[2] = _mm256_sub_epi32(v[0], v[2]);
+ u[3] = _mm256_sub_epi32(v[1], v[3]);
+ u[4] = _mm256_add_epi32(v[4], v[6]);
+ u[5] = _mm256_add_epi32(v[5], v[7]);
+ u[6] = _mm256_sub_epi32(v[4], v[6]);
+ u[7] = _mm256_sub_epi32(v[5], v[7]);
+ u[8] = _mm256_add_epi32(v[8], v[10]);
+ u[9] = _mm256_add_epi32(v[9], v[11]);
+ u[10] = _mm256_sub_epi32(v[8], v[10]);
+ u[11] = _mm256_sub_epi32(v[9], v[11]);
+ u[12] = _mm256_add_epi32(v[12], v[14]);
+ u[13] = _mm256_add_epi32(v[13], v[15]);
+ u[14] = _mm256_sub_epi32(v[12], v[14]);
+ u[15] = _mm256_sub_epi32(v[13], v[15]);
+
+ // stage 4
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = av1_half_btf_avx2(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+ v[5] = av1_half_btf_avx2(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+ v[6] = av1_half_btf_avx2(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+ v[7] = av1_half_btf_avx2(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = av1_half_btf_avx2(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+ v[13] =
+ av1_half_btf_avx2(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+ v[14] =
+ av1_half_btf_avx2(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+
+ // stage 5
+ u[0] = _mm256_add_epi32(v[0], v[4]);
+ u[1] = _mm256_add_epi32(v[1], v[5]);
+ u[2] = _mm256_add_epi32(v[2], v[6]);
+ u[3] = _mm256_add_epi32(v[3], v[7]);
+ u[4] = _mm256_sub_epi32(v[0], v[4]);
+ u[5] = _mm256_sub_epi32(v[1], v[5]);
+ u[6] = _mm256_sub_epi32(v[2], v[6]);
+ u[7] = _mm256_sub_epi32(v[3], v[7]);
+ u[8] = _mm256_add_epi32(v[8], v[12]);
+ u[9] = _mm256_add_epi32(v[9], v[13]);
+ u[10] = _mm256_add_epi32(v[10], v[14]);
+ u[11] = _mm256_add_epi32(v[11], v[15]);
+ u[12] = _mm256_sub_epi32(v[8], v[12]);
+ u[13] = _mm256_sub_epi32(v[9], v[13]);
+ u[14] = _mm256_sub_epi32(v[10], v[14]);
+ u[15] = _mm256_sub_epi32(v[11], v[15]);
+
+ // stage 6
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = av1_half_btf_avx2(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+ v[9] = av1_half_btf_avx2(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+ v[10] = av1_half_btf_avx2(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+ v[11] =
+ av1_half_btf_avx2(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+ v[12] = av1_half_btf_avx2(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+ v[13] = av1_half_btf_avx2(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+ v[14] =
+ av1_half_btf_avx2(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+ // stage 7
+ u[0] = _mm256_add_epi32(v[0], v[8]);
+ u[1] = _mm256_add_epi32(v[1], v[9]);
+ u[2] = _mm256_add_epi32(v[2], v[10]);
+ u[3] = _mm256_add_epi32(v[3], v[11]);
+ u[4] = _mm256_add_epi32(v[4], v[12]);
+ u[5] = _mm256_add_epi32(v[5], v[13]);
+ u[6] = _mm256_add_epi32(v[6], v[14]);
+ u[7] = _mm256_add_epi32(v[7], v[15]);
+ u[8] = _mm256_sub_epi32(v[0], v[8]);
+ u[9] = _mm256_sub_epi32(v[1], v[9]);
+ u[10] = _mm256_sub_epi32(v[2], v[10]);
+ u[11] = _mm256_sub_epi32(v[3], v[11]);
+ u[12] = _mm256_sub_epi32(v[4], v[12]);
+ u[13] = _mm256_sub_epi32(v[5], v[13]);
+ u[14] = _mm256_sub_epi32(v[6], v[14]);
+ u[15] = _mm256_sub_epi32(v[7], v[15]);
+
+ // stage 8
+ v[0] = av1_half_btf_avx2(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+ v[1] = av1_half_btf_avx2(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+ v[2] = av1_half_btf_avx2(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+ v[3] = av1_half_btf_avx2(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+ v[4] = av1_half_btf_avx2(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+ v[5] = av1_half_btf_avx2(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+ v[6] = av1_half_btf_avx2(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+ v[7] = av1_half_btf_avx2(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+ v[8] = av1_half_btf_avx2(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+ v[9] = av1_half_btf_avx2(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+ v[10] = av1_half_btf_avx2(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+ v[11] =
+ av1_half_btf_avx2(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+ v[12] = av1_half_btf_avx2(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+ v[13] =
+ av1_half_btf_avx2(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+ v[14] = av1_half_btf_avx2(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+ v[15] = av1_half_btf_avx2(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+
+ // stage 9
+ out[0 * outstride + col] = v[1];
+ out[1 * outstride + col] = v[14];
+ out[2 * outstride + col] = v[3];
+ out[3 * outstride + col] = v[12];
+ out[4 * outstride + col] = v[5];
+ out[5 * outstride + col] = v[10];
+ out[6 * outstride + col] = v[7];
+ out[7 * outstride + col] = v[8];
+ out[8 * outstride + col] = v[9];
+ out[9 * outstride + col] = v[6];
+ out[10 * outstride + col] = v[11];
+ out[11 * outstride + col] = v[4];
+ out[12 * outstride + col] = v[13];
+ out[13 * outstride + col] = v[2];
+ out[14 * outstride + col] = v[15];
+ out[15 * outstride + col] = v[0];
+ }
+}
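+// The 16-point ADST follows the same plan as av1_fadst8_avx2, doubled:
+// stage-1 negations absorb the sign pattern, av1_half_btf_avx2 performs the
+// rounded rotations of stages 4, 6 and 8, and stage 9 writes the
+// interleaved order v1, v14, v3, v12, ..., v15, v0 without arithmetic.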
+static void av1_idtx16_avx2(__m256i *in, __m256i *out, const int8_t bit,
+ int col_num, const int outstride) {
+ (void)bit;
+ (void)outstride;
+ __m256i fact = _mm256_set1_epi32(2 * NewSqrt2);
+ __m256i offset = _mm256_set1_epi32(1 << (NewSqrt2Bits - 1));
+ __m256i a_low;
+
+ int num_iters = 16 * col_num;
+ for (int i = 0; i < num_iters; i++) {
+ a_low = _mm256_mullo_epi32(in[i], fact);
+ a_low = _mm256_add_epi32(a_low, offset);
+ out[i] = _mm256_srai_epi32(a_low, NewSqrt2Bits);
+ }
+}
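+// The 16-point identity kernel scales by 2 * NewSqrt2 / 2^NewSqrt2Bits.
+// Assuming the usual libaom constants (NewSqrt2 = 5793, NewSqrt2Bits = 12,
+// so NewSqrt2 / 2^12 is roughly sqrt(2)), this is the 2 * sqrt(2) gain AV1
+// assigns to the 16-point identity transform, applied round-to-nearest via
+// the (1 << (NewSqrt2Bits - 1)) offset.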
+static const transform_1d_avx2 col_highbd_txfm8x16_arr[TX_TYPES] = {
+ av1_fdct16_avx2, // DCT_DCT
+ av1_fadst16_avx2, // ADST_DCT
+ av1_fdct16_avx2, // DCT_ADST
+ av1_fadst16_avx2, // ADST_ADST
+ av1_fadst16_avx2, // FLIPADST_DCT
+ av1_fdct16_avx2, // DCT_FLIPADST
+ av1_fadst16_avx2, // FLIPADST_FLIPADST
+ av1_fadst16_avx2, // ADST_FLIPADST
+ av1_fadst16_avx2, // FLIPADST_ADST
+ av1_idtx16_avx2, // IDTX
+ av1_fdct16_avx2, // V_DCT
+ av1_idtx16_avx2, // H_DCT
+ av1_fadst16_avx2, // V_ADST
+ av1_idtx16_avx2, // H_ADST
+ av1_fadst16_avx2, // V_FLIPADST
+ av1_idtx16_avx2 // H_FLIPADST
+};
+static const transform_1d_avx2 row_highbd_txfm8x8_arr[TX_TYPES] = {
+ av1_fdct8_avx2, // DCT_DCT
+ av1_fdct8_avx2, // ADST_DCT
+ av1_fadst8_avx2, // DCT_ADST
+ av1_fadst8_avx2, // ADST_ADST
+ av1_fdct8_avx2, // FLIPADST_DCT
+ av1_fadst8_avx2, // DCT_FLIPADST
+ av1_fadst8_avx2, // FLIPADST_FLIPADST
+ av1_fadst8_avx2, // ADST_FLIPADST
+ av1_fadst8_avx2, // FLIPADST_ADST
+ av1_idtx8_avx2, // IDTX
+ av1_idtx8_avx2, // V_DCT
+ av1_fdct8_avx2, // H_DCT
+ av1_idtx8_avx2, // V_ADST
+ av1_fadst8_avx2, // H_ADST
+ av1_idtx8_avx2, // V_FLIPADST
+ av1_fadst8_avx2 // H_FLIPADST
+};
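+// In these dispatch tables the TX_TYPE name lists the vertical (column)
+// kernel first and the horizontal (row) kernel second: e.g. ADST_DCT maps
+// to av1_fadst16_avx2 in the column table but av1_fdct8_avx2 in the row
+// table, and the H_* / V_* hybrids pair a real kernel on one axis with the
+// identity on the other.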
+void av1_fwd_txfm2d_8x16_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[16], out[16];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+ const int txw_idx = get_txw_idx(TX_8X16);
+ const int txh_idx = get_txh_idx(TX_8X16);
+ const transform_1d_avx2 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_highbd_txfm8x8_arr[tx_type];
+ const int8_t bit = fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ av1_load_buffer_8x16_avx2(input, in, stride, ud_flip, lr_flip, shift[0]);
+ col_txfm(in, out, bit, 1, 1);
+ col_txfm_8x8_rounding(out, -shift[1]);
+ col_txfm_8x8_rounding(&out[8], -shift[1]);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
+ av1_fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
+ row_txfm(in, out, bit, 2, 2);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
+ av1_fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
+ av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
+ av1_store_buffer_avx2(in, coeff, 8, 16);
+ (void)bd;
+}
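+// Because 8x16 is a 2:1 rectangle, the result needs an extra 1/sqrt(2)
+// normalization on top of the shift schedule;
+// av1_round_shift_rect_array_32_avx2 folds that in by multiplying by
+// NewSqrt2 while rounding by -shift[2] (assuming the
+// NewSqrt2 ~ sqrt(2) * 2^NewSqrt2Bits convention noted above).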
+static const transform_1d_avx2 col_highbd_txfm8x8_arr[TX_TYPES] = {
+ av1_fdct8_avx2, // DCT_DCT
+ av1_fadst8_avx2, // ADST_DCT
+ av1_fdct8_avx2, // DCT_ADST
+ av1_fadst8_avx2, // ADST_ADST
+ av1_fadst8_avx2, // FLIPADST_DCT
+ av1_fdct8_avx2, // DCT_FLIPADST
+ av1_fadst8_avx2, // FLIPADST_FLIPADST
+ av1_fadst8_avx2, // ADST_FLIPADST
+ av1_fadst8_avx2, // FLIPADST_ADST
+ av1_idtx8_avx2, // IDTX
+ av1_fdct8_avx2, // V_DCT
+ av1_idtx8_avx2, // H_DCT
+ av1_fadst8_avx2, // V_ADST
+ av1_idtx8_avx2, // H_ADST
+ av1_fadst8_avx2, // V_FLIPADST
+ av1_idtx8_avx2 // H_FLIPADST
+};
+static const transform_1d_avx2 row_highbd_txfm8x16_arr[TX_TYPES] = {
+ av1_fdct16_avx2, // DCT_DCT
+ av1_fdct16_avx2, // ADST_DCT
+ av1_fadst16_avx2, // DCT_ADST
+ av1_fadst16_avx2, // ADST_ADST
+ av1_fdct16_avx2, // FLIPADST_DCT
+ av1_fadst16_avx2, // DCT_FLIPADST
+ av1_fadst16_avx2, // FLIPADST_FLIPADST
+ av1_fadst16_avx2, // ADST_FLIPADST
+ av1_fadst16_avx2, // FLIPADST_ADST
+ av1_idtx16_avx2, // IDTX
+ av1_idtx16_avx2, // V_DCT
+ av1_fdct16_avx2, // H_DCT
+ av1_idtx16_avx2, // V_ADST
+ av1_fadst16_avx2, // H_ADST
+ av1_idtx16_avx2, // V_FLIPADST
+ av1_fadst16_avx2 // H_FLIPADST
+};
+void av1_fwd_txfm2d_16x8_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[16], out[16];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+ const int txw_idx = get_txw_idx(TX_16X8);
+ const int txh_idx = get_txh_idx(TX_16X8);
+ const transform_1d_avx2 col_txfm = col_highbd_txfm8x8_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+ const int8_t bit = fwd_cos_bit_col[txw_idx][txh_idx];
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ av1_load_buffer_16xn_avx2(input, in, stride, 8, 2, ud_flip, lr_flip);
+ av1_round_shift_32_8xn_avx2(in, 16, shift[0], 1);
+ col_txfm(in, out, bit, 2, 2);
+ av1_round_shift_32_8xn_avx2(out, 16, shift[1], 1);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, 2, 1);
+ av1_fwd_txfm_transpose_8x8_avx2(&out[1], &in[8], 2, 1);
+ row_txfm(in, out, bit, 1, 1);
+ av1_fwd_txfm_transpose_8x8_avx2(out, in, 1, 2);
+ av1_fwd_txfm_transpose_8x8_avx2(&out[8], &in[1], 1, 2);
+ av1_round_shift_rect_array_32_avx2(in, in, 16, -shift[2], NewSqrt2);
+ av1_store_buffer_avx2(in, coeff, 8, 16);
+ (void)bd;
+}
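+// 16x8 mirrors 8x16 with the passes swapped: the 16-wide data is loaded
+// with av1_load_buffer_16xn_avx2 and rounded in place with
+// av1_round_shift_32_8xn_avx2, rather than the fused load/shift and
+// col_txfm_8x8_rounding used in the 8x16 path, but it ends with the same
+// NewSqrt2 rectangular rounding.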
+void av1_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *coeff, int stride,
+ TX_TYPE tx_type, int bd) {
+ __m256i in[32], out[32];
+ const TX_SIZE tx_size = TX_16X16;
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const int width_div8 = (width >> 3);
+ const int width_div16 = (width >> 4);
+ const int size = (height << 1);
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case ADST_DCT:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case DCT_ADST:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case ADST_ADST:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case FLIPADST_DCT:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case DCT_FLIPADST:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case FLIPADST_FLIPADST:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 1);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case ADST_FLIPADST:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case FLIPADST_ADST:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case IDTX:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case V_DCT:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fdct16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case H_DCT:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fdct16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case V_ADST:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case H_ADST:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case V_FLIPADST:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 1, 0);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_idtx16_avx2(out, in, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ case H_FLIPADST:
+ av1_load_buffer_16xn_avx2(input, in, stride, height, width_div8, 0, 1);
+ av1_round_shift_32_8xn_avx2(in, size, shift[0], width_div16);
+ av1_idtx16_avx2(in, out, fwd_cos_bit_col[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(out, size, shift[1], width_div16);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_fadst16_avx2(in, out, fwd_cos_bit_row[txw_idx][txh_idx], width_div8,
+ width_div8);
+ av1_fwd_txfm_transpose_16x16_avx2(out, in);
+ av1_store_buffer_avx2(in, coeff, 8, 32);
+ break;
+ default: assert(0);
+ }
+ (void)bd;
+}
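+// As in the 8x8 version, each case is load -> shift[0] rounding -> column
+// kernel -> shift[1] rounding -> transpose -> row kernel -> transpose ->
+// store, with av1_fwd_txfm_transpose_16x16_avx2 doing the full 16x16
+// transpose and the identity-row cases again skipping both transposes.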
+static INLINE void av1_fdct32_avx2(__m256i *input, __m256i *output,
+ const int8_t cos_bit, const int instride,
+ const int outstride) {
+ __m256i buf0[32];
+ __m256i buf1[32];
+  const int32_t *cospi = cospi_arr(cos_bit);
+ int startidx = 0 * instride;
+ int endidx = 31 * instride;
+ // stage 0
+ // stage 1
+ buf1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[31] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[30] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[29] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[28] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[27] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[26] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[25] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[24] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[23] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[22] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[21] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[20] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[19] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[18] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[17] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ buf1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
+ buf1[16] = _mm256_sub_epi32(input[startidx], input[endidx]);
+
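+  // Stage 1 walked the input from both ends (startidx up, endidx down) to
+  // form the 16 sum/difference pairs without hard-coding the stride. The
+  // remaining stages use the btf_32_avx2_type0 butterfly helper, which
+  // packs the same multiply/round/shift rotation pair used in the 8- and
+  // 16-point kernels.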
+ // stage 2
+ buf0[0] = _mm256_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm256_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm256_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm256_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm256_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm256_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm256_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm256_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm256_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm256_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm256_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm256_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm256_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm256_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm256_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm256_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+ buf0[28] = buf1[28];
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3
+ buf1[0] = _mm256_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm256_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm256_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm256_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm256_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm256_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm256_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm256_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];
+ buf1[9] = buf0[9];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[14] = buf0[14];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm256_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm256_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm256_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm256_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm256_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm256_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm256_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm256_sub_epi32(buf0[31], buf0[24]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm256_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm256_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm256_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm256_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm256_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm256_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4
+ buf0[0] = _mm256_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm256_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm256_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm256_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];
+ btf_32_avx2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm256_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm256_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm256_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm256_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm256_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm256_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm256_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm256_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];
+ buf0[17] = buf1[17];
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5
+ btf_32_avx2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+ cos_bit);
+ btf_32_avx2_type0(cospi[16], cospi[48], buf0[3], buf0[2], buf1[2], buf1[3],
+ cos_bit);
+ buf1[4] = _mm256_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm256_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm256_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm256_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];
+ btf_32_avx2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], buf1[14],
+ cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm256_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm256_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm256_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm256_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm256_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm256_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm256_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm256_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm256_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm256_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm256_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm256_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm256_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm256_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_avx2_type0(cospi[8], cospi[56], buf1[7], buf1[4], buf0[4], buf0[7],
+ cos_bit);
+ btf_32_avx2_type0(cospi[40], cospi[24], buf1[6], buf1[5], buf0[5], buf0[6],
+ cos_bit);
+ buf0[8] = _mm256_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm256_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm256_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm256_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm256_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm256_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm256_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm256_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];
+ btf_32_avx2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_avx2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_avx2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7
+ buf1[0] = buf0[0];
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_avx2_type0(cospi[4], cospi[60], buf0[15], buf0[8], buf1[8], buf1[15],
+ cos_bit);
+ btf_32_avx2_type0(cospi[36], cospi[28], buf0[14], buf0[9], buf1[9], buf1[14],
+ cos_bit);
+ btf_32_avx2_type0(cospi[20], cospi[44], buf0[13], buf0[10], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_avx2_type0(cospi[52], cospi[12], buf0[12], buf0[11], buf1[11],
+ buf1[12], cos_bit);
+ buf1[16] = _mm256_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm256_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm256_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm256_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm256_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm256_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm256_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm256_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm256_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm256_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm256_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm256_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm256_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm256_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm256_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm256_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_avx2_type0(cospi[2], cospi[62], buf1[31], buf1[16], buf0[16], buf0[31],
+ cos_bit);
+ btf_32_avx2_type0(cospi[34], cospi[30], buf1[30], buf1[17], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_avx2_type0(cospi[18], cospi[46], buf1[29], buf1[18], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_avx2_type0(cospi[50], cospi[14], buf1[28], buf1[19], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_avx2_type0(cospi[10], cospi[54], buf1[27], buf1[20], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_avx2_type0(cospi[42], cospi[22], buf1[26], buf1[21], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_avx2_type0(cospi[26], cospi[38], buf1[25], buf1[22], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_avx2_type0(cospi[58], cospi[6], buf1[24], buf1[23], buf0[23], buf0[24],
+ cos_bit);
+
+ startidx = 0 * outstride;
+ endidx = 31 * outstride;
+ // stage 9
+ output[startidx] = buf0[0];
+ output[endidx] = buf0[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[16];
+ output[endidx] = buf0[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[8];
+ output[endidx] = buf0[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[24];
+ output[endidx] = buf0[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[4];
+ output[endidx] = buf0[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[20];
+ output[endidx] = buf0[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[12];
+ output[endidx] = buf0[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[28];
+ output[endidx] = buf0[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[2];
+ output[endidx] = buf0[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[18];
+ output[endidx] = buf0[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[10];
+ output[endidx] = buf0[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[26];
+ output[endidx] = buf0[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[6];
+ output[endidx] = buf0[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[22];
+ output[endidx] = buf0[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[14];
+ output[endidx] = buf0[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = buf0[30];
+ output[endidx] = buf0[1];
+}
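+
+// The interleaved stores above write the butterfly lanes in bit-reversed
+// order: output row k receives lane bitrev5(k) (1 -> 16, 2 -> 8, 3 -> 24,
+// ...). A scalar model of the index permutation (hypothetical helper,
+// illustrative sketch only; not used by this file):
+static INLINE int fdct32_out_lane_sketch(int k) {
+  // Reverse the 5 low bits of k.
+  int r = 0;
+  for (int b = 0; b < 5; ++b) r |= ((k >> b) & 1) << (4 - b);
+  return r;
+}
+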
+static INLINE void idtx32x32_avx2(__m256i *input, __m256i *output,
+ const int8_t cos_bit, int instride,
+ int outstride) {
+ (void)cos_bit;
+ for (int i = 0; i < 32; i += 8) {
+ output[i * outstride] = _mm256_slli_epi32(input[i * instride], 2);
+ output[(i + 1) * outstride] =
+ _mm256_slli_epi32(input[(i + 1) * instride], 2);
+ output[(i + 2) * outstride] =
+ _mm256_slli_epi32(input[(i + 2) * instride], 2);
+ output[(i + 3) * outstride] =
+ _mm256_slli_epi32(input[(i + 3) * instride], 2);
+ output[(i + 4) * outstride] =
+ _mm256_slli_epi32(input[(i + 4) * instride], 2);
+ output[(i + 5) * outstride] =
+ _mm256_slli_epi32(input[(i + 5) * instride], 2);
+ output[(i + 6) * outstride] =
+ _mm256_slli_epi32(input[(i + 6) * instride], 2);
+ output[(i + 7) * outstride] =
+ _mm256_slli_epi32(input[(i + 7) * instride], 2);
+ }
+}
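+
+// For reference, the 32-point identity transform only scales each value by 4
+// (the << 2 above). A scalar equivalent of the unrolled loop (hypothetical
+// helper, illustrative sketch only):
+static INLINE void idtx32_sketch(const int32_t *in, int32_t *out, int n) {
+  for (int i = 0; i < n; ++i) out[i] = in[i] * 4;
+}
+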
+static const transform_1d_avx2 col_txfm8x32_arr[TX_TYPES] = {
+ av1_fdct32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x32_avx2, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+static const transform_1d_avx2 row_txfm8x32_arr[TX_TYPES] = {
+ av1_fdct32_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ idtx32x32_avx2, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
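+
+// For 32x32 blocks AV1 permits only DCT_DCT and IDTX, so the NULL entries in
+// the two tables above are never dereferenced for this transform size.
+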
+void av1_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m256i buf0[128], buf1[128];
+  const TX_SIZE tx_size = TX_32X32;
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm8x32_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm8x32_arr[tx_type];
+ int r, c;
+ const int width_div16 = (width >> 4);
+ const int width_div8 = (width >> 3);
+
+ for (int i = 0; i < width_div16; i++) {
+ av1_load_buffer_16xn_avx2(input + (i << 4), &buf0[(i << 1)], stride, height,
+ width_div8, 0, 0);
+ av1_round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[0], width_div8);
+ av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0],
+ width_div8);
+ col_txfm(&buf0[(i << 1)], &buf0[(i << 1)], cos_bit_col, width_div8,
+ width_div8);
+ col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(&buf0[(i << 1)], height, shift[1], width_div8);
+ av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1],
+ width_div8);
+ }
+
+ for (r = 0; r < height; r += 8) {
+ for (c = 0; c < width_div8; c++) {
+ av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+ &buf1[c * 8 * width_div8 + (r >> 3)],
+ width_div8, width_div8);
+ }
+ }
+
+ for (int i = 0; i < width_div16; i++) {
+ row_txfm(&buf1[(i << 1)], &buf1[(i << 1)], cos_bit_row, width_div8,
+ width_div8);
+ row_txfm(&buf1[(i << 1) + 1], &buf1[(i << 1) + 1], cos_bit_row, width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(&buf1[(i << 1)], height, shift[2], width_div8);
+ av1_round_shift_32_8xn_avx2(&buf1[(i << 1) + 1], height, shift[2],
+ width_div8);
+ }
+
+ for (r = 0; r < height; r += 8) {
+ for (c = 0; c < width_div8; c++) {
+ av1_fwd_txfm_transpose_8x8_avx2(&buf1[r * width_div8 + c],
+ &buf0[c * 8 * width_div8 + (r >> 3)],
+ width_div8, width_div8);
+ }
+ }
+
+ av1_store_buffer_avx2(buf0, output, 8, 128);
+}
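+
+// The function above is the usual separable 2-D forward transform: for each
+// 16-column strip, apply shift[0], the 32-point column transform and
+// shift[1]; transpose 8x8 tiles into buf1; run the 32-point row transform and
+// shift[2]; transpose back and store. The (i << 1) and (i << 1) + 1 operands
+// address the two 8-lane halves of each 16-column strip.
+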
+static INLINE void av1_fdct64_stage2_avx2(__m256i *x1, __m256i *x2,
+ __m256i *cospi_m32,
+ __m256i *cospi_p32,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x2[0] = _mm256_add_epi32(x1[0], x1[31]);
+ x2[31] = _mm256_sub_epi32(x1[0], x1[31]);
+ x2[1] = _mm256_add_epi32(x1[1], x1[30]);
+ x2[30] = _mm256_sub_epi32(x1[1], x1[30]);
+ x2[2] = _mm256_add_epi32(x1[2], x1[29]);
+ x2[29] = _mm256_sub_epi32(x1[2], x1[29]);
+ x2[3] = _mm256_add_epi32(x1[3], x1[28]);
+ x2[28] = _mm256_sub_epi32(x1[3], x1[28]);
+ x2[4] = _mm256_add_epi32(x1[4], x1[27]);
+ x2[27] = _mm256_sub_epi32(x1[4], x1[27]);
+ x2[5] = _mm256_add_epi32(x1[5], x1[26]);
+ x2[26] = _mm256_sub_epi32(x1[5], x1[26]);
+ x2[6] = _mm256_add_epi32(x1[6], x1[25]);
+ x2[25] = _mm256_sub_epi32(x1[6], x1[25]);
+ x2[7] = _mm256_add_epi32(x1[7], x1[24]);
+ x2[24] = _mm256_sub_epi32(x1[7], x1[24]);
+ x2[8] = _mm256_add_epi32(x1[8], x1[23]);
+ x2[23] = _mm256_sub_epi32(x1[8], x1[23]);
+ x2[9] = _mm256_add_epi32(x1[9], x1[22]);
+ x2[22] = _mm256_sub_epi32(x1[9], x1[22]);
+ x2[10] = _mm256_add_epi32(x1[10], x1[21]);
+ x2[21] = _mm256_sub_epi32(x1[10], x1[21]);
+ x2[11] = _mm256_add_epi32(x1[11], x1[20]);
+ x2[20] = _mm256_sub_epi32(x1[11], x1[20]);
+ x2[12] = _mm256_add_epi32(x1[12], x1[19]);
+ x2[19] = _mm256_sub_epi32(x1[12], x1[19]);
+ x2[13] = _mm256_add_epi32(x1[13], x1[18]);
+ x2[18] = _mm256_sub_epi32(x1[13], x1[18]);
+ x2[14] = _mm256_add_epi32(x1[14], x1[17]);
+ x2[17] = _mm256_sub_epi32(x1[14], x1[17]);
+ x2[15] = _mm256_add_epi32(x1[15], x1[16]);
+ x2[16] = _mm256_sub_epi32(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[40], x1[55], x2[40], x2[55],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[41], x1[54], x2[41], x2[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[42], x1[53], x2[42], x2[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[43], x1[52], x2[43], x2[52],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[44], x1[51], x2[44], x2[51],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[45], x1[50], x2[45], x2[50],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[46], x1[49], x2[46], x2[49],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x1[47], x1[48], x2[47], x2[48],
+ *__rounding, cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+}
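+
+// Each btf_32_type0_avx2_new() above is a Givens rotation over a lane pair.
+// A scalar model, assuming the usual type-0 butterfly semantics (hypothetical
+// helper, illustrative sketch only; the SIMD path keeps 32-bit products,
+// which is safe for the valid cospi/input ranges):
+static INLINE void btf_type0_sketch(int32_t w0, int32_t w1, int32_t in0,
+                                    int32_t in1, int32_t *out0, int32_t *out1,
+                                    int8_t cos_bit) {
+  const int32_t r = 1 << (cos_bit - 1);  // same value as __rounding
+  *out0 = (int32_t)(((int64_t)w0 * in0 + (int64_t)w1 * in1 + r) >> cos_bit);
+  *out1 = (int32_t)(((int64_t)w0 * in1 - (int64_t)w1 * in0 + r) >> cos_bit);
+}
+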
+static INLINE void av1_fdct64_stage3_avx2(__m256i *x2, __m256i *x3,
+ __m256i *cospi_m32,
+ __m256i *cospi_p32,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ x3[0] = _mm256_add_epi32(x2[0], x2[15]);
+ x3[15] = _mm256_sub_epi32(x2[0], x2[15]);
+ x3[1] = _mm256_add_epi32(x2[1], x2[14]);
+ x3[14] = _mm256_sub_epi32(x2[1], x2[14]);
+ x3[2] = _mm256_add_epi32(x2[2], x2[13]);
+ x3[13] = _mm256_sub_epi32(x2[2], x2[13]);
+ x3[3] = _mm256_add_epi32(x2[3], x2[12]);
+ x3[12] = _mm256_sub_epi32(x2[3], x2[12]);
+ x3[4] = _mm256_add_epi32(x2[4], x2[11]);
+ x3[11] = _mm256_sub_epi32(x2[4], x2[11]);
+ x3[5] = _mm256_add_epi32(x2[5], x2[10]);
+ x3[10] = _mm256_sub_epi32(x2[5], x2[10]);
+ x3[6] = _mm256_add_epi32(x2[6], x2[9]);
+ x3[9] = _mm256_sub_epi32(x2[6], x2[9]);
+ x3[7] = _mm256_add_epi32(x2[7], x2[8]);
+ x3[8] = _mm256_sub_epi32(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[20], x2[27], x3[20], x3[27],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[21], x2[26], x3[21], x3[26],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[22], x2[25], x3[22], x3[25],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x2[23], x2[24], x3[23], x3[24],
+ *__rounding, cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm256_add_epi32(x2[32], x2[47]);
+ x3[47] = _mm256_sub_epi32(x2[32], x2[47]);
+ x3[33] = _mm256_add_epi32(x2[33], x2[46]);
+ x3[46] = _mm256_sub_epi32(x2[33], x2[46]);
+ x3[34] = _mm256_add_epi32(x2[34], x2[45]);
+ x3[45] = _mm256_sub_epi32(x2[34], x2[45]);
+ x3[35] = _mm256_add_epi32(x2[35], x2[44]);
+ x3[44] = _mm256_sub_epi32(x2[35], x2[44]);
+ x3[36] = _mm256_add_epi32(x2[36], x2[43]);
+ x3[43] = _mm256_sub_epi32(x2[36], x2[43]);
+ x3[37] = _mm256_add_epi32(x2[37], x2[42]);
+ x3[42] = _mm256_sub_epi32(x2[37], x2[42]);
+ x3[38] = _mm256_add_epi32(x2[38], x2[41]);
+ x3[41] = _mm256_sub_epi32(x2[38], x2[41]);
+ x3[39] = _mm256_add_epi32(x2[39], x2[40]);
+ x3[40] = _mm256_sub_epi32(x2[39], x2[40]);
+ x3[48] = _mm256_sub_epi32(x2[63], x2[48]);
+ x3[63] = _mm256_add_epi32(x2[63], x2[48]);
+ x3[49] = _mm256_sub_epi32(x2[62], x2[49]);
+ x3[62] = _mm256_add_epi32(x2[62], x2[49]);
+ x3[50] = _mm256_sub_epi32(x2[61], x2[50]);
+ x3[61] = _mm256_add_epi32(x2[61], x2[50]);
+ x3[51] = _mm256_sub_epi32(x2[60], x2[51]);
+ x3[60] = _mm256_add_epi32(x2[60], x2[51]);
+ x3[52] = _mm256_sub_epi32(x2[59], x2[52]);
+ x3[59] = _mm256_add_epi32(x2[59], x2[52]);
+ x3[53] = _mm256_sub_epi32(x2[58], x2[53]);
+ x3[58] = _mm256_add_epi32(x2[58], x2[53]);
+ x3[54] = _mm256_sub_epi32(x2[57], x2[54]);
+ x3[57] = _mm256_add_epi32(x2[57], x2[54]);
+ x3[55] = _mm256_sub_epi32(x2[56], x2[55]);
+ x3[56] = _mm256_add_epi32(x2[56], x2[55]);
+}
+static INLINE void av1_fdct64_stage4_avx2(
+ __m256i *x3, __m256i *x4, __m256i *cospi_m32, __m256i *cospi_p32,
+ __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
+ const __m256i *__rounding, int8_t cos_bit) {
+ x4[0] = _mm256_add_epi32(x3[0], x3[7]);
+ x4[7] = _mm256_sub_epi32(x3[0], x3[7]);
+ x4[1] = _mm256_add_epi32(x3[1], x3[6]);
+ x4[6] = _mm256_sub_epi32(x3[1], x3[6]);
+ x4[2] = _mm256_add_epi32(x3[2], x3[5]);
+ x4[5] = _mm256_sub_epi32(x3[2], x3[5]);
+ x4[3] = _mm256_add_epi32(x3[3], x3[4]);
+ x4[4] = _mm256_sub_epi32(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[10], x3[13], x4[10], x4[13],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x3[11], x3[12], x4[11], x4[12],
+ *__rounding, cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm256_add_epi32(x3[16], x3[23]);
+ x4[23] = _mm256_sub_epi32(x3[16], x3[23]);
+ x4[17] = _mm256_add_epi32(x3[17], x3[22]);
+ x4[22] = _mm256_sub_epi32(x3[17], x3[22]);
+ x4[18] = _mm256_add_epi32(x3[18], x3[21]);
+ x4[21] = _mm256_sub_epi32(x3[18], x3[21]);
+ x4[19] = _mm256_add_epi32(x3[19], x3[20]);
+ x4[20] = _mm256_sub_epi32(x3[19], x3[20]);
+ x4[24] = _mm256_sub_epi32(x3[31], x3[24]);
+ x4[31] = _mm256_add_epi32(x3[31], x3[24]);
+ x4[25] = _mm256_sub_epi32(x3[30], x3[25]);
+ x4[30] = _mm256_add_epi32(x3[30], x3[25]);
+ x4[26] = _mm256_sub_epi32(x3[29], x3[26]);
+ x4[29] = _mm256_add_epi32(x3[29], x3[26]);
+ x4[27] = _mm256_sub_epi32(x3[28], x3[27]);
+ x4[28] = _mm256_add_epi32(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[36], x3[59], x4[36], x4[59],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[37], x3[58], x4[37], x4[58],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[38], x3[57], x4[38], x4[57],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x3[39], x3[56], x4[39], x4[56],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[40], x3[55], x4[40], x4[55],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[41], x3[54], x4[41], x4[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[42], x3[53], x4[42], x4[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x3[43], x3[52], x4[43], x4[52],
+ *__rounding, cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+}
+static INLINE void av1_fdct64_stage5_avx2(
+ __m256i *x4, __m256i *x5, __m256i *cospi_m32, __m256i *cospi_p32,
+ __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
+ const __m256i *__rounding, int8_t cos_bit) {
+ x5[0] = _mm256_add_epi32(x4[0], x4[3]);
+ x5[3] = _mm256_sub_epi32(x4[0], x4[3]);
+ x5[1] = _mm256_add_epi32(x4[1], x4[2]);
+ x5[2] = _mm256_sub_epi32(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_32_type0_avx2_new(*cospi_m32, *cospi_p32, x4[5], x4[6], x5[5], x5[6],
+ *__rounding, cos_bit);
+ x5[7] = x4[7];
+ x5[8] = _mm256_add_epi32(x4[8], x4[11]);
+ x5[11] = _mm256_sub_epi32(x4[8], x4[11]);
+ x5[9] = _mm256_add_epi32(x4[9], x4[10]);
+ x5[10] = _mm256_sub_epi32(x4[9], x4[10]);
+ x5[12] = _mm256_sub_epi32(x4[15], x4[12]);
+ x5[15] = _mm256_add_epi32(x4[15], x4[12]);
+ x5[13] = _mm256_sub_epi32(x4[14], x4[13]);
+ x5[14] = _mm256_add_epi32(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[18], x4[29], x5[18], x5[29],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x4[19], x4[28], x5[19], x5[28],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[20], x4[27], x5[20], x5[27],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x4[21], x4[26], x5[21], x5[26],
+ *__rounding, cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm256_add_epi32(x4[32], x4[39]);
+ x5[39] = _mm256_sub_epi32(x4[32], x4[39]);
+ x5[33] = _mm256_add_epi32(x4[33], x4[38]);
+ x5[38] = _mm256_sub_epi32(x4[33], x4[38]);
+ x5[34] = _mm256_add_epi32(x4[34], x4[37]);
+ x5[37] = _mm256_sub_epi32(x4[34], x4[37]);
+ x5[35] = _mm256_add_epi32(x4[35], x4[36]);
+ x5[36] = _mm256_sub_epi32(x4[35], x4[36]);
+ x5[40] = _mm256_sub_epi32(x4[47], x4[40]);
+ x5[47] = _mm256_add_epi32(x4[47], x4[40]);
+ x5[41] = _mm256_sub_epi32(x4[46], x4[41]);
+ x5[46] = _mm256_add_epi32(x4[46], x4[41]);
+ x5[42] = _mm256_sub_epi32(x4[45], x4[42]);
+ x5[45] = _mm256_add_epi32(x4[45], x4[42]);
+ x5[43] = _mm256_sub_epi32(x4[44], x4[43]);
+ x5[44] = _mm256_add_epi32(x4[44], x4[43]);
+ x5[48] = _mm256_add_epi32(x4[48], x4[55]);
+ x5[55] = _mm256_sub_epi32(x4[48], x4[55]);
+ x5[49] = _mm256_add_epi32(x4[49], x4[54]);
+ x5[54] = _mm256_sub_epi32(x4[49], x4[54]);
+ x5[50] = _mm256_add_epi32(x4[50], x4[53]);
+ x5[53] = _mm256_sub_epi32(x4[50], x4[53]);
+ x5[51] = _mm256_add_epi32(x4[51], x4[52]);
+ x5[52] = _mm256_sub_epi32(x4[51], x4[52]);
+ x5[56] = _mm256_sub_epi32(x4[63], x4[56]);
+ x5[63] = _mm256_add_epi32(x4[63], x4[56]);
+ x5[57] = _mm256_sub_epi32(x4[62], x4[57]);
+ x5[62] = _mm256_add_epi32(x4[62], x4[57]);
+ x5[58] = _mm256_sub_epi32(x4[61], x4[58]);
+ x5[61] = _mm256_add_epi32(x4[61], x4[58]);
+ x5[59] = _mm256_sub_epi32(x4[60], x4[59]);
+ x5[60] = _mm256_add_epi32(x4[60], x4[59]);
+}
+static INLINE void av1_fdct64_stage6_avx2(
+ __m256i *x5, __m256i *x6, __m256i *cospi_p16, __m256i *cospi_p32,
+ __m256i *cospi_m16, __m256i *cospi_p48, __m256i *cospi_m48,
+ __m256i *cospi_m08, __m256i *cospi_p56, __m256i *cospi_m56,
+ __m256i *cospi_m40, __m256i *cospi_p24, __m256i *cospi_m24,
+ const __m256i *__rounding, int8_t cos_bit) {
+ btf_32_type0_avx2_new(*cospi_p32, *cospi_p32, x5[0], x5[1], x6[0], x6[1],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_p16, *cospi_p48, x5[3], x5[2], x6[2], x6[3],
+ *__rounding, cos_bit);
+ x6[4] = _mm256_add_epi32(x5[4], x5[5]);
+ x6[5] = _mm256_sub_epi32(x5[4], x5[5]);
+ x6[6] = _mm256_sub_epi32(x5[7], x5[6]);
+ x6[7] = _mm256_add_epi32(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_32_type0_avx2_new(*cospi_m16, *cospi_p48, x5[9], x5[14], x6[9], x6[14],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m48, *cospi_m16, x5[10], x5[13], x6[10], x6[13],
+ *__rounding, cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm256_add_epi32(x5[16], x5[19]);
+ x6[19] = _mm256_sub_epi32(x5[16], x5[19]);
+ x6[17] = _mm256_add_epi32(x5[17], x5[18]);
+ x6[18] = _mm256_sub_epi32(x5[17], x5[18]);
+ x6[20] = _mm256_sub_epi32(x5[23], x5[20]);
+ x6[23] = _mm256_add_epi32(x5[23], x5[20]);
+ x6[21] = _mm256_sub_epi32(x5[22], x5[21]);
+ x6[22] = _mm256_add_epi32(x5[22], x5[21]);
+ x6[24] = _mm256_add_epi32(x5[24], x5[27]);
+ x6[27] = _mm256_sub_epi32(x5[24], x5[27]);
+ x6[25] = _mm256_add_epi32(x5[25], x5[26]);
+ x6[26] = _mm256_sub_epi32(x5[25], x5[26]);
+ x6[28] = _mm256_sub_epi32(x5[31], x5[28]);
+ x6[31] = _mm256_add_epi32(x5[31], x5[28]);
+ x6[29] = _mm256_sub_epi32(x5[30], x5[29]);
+ x6[30] = _mm256_add_epi32(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[34], x5[61], x6[34], x6[61],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x5[35], x5[60], x6[35], x6[60],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[36], x5[59], x6[36], x6[59],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x5[37], x5[58], x6[37], x6[58],
+ *__rounding, cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[42], x5[53], x6[42], x6[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x5[43], x5[52], x6[43], x6[52],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[44], x5[51], x6[44], x6[51],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x5[45], x5[50], x6[45], x6[50],
+ *__rounding, cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+}
+static INLINE void av1_fdct64_stage7_avx2(
+ __m256i *x6, __m256i *x7, __m256i *cospi_p08, __m256i *cospi_p56,
+ __m256i *cospi_p40, __m256i *cospi_p24, __m256i *cospi_m08,
+ __m256i *cospi_m56, __m256i *cospi_m40, __m256i *cospi_m24,
+ const __m256i *__rounding, int8_t cos_bit) {
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_32_type0_avx2_new(*cospi_p08, *cospi_p56, x6[7], x6[4], x7[4], x7[7],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_p40, *cospi_p24, x6[6], x6[5], x7[5], x7[6],
+ *__rounding, cos_bit);
+ x7[8] = _mm256_add_epi32(x6[8], x6[9]);
+ x7[9] = _mm256_sub_epi32(x6[8], x6[9]);
+ x7[10] = _mm256_sub_epi32(x6[11], x6[10]);
+ x7[11] = _mm256_add_epi32(x6[11], x6[10]);
+ x7[12] = _mm256_add_epi32(x6[12], x6[13]);
+ x7[13] = _mm256_sub_epi32(x6[12], x6[13]);
+ x7[14] = _mm256_sub_epi32(x6[15], x6[14]);
+ x7[15] = _mm256_add_epi32(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_32_type0_avx2_new(*cospi_m08, *cospi_p56, x6[17], x6[30], x7[17], x7[30],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m56, *cospi_m08, x6[18], x6[29], x7[18], x7[29],
+ *__rounding, cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_32_type0_avx2_new(*cospi_m40, *cospi_p24, x6[21], x6[26], x7[21], x7[26],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(*cospi_m24, *cospi_m40, x6[22], x6[25], x7[22], x7[25],
+ *__rounding, cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm256_add_epi32(x6[32], x6[35]);
+ x7[35] = _mm256_sub_epi32(x6[32], x6[35]);
+ x7[33] = _mm256_add_epi32(x6[33], x6[34]);
+ x7[34] = _mm256_sub_epi32(x6[33], x6[34]);
+ x7[36] = _mm256_sub_epi32(x6[39], x6[36]);
+ x7[39] = _mm256_add_epi32(x6[39], x6[36]);
+ x7[37] = _mm256_sub_epi32(x6[38], x6[37]);
+ x7[38] = _mm256_add_epi32(x6[38], x6[37]);
+ x7[40] = _mm256_add_epi32(x6[40], x6[43]);
+ x7[43] = _mm256_sub_epi32(x6[40], x6[43]);
+ x7[41] = _mm256_add_epi32(x6[41], x6[42]);
+ x7[42] = _mm256_sub_epi32(x6[41], x6[42]);
+ x7[44] = _mm256_sub_epi32(x6[47], x6[44]);
+ x7[47] = _mm256_add_epi32(x6[47], x6[44]);
+ x7[45] = _mm256_sub_epi32(x6[46], x6[45]);
+ x7[46] = _mm256_add_epi32(x6[46], x6[45]);
+ x7[48] = _mm256_add_epi32(x6[48], x6[51]);
+ x7[51] = _mm256_sub_epi32(x6[48], x6[51]);
+ x7[49] = _mm256_add_epi32(x6[49], x6[50]);
+ x7[50] = _mm256_sub_epi32(x6[49], x6[50]);
+ x7[52] = _mm256_sub_epi32(x6[55], x6[52]);
+ x7[55] = _mm256_add_epi32(x6[55], x6[52]);
+ x7[53] = _mm256_sub_epi32(x6[54], x6[53]);
+ x7[54] = _mm256_add_epi32(x6[54], x6[53]);
+ x7[56] = _mm256_add_epi32(x6[56], x6[59]);
+ x7[59] = _mm256_sub_epi32(x6[56], x6[59]);
+ x7[57] = _mm256_add_epi32(x6[57], x6[58]);
+ x7[58] = _mm256_sub_epi32(x6[57], x6[58]);
+ x7[60] = _mm256_sub_epi32(x6[63], x6[60]);
+ x7[63] = _mm256_add_epi32(x6[63], x6[60]);
+ x7[61] = _mm256_sub_epi32(x6[62], x6[61]);
+ x7[62] = _mm256_add_epi32(x6[62], x6[61]);
+}
+static INLINE void av1_fdct64_stage8_avx2(__m256i *x7, __m256i *x8,
+ const int32_t *cospi,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+ __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+ __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+ __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+ __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+ __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+ __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+ __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+ __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+ __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+ __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+ __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+ __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+ __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+ __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+ __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+
+ btf_32_type0_avx2_new(cospi_p04, cospi_p60, x7[15], x7[8], x8[8], x8[15],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p36, cospi_p28, x7[14], x7[9], x8[9], x8[14],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p20, cospi_p44, x7[13], x7[10], x8[10], x8[13],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p52, cospi_p12, x7[12], x7[11], x8[11], x8[12],
+ *__rounding, cos_bit);
+ x8[16] = _mm256_add_epi32(x7[16], x7[17]);
+ x8[17] = _mm256_sub_epi32(x7[16], x7[17]);
+ x8[18] = _mm256_sub_epi32(x7[19], x7[18]);
+ x8[19] = _mm256_add_epi32(x7[19], x7[18]);
+ x8[20] = _mm256_add_epi32(x7[20], x7[21]);
+ x8[21] = _mm256_sub_epi32(x7[20], x7[21]);
+ x8[22] = _mm256_sub_epi32(x7[23], x7[22]);
+ x8[23] = _mm256_add_epi32(x7[23], x7[22]);
+ x8[24] = _mm256_add_epi32(x7[24], x7[25]);
+ x8[25] = _mm256_sub_epi32(x7[24], x7[25]);
+ x8[26] = _mm256_sub_epi32(x7[27], x7[26]);
+ x8[27] = _mm256_add_epi32(x7[27], x7[26]);
+ x8[28] = _mm256_add_epi32(x7[28], x7[29]);
+ x8[29] = _mm256_sub_epi32(x7[28], x7[29]);
+ x8[30] = _mm256_sub_epi32(x7[31], x7[30]);
+ x8[31] = _mm256_add_epi32(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_32_type0_avx2_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+ *__rounding, cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_32_type0_avx2_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+ *__rounding, cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_32_type0_avx2_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+ *__rounding, cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_32_type0_avx2_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+ *__rounding, cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+}
+static INLINE void av1_fdct64_stage9_avx2(__m256i *x8, __m256i *x9,
+ const int32_t *cospi,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+ __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+ __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+ __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+ __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+ __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+ __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+ __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+ __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+ __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+ __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+ __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+ __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+ __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+ __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+ __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_32_type0_avx2_new(cospi_p02, cospi_p62, x8[31], x8[16], x9[16], x9[31],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p34, cospi_p30, x8[30], x8[17], x9[17], x9[30],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p18, cospi_p46, x8[29], x8[18], x9[18], x9[29],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p50, cospi_p14, x8[28], x8[19], x9[19], x9[28],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p10, cospi_p54, x8[27], x8[20], x9[20], x9[27],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p42, cospi_p22, x8[26], x8[21], x9[21], x9[26],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p26, cospi_p38, x8[25], x8[22], x9[22], x9[25],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p58, cospi_p06, x8[24], x8[23], x9[23], x9[24],
+ *__rounding, cos_bit);
+ x9[32] = _mm256_add_epi32(x8[32], x8[33]);
+ x9[33] = _mm256_sub_epi32(x8[32], x8[33]);
+ x9[34] = _mm256_sub_epi32(x8[35], x8[34]);
+ x9[35] = _mm256_add_epi32(x8[35], x8[34]);
+ x9[36] = _mm256_add_epi32(x8[36], x8[37]);
+ x9[37] = _mm256_sub_epi32(x8[36], x8[37]);
+ x9[38] = _mm256_sub_epi32(x8[39], x8[38]);
+ x9[39] = _mm256_add_epi32(x8[39], x8[38]);
+ x9[40] = _mm256_add_epi32(x8[40], x8[41]);
+ x9[41] = _mm256_sub_epi32(x8[40], x8[41]);
+ x9[42] = _mm256_sub_epi32(x8[43], x8[42]);
+ x9[43] = _mm256_add_epi32(x8[43], x8[42]);
+ x9[44] = _mm256_add_epi32(x8[44], x8[45]);
+ x9[45] = _mm256_sub_epi32(x8[44], x8[45]);
+ x9[46] = _mm256_sub_epi32(x8[47], x8[46]);
+ x9[47] = _mm256_add_epi32(x8[47], x8[46]);
+ x9[48] = _mm256_add_epi32(x8[48], x8[49]);
+ x9[49] = _mm256_sub_epi32(x8[48], x8[49]);
+ x9[50] = _mm256_sub_epi32(x8[51], x8[50]);
+ x9[51] = _mm256_add_epi32(x8[51], x8[50]);
+ x9[52] = _mm256_add_epi32(x8[52], x8[53]);
+ x9[53] = _mm256_sub_epi32(x8[52], x8[53]);
+ x9[54] = _mm256_sub_epi32(x8[55], x8[54]);
+ x9[55] = _mm256_add_epi32(x8[55], x8[54]);
+ x9[56] = _mm256_add_epi32(x8[56], x8[57]);
+ x9[57] = _mm256_sub_epi32(x8[56], x8[57]);
+ x9[58] = _mm256_sub_epi32(x8[59], x8[58]);
+ x9[59] = _mm256_add_epi32(x8[59], x8[58]);
+ x9[60] = _mm256_add_epi32(x8[60], x8[61]);
+ x9[61] = _mm256_sub_epi32(x8[60], x8[61]);
+ x9[62] = _mm256_sub_epi32(x8[63], x8[62]);
+ x9[63] = _mm256_add_epi32(x8[63], x8[62]);
+}
+static INLINE void av1_fdct64_stage10_avx2(__m256i *x9, __m256i *x10,
+ const int32_t *cospi,
+ const __m256i *__rounding,
+ int8_t cos_bit) {
+ __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
+ __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+ __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+ __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+ __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+ __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+ __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+ __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+ __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+ __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+ __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+ __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+ __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+ __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+ __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+ __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+ __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+ __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+ __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+ __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+ __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+ __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+ __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+ __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+ __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+ __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+ __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+ __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+ __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+ __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+ __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+ __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_32_type0_avx2_new(cospi_p01, cospi_p63, x9[63], x9[32], x10[32], x10[63],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p33, cospi_p31, x9[62], x9[33], x10[33], x10[62],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p17, cospi_p47, x9[61], x9[34], x10[34], x10[61],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p49, cospi_p15, x9[60], x9[35], x10[35], x10[60],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p09, cospi_p55, x9[59], x9[36], x10[36], x10[59],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p41, cospi_p23, x9[58], x9[37], x10[37], x10[58],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p25, cospi_p39, x9[57], x9[38], x10[38], x10[57],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p57, cospi_p07, x9[56], x9[39], x10[39], x10[56],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p05, cospi_p59, x9[55], x9[40], x10[40], x10[55],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p37, cospi_p27, x9[54], x9[41], x10[41], x10[54],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p21, cospi_p43, x9[53], x9[42], x10[42], x10[53],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p53, cospi_p11, x9[52], x9[43], x10[43], x10[52],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p13, cospi_p51, x9[51], x9[44], x10[44], x10[51],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p45, cospi_p19, x9[50], x9[45], x10[45], x10[50],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p29, cospi_p35, x9[49], x9[46], x10[46], x10[49],
+ *__rounding, cos_bit);
+ btf_32_type0_avx2_new(cospi_p61, cospi_p03, x9[48], x9[47], x10[47], x10[48],
+ *__rounding, cos_bit);
+}
+static void av1_fdct64_avx2(__m256i *input, __m256i *output, int8_t cos_bit,
+ const int instride, const int outstride) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __rounding = _mm256_set1_epi32(1 << (cos_bit - 1));
+ __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+ __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+ __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+ __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+ __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+ __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+ __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+ __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+ __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+ __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+ __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+ __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+ __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+ __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+
+ int startidx = 0 * instride;
+ int endidx = 63 * instride;
+ // stage 1
+ __m256i x1[64];
+ x1[0] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[63] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[1] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[62] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[2] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[61] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[3] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[60] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[4] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[59] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[5] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[58] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[6] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[57] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[7] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[56] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[8] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[55] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[9] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[54] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[10] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[53] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[11] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[52] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[12] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[51] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[13] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[50] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[14] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[49] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[15] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[48] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[16] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[47] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[17] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[46] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[18] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[45] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[19] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[44] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[20] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[43] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[21] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[42] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[22] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[41] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[23] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[40] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[24] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[39] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[25] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[38] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[26] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[37] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[27] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[36] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[28] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[35] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[29] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[34] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[30] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[33] = _mm256_sub_epi32(input[startidx], input[endidx]);
+ startidx += instride;
+ endidx -= instride;
+ x1[31] = _mm256_add_epi32(input[startidx], input[endidx]);
+ x1[32] = _mm256_sub_epi32(input[startidx], input[endidx]);
+
+ // stage 2
+ __m256i x2[64];
+ av1_fdct64_stage2_avx2(x1, x2, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+ // stage 3
+ av1_fdct64_stage3_avx2(x2, x1, &cospi_m32, &cospi_p32, &__rounding, cos_bit);
+ // stage 4
+ av1_fdct64_stage4_avx2(x1, x2, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+ &cospi_m48, &__rounding, cos_bit);
+ // stage 5
+ av1_fdct64_stage5_avx2(x2, x1, &cospi_m32, &cospi_p32, &cospi_m16, &cospi_p48,
+ &cospi_m48, &__rounding, cos_bit);
+ // stage 6
+ av1_fdct64_stage6_avx2(x1, x2, &cospi_p16, &cospi_p32, &cospi_m16, &cospi_p48,
+ &cospi_m48, &cospi_m08, &cospi_p56, &cospi_m56,
+ &cospi_m40, &cospi_p24, &cospi_m24, &__rounding,
+ cos_bit);
+ // stage 7
+ av1_fdct64_stage7_avx2(x2, x1, &cospi_p08, &cospi_p56, &cospi_p40, &cospi_p24,
+ &cospi_m08, &cospi_m56, &cospi_m40, &cospi_m24,
+ &__rounding, cos_bit);
+ // stage 8
+ av1_fdct64_stage8_avx2(x1, x2, cospi, &__rounding, cos_bit);
+ // stage 9
+ av1_fdct64_stage9_avx2(x2, x1, cospi, &__rounding, cos_bit);
+ // stage 10
+ av1_fdct64_stage10_avx2(x1, x2, cospi, &__rounding, cos_bit);
+
+ startidx = 0 * outstride;
+ endidx = 63 * outstride;
+
+ // stage 11
+ output[startidx] = x2[0];
+ output[endidx] = x2[63];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[32];
+ output[endidx] = x2[31];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[16];
+ output[endidx] = x2[47];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[48];
+ output[endidx] = x2[15];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[8];
+ output[endidx] = x2[55];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[40];
+ output[endidx] = x2[23];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[24];
+ output[endidx] = x2[39];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[56];
+ output[endidx] = x2[7];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[4];
+ output[endidx] = x2[59];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[36];
+ output[endidx] = x2[27];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[20];
+ output[endidx] = x2[43];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[52];
+ output[endidx] = x2[11];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[12];
+ output[endidx] = x2[51];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[44];
+ output[endidx] = x2[19];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[28];
+ output[endidx] = x2[35];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[60];
+ output[endidx] = x2[3];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[2];
+ output[endidx] = x2[61];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[34];
+ output[endidx] = x2[29];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[18];
+ output[endidx] = x2[45];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[50];
+ output[endidx] = x2[13];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[10];
+ output[endidx] = x2[53];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[42];
+ output[endidx] = x2[21];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[26];
+ output[endidx] = x2[37];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[58];
+ output[endidx] = x2[5];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[6];
+ output[endidx] = x2[57];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[38];
+ output[endidx] = x2[25];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[22];
+ output[endidx] = x2[41];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[54];
+ output[endidx] = x2[9];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[14];
+ output[endidx] = x2[49];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[46];
+ output[endidx] = x2[17];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[30];
+ output[endidx] = x2[33];
+ startidx += outstride;
+ endidx -= outstride;
+ output[startidx] = x2[62];
+ output[endidx] = x2[1];
+}
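+
+// As in av1_fdct32_avx2() above, the stage-11 stores emit the butterfly lanes
+// in bit-reversed order, here over 6 bits (output row k gets lane bitrev6(k)).
+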
+void av1_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m256i buf0[512], buf1[512];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = av1_fdct64_avx2;
+ const transform_1d_avx2 row_txfm = av1_fdct64_avx2;
+ const int width_div16 = (width >> 4);
+ const int width_div8 = (width >> 3);
+ int r, c;
+ for (int i = 0; i < width_div16; i++) {
+ av1_load_buffer_16xn_avx2(input + (i << 4), &buf0[i << 1], stride, height,
+ width_div8, 0, 0);
+ av1_round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[0], width_div8);
+ av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[0],
+ width_div8);
+ col_txfm(&buf0[i << 1], &buf0[i << 1], cos_bit_col, width_div8, width_div8);
+ col_txfm(&buf0[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_col, width_div8,
+ width_div8);
+ av1_round_shift_32_8xn_avx2(&buf0[i << 1], height, shift[1], width_div8);
+ av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], height, shift[1],
+ width_div8);
+ }
+
+ for (r = 0; r < height; r += 8) {
+ for (c = 0; c < width_div8; c++) {
+ av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div8 + c],
+ &buf1[c * 8 * width_div8 + (r >> 3)],
+ width_div8, width_div8);
+ }
+ }
+
+ for (int i = 0; i < 2; i++) {
+ row_txfm(&buf1[i << 1], &buf0[i << 1], cos_bit_row, width_div8,
+ width_div16);
+ row_txfm(&buf1[(i << 1) + 1], &buf0[(i << 1) + 1], cos_bit_row, width_div8,
+ width_div16);
+ av1_round_shift_32_8xn_avx2(&buf0[i << 1], (height >> 1), shift[2],
+ width_div16);
+ av1_round_shift_32_8xn_avx2(&buf0[(i << 1) + 1], (height >> 1), shift[2],
+ width_div16);
+ }
+
+ for (r = 0; r < (height >> 1); r += 8) {
+ for (c = 0; c < width_div16; c++) {
+ av1_fwd_txfm_transpose_8x8_avx2(&buf0[r * width_div16 + c],
+ &buf1[c * 8 * width_div16 + (r >> 3)],
+ width_div16, width_div16);
+ }
+ }
+ av1_store_buffer_avx2(buf1, output, 8, 128);
+}
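+
+// Note the row pass only runs two strips and 128 vectors (32x32 coefficients)
+// are stored: for 64-point transforms AV1 retains just the top-left 32x32
+// block of coefficients and discards the upper half of each dimension.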
diff --git a/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c b/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c
new file mode 100644
index 0000000..f199b0f
--- /dev/null
+++ b/libaom/av1/encoder/x86/highbd_temporal_filter_sse4.c
@@ -0,0 +1,954 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/x86/temporal_filter_constants.h"
+
+// Compute (a-b)^2 for 8 pixels of 16-bit size.
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+ uint32_t *dst) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+ const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+ const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
+ const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
+ const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
+ const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
+
+ __m128i dist_first, dist_second;
+
+ dist_first = _mm_sub_epi32(a_first, b_first);
+ dist_second = _mm_sub_epi32(a_second, b_second);
+ dist_first = _mm_mullo_epi32(dist_first, dist_first);
+ dist_second = _mm_mullo_epi32(dist_second, dist_second);
+
+ _mm_storeu_si128((__m128i *)dst, dist_first);
+ _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
+}
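+
+// A scalar model of the 8-wide squared difference above (hypothetical helper,
+// illustrative sketch only):
+static INLINE void highbd_store_dist_8_sketch(const uint16_t *a,
+                                              const uint16_t *b,
+                                              uint32_t *dst) {
+  for (int i = 0; i < 8; ++i) {
+    const int32_t d = (int32_t)a[i] - (int32_t)b[i];
+    dst[i] = (uint32_t)(d * d);
+  }
+}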
+
+// Sum up the distortions of each pixel and its two horizontal neighbors.
+static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
+ __m128i dist_reg, dist_left, dist_right;
+
+ dist_reg = _mm_loadu_si128((const __m128i *)dist);
+ dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
+ dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
+
+ *sum = _mm_add_epi32(dist_reg, dist_left);
+ *sum = _mm_add_epi32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
+ __m128i *sum_second) {
+ highbd_get_sum_4(dist, sum_first);
+ highbd_get_sum_4(dist + 4, sum_second);
+}
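+
+// Scalar model of the sums above (illustrative sketch only): for each pixel,
+//   sum[i] = dist[i - 1] + dist[i] + dist[i + 1];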
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge pixels, plus
+// however many values from the other planes (y or u/v) were added in).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
+// by weight.
+static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
+ const __m128i *mul_constants,
+ const int strength, const int rounding,
+ const int weight) {
+  // _mm_srl_epi32 uses the lower 64 bits of its count operand as the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u32 = _mm_set1_epi32(rounding);
+ const __m128i weight_u32 = _mm_set1_epi32(weight);
+ const __m128i sixteen = _mm_set1_epi32(16);
+ const __m128i zero = _mm_setzero_si128();
+
+ // modifier * 3 / index;
+ const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero);
+ const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero);
+ const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero);
+ const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero);
+
+ const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo);
+ const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32);
+ const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi);
+ const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32);
+
+ // Now we have
+ // mul_lo: 00 a1 00 a0
+ // mul_hi: 00 a3 00 a2
+ // Unpack as 64 bit words to get even and odd elements
+ // unpack_lo: 00 a2 00 a0
+ // unpack_hi: 00 a3 00 a1
+ // Then we can shift and OR the results to get everything in 32-bits
+ const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div);
+ const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div);
+ const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4);
+ const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift);
+
+ // Round
+ *output = _mm_add_epi32(mul, rounding_u32);
+ *output = _mm_srl_epi32(*output, strength_u128);
+
+ // Multiply with the weight
+ *output = _mm_min_epu32(*output, sixteen);
+ *output = _mm_sub_epi32(sixteen, *output);
+ *output = _mm_mullo_epi32(*output, weight_u32);
+}
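+
+// Scalar model of the modifier computed above, where the mul_constants lane c
+// is a 32-bit fixed-point reciprocal such that (sum * c) >> 32 ~= sum * 3 /
+// index (hypothetical helper, illustrative sketch only):
+static INLINE uint32_t highbd_modifier_sketch(uint32_t sum, uint32_t c,
+                                              int strength, int rounding,
+                                              int weight) {
+  uint32_t m = (uint32_t)(((uint64_t)sum * c) >> 32);
+  m = (m + (uint32_t)rounding) >> strength;
+  if (m > 16) m = 16;  // matches the _mm_min_epu32 with 16
+  return (16 - m) * (uint32_t)weight;
+}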
+
+static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1,
+ const __m128i *sum_0_u32,
+ const __m128i *sum_1_u32,
+ const __m128i *mul_constants_0,
+ const __m128i *mul_constants_1,
+ const int strength, const int rounding,
+ const int weight) {
+ highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
+ weight);
+ highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
+ weight);
+}
+
+// Add the modifier sums to 'count'. Multiply by 'pred' and add the products
+// to 'accumulator'.
+static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
+ const __m128i sum_second_u32,
+ const uint16_t *pred,
+ uint16_t *count,
+ uint32_t *accumulator) {
+ // Cast down to 16-bit ints
+ const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred);
+ __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+
+ __m128i pred_0_u32, pred_1_u32;
+ __m128i accum_0_u32, accum_1_u32;
+
+ count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+ _mm_storeu_si128((__m128i *)count, count_u16);
+
+ pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
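+
+// Scalar model (illustrative sketch only): per pixel,
+//   count[i] += modifier[i];                  // saturating 16-bit add
+//   accumulator[i] += modifier[i] * pred[i];  // 16-bit multiply in SIMD path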
+
+static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) {
+ *dist_reg = _mm_loadu_si128((const __m128i *)dist);
+}
+
+static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first,
+ __m128i *reg_second) {
+ highbd_read_dist_4(dist, reg_first);
+ highbd_read_dist_4(dist + 4, reg_second);
+}
+
+static INLINE void highbd_read_chroma_dist_row_8(
+ int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first,
+ __m128i *u_second, __m128i *v_first, __m128i *v_second) {
+ if (!ss_x) {
+ // If there is no chroma subsampling in the horizontal direction, then we
+ // need to load 8 entries from chroma.
+ highbd_read_dist_8(u_dist, u_first, u_second);
+ highbd_read_dist_8(v_dist, v_first, v_second);
+ } else { // ss_x == 1
+    // Otherwise, load only 4 entries and duplicate each one across a lane
+    // pair to match the 2:1 horizontal subsampling.
+ __m128i u_reg, v_reg;
+
+ highbd_read_dist_4(u_dist, &u_reg);
+
+ *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
+ *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
+
+ highbd_read_dist_4(v_dist, &v_reg);
+
+ *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
+ *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
+ }
+}
+
+static void av1_highbd_apply_temporal_filter_luma_8(
+ const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+ int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+ int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
+ uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
+ const uint32_t *v_dist, const uint32_t *const *neighbors_first,
+ const uint32_t *const *neighbors_second, int top_weight,
+ int bottom_weight) {
+ const int rounding = (1 << strength) >> 1;
+ int weight = top_weight;
+
+ __m128i mul_first, mul_second;
+
+ __m128i sum_row_1_first, sum_row_1_second;
+ __m128i sum_row_2_first, sum_row_2_second;
+ __m128i sum_row_3_first, sum_row_3_second;
+
+ __m128i u_first, u_second;
+ __m128i v_first, v_second;
+
+ __m128i sum_row_first;
+ __m128i sum_row_second;
+
+ // Loop variables
+ unsigned int h;
+
+ assert(strength >= 4 && strength <= 14 &&
+ "invalid adjusted temporal filter strength");
+ assert(block_width == 8);
+
+ (void)block_width;
+
+ // First row
+ mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+ // Add luma values
+ highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
+ highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ // We don't need to saturate here because the maximum value is
+ // UINT12_MAX**2 * 9 ~= 2**24 * 9 < 2**28 < INT32_MAX
+ sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
+ sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
+
+ // Add chroma values
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+
+ // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
+ sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+
+ sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+ &sum_row_second, &mul_first, &mul_second, strength, rounding,
+ weight);
+
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_src += y_src_stride;
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+
+ u_src += uv_src_stride;
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+
+ // Then all the rows except the last one
+ mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
+ mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
+
+ for (h = 1; h < block_height - 1; ++h) {
+ // Move the weight to the bottom half
+ if (!use_whole_blk && h == block_height / 2) {
+ weight = bottom_weight;
+ }
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+ sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+ highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0 || h % 2 == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+
+ u_src += uv_src_stride;
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ }
+
+ sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+ sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+ &sum_row_second, &mul_first, &mul_second, strength,
+ rounding, weight);
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_src += y_src_stride;
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+ }
+
+ // The last row
+ mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
+ sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+ }
+
+ sum_row_first = _mm_add_epi32(sum_row_first, u_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, u_second);
+ sum_row_first = _mm_add_epi32(sum_row_first, v_first);
+ sum_row_second = _mm_add_epi32(sum_row_second, v_second);
+
+ // Get modifier and store result
+ highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
+ &sum_row_second, &mul_first, &mul_second, strength, rounding,
+ weight);
+ highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+}
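+// (Editorial note) The function above keeps a sliding three-row window of
+// distortion sums: sum_row_1/2/3 hold rows h - 1, h and h + 1, so each
+// y_dist row is loaded only once. The first and last rows reuse
+// neighbors_*[0], the edge/corner constants for a two-row window.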
+
+// Perform temporal filter for the luma component.
+static void av1_highbd_apply_temporal_filter_luma(
+ const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+ int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+ int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+ uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist,
+ const uint32_t *u_dist, const uint32_t *v_dist) {
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
+ const unsigned int mid_width = block_width >> 1,
+ last_width = block_width - blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const uint32_t *const *neighbors_first;
+ const uint32_t *const *neighbors_second;
+
+ // Left
+ neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ av1_highbd_apply_temporal_filter_luma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ for (; blk_col < mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ av1_highbd_apply_temporal_filter_luma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
+ block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+ y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+ v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+ bottom_weight);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; blk_col < last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ av1_highbd_apply_temporal_filter_luma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
+ block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+ y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+ v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+ bottom_weight);
+ }
+
+ // Right
+ neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
+ av1_highbd_apply_temporal_filter_luma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
+}
+
+// Add a row of luma distortion values to 8 chroma modifiers. If we are
+// subsampling in the x direction, then we have 16 luma values; otherwise we
+// have 8.
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
+ const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst,
+ __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) {
+ __m128i y_reg_fst, y_reg_snd;
+ if (!ss_x) {
+ highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
+ if (ss_y == 1) {
+ __m128i y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+ y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst);
+ y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd);
+ }
+ } else {
+ // Temporary
+ __m128i y_fst, y_snd;
+
+ // First 8
+ highbd_read_dist_8(y_dist, &y_fst, &y_snd);
+ if (ss_y == 1) {
+ __m128i y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+ y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+ y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+ }
+
+ y_reg_fst = _mm_hadd_epi32(y_fst, y_snd);
+
+ // Second 8
+ highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
+ if (ss_y == 1) {
+ __m128i y_tmp_fst, y_tmp_snd;
+ highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
+
+ y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
+ y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
+ }
+
+ y_reg_snd = _mm_hadd_epi32(y_fst, y_snd);
+ }
+
+ *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst);
+ *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd);
+ *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst);
+ *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd);
+}
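+// (Editorial sketch) In the ss_x branch above, _mm_hadd_epi32 pairs adjacent
+// luma columns: inputs {y0, y1, y2, y3} and {y4, y5, y6, y7} become
+// {y0 + y1, y2 + y3, y4 + y5, y6 + y7}, one summed value per chroma column.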
+
+// Apply the temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 x uv_height. If blk_fw is not NULL, use
+// blk_fw as an array of size 4 holding the weights for each of the 4
+// subblocks; otherwise use top_weight for the top half and bottom_weight for
+// the bottom half.
+static void av1_highbd_apply_temporal_filter_chroma_8(
+ const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+ int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+ int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+ int uv_pre_stride, unsigned int uv_block_width,
+ unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+ uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+ const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
+ int top_weight, int bottom_weight, const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+ int weight = top_weight;
+
+ __m128i mul_fst, mul_snd;
+
+ __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
+ __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
+ __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
+ __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
+
+ __m128i u_sum_row_fst, v_sum_row_fst;
+ __m128i u_sum_row_snd, v_sum_row_snd;
+
+ // Loop variable
+ unsigned int h;
+
+ (void)uv_block_width;
+
+ // First row
+ mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
+ mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
+
+ // Add chroma values
+ highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
+ highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+
+ u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst);
+ u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd);
+
+ highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
+ highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+
+ v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst);
+ v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+ &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+
+ u_src += uv_src_stride;
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_src += y_src_stride * (1 + ss_y);
+ y_pre += y_pre_stride * (1 + ss_y);
+ y_dist += DIST_STRIDE * (1 + ss_y);
+
+ // Then all the rows except the last one
+ mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[1]);
+ mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[1]);
+
+ for (h = 1; h < uv_block_height - 1; ++h) {
+ // Move the weight pointer to the bottom half of the block
+ if (h == uv_block_height / 2) {
+ if (blk_fw) {
+ blk_fw += 2;
+ } else {
+ weight = bottom_weight;
+ }
+ }
+
+ // Shift the rows up
+ u_sum_row_1_fst = u_sum_row_2_fst;
+ u_sum_row_2_fst = u_sum_row_3_fst;
+ u_sum_row_1_snd = u_sum_row_2_snd;
+ u_sum_row_2_snd = u_sum_row_3_snd;
+
+ v_sum_row_1_fst = v_sum_row_2_fst;
+ v_sum_row_2_fst = v_sum_row_3_fst;
+ v_sum_row_1_snd = v_sum_row_2_snd;
+ v_sum_row_2_snd = v_sum_row_3_snd;
+
+ // Add chroma values
+ u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+ u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+ highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
+ u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst);
+ u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd);
+
+ v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+ v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+ highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
+ v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst);
+ v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+ &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+
+ u_src += uv_src_stride;
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_src += y_src_stride * (1 + ss_y);
+ y_pre += y_pre_stride * (1 + ss_y);
+ y_dist += DIST_STRIDE * (1 + ss_y);
+ }
+
+ // The last row
+ mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
+ mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
+
+ // Shift the rows up
+ u_sum_row_1_fst = u_sum_row_2_fst;
+ u_sum_row_2_fst = u_sum_row_3_fst;
+ u_sum_row_1_snd = u_sum_row_2_snd;
+ u_sum_row_2_snd = u_sum_row_3_snd;
+
+ v_sum_row_1_fst = v_sum_row_2_fst;
+ v_sum_row_2_fst = v_sum_row_3_fst;
+ v_sum_row_1_snd = v_sum_row_2_snd;
+ v_sum_row_2_snd = v_sum_row_3_snd;
+
+ // Add chroma values
+ u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
+ v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
+ u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
+ v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
+
+ // Add luma values
+ highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
+ &u_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
+ rounding, blk_fw[0]);
+ highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
+ rounding, blk_fw[1]);
+
+ } else {
+ highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
+ &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
+ &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
+ weight);
+ }
+
+ highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
+ u_accum);
+ highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
+ v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void av1_highbd_apply_temporal_filter_chroma(
+ const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+ int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+ int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+ uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
+ const unsigned int uv_width = block_width >> ss_x,
+ uv_height = block_height >> ss_y;
+
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+ const unsigned int uv_mid_width = uv_width >> 1,
+ uv_last_width = uv_width - uv_blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const uint32_t *const *neighbors_fst;
+ const uint32_t *const *neighbors_snd;
+
+ if (uv_width == 8) {
+ // Special case: we are subsampling in the x direction on a 16x16 block.
+ // Since we are operating on a row of 8 chroma pixels, we can't use the
+ // usual left-middle-right pattern.
+ assert(ss_x);
+
+ if (ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ if (use_whole_blk) {
+ av1_highbd_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ } else {
+ av1_highbd_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ if (ss_x && ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+ neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ av1_highbd_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+ strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
+ top_weight, bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ if (ss_x && ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ for (; uv_blk_col < uv_mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ av1_highbd_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; uv_blk_col < uv_last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ av1_highbd_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
+ }
+
+ // Right
+ if (ss_x && ss_y) {
+ neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ av1_highbd_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+ strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
+ top_weight, bottom_weight, NULL);
+}
+
+void av1_highbd_apply_temporal_filter_sse4_1(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+ uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
+ uint32_t *v_accum, uint16_t *v_count) {
+ const unsigned int chroma_height = block_height >> ss_y,
+ chroma_width = block_width >> ss_x;
+
+ DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
+
+ uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+ *v_dist_ptr = v_dist + 1;
+ const uint16_t *y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
+ *u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
+ *v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
+ const uint16_t *y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
+ *u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
+ *v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
+
+ // Loop variables
+ unsigned int row, blk_col;
+
+ assert(block_width <= BW && "block width too large");
+ assert(block_height <= BH && "block height too large");
+ assert(block_width % 16 == 0 && "block width must be a multiple of 16");
+ assert(block_height % 2 == 0 && "block height must be even");
+ assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+ "invalid chroma subsampling");
+ assert(strength >= 4 && strength <= 14 &&
+ "invalid adjusted temporal filter strength");
+ assert(blk_fw[0] >= 0 && "filter weight must be non-negative");
+ assert(
+ (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+ "subblock filter weight must be non-negative");
+ assert(blk_fw[0] <= 2 && "filter weight must be at most 2");
+ assert(
+ (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+ "subblock filter weight must be at most 2");
+
+ // Precompute the difference squared
+ for (row = 0; row < block_height; row++) {
+ for (blk_col = 0; blk_col < block_width; blk_col += 8) {
+ highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+ y_dist_ptr + blk_col);
+ }
+ y_src_ptr += y_src_stride;
+ y_pre_ptr += y_pre_stride;
+ y_dist_ptr += DIST_STRIDE;
+ }
+
+ for (row = 0; row < chroma_height; row++) {
+ for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+ highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+ u_dist_ptr + blk_col);
+ highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+ v_dist_ptr + blk_col);
+ }
+
+ u_src_ptr += uv_src_stride;
+ u_pre_ptr += uv_pre_stride;
+ u_dist_ptr += DIST_STRIDE;
+ v_src_ptr += uv_src_stride;
+ v_pre_ptr += uv_pre_stride;
+ v_dist_ptr += DIST_STRIDE;
+ }
+
+ y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
+ u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
+ v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
+ y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
+ u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
+ v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
+
+ y_dist_ptr = y_dist + 1;
+ u_dist_ptr = u_dist + 1;
+ v_dist_ptr = v_dist + 1;
+
+ av1_highbd_apply_temporal_filter_luma(
+ y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
+ uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
+ block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, y_accum,
+ y_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
+
+ av1_highbd_apply_temporal_filter_chroma(
+ y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
+ uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
+ block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum,
+ u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/libaom/av1/encoder/x86/pickrst_avx2.c b/libaom/av1/encoder/x86/pickrst_avx2.c
index 7a63c60..d00fca0 100644
--- a/libaom/av1/encoder/x86/pickrst_avx2.c
+++ b/libaom/av1/encoder/x86/pickrst_avx2.c
@@ -536,7 +536,7 @@ int64_t av1_lowbd_pixel_proj_error_avx2(
const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -581,7 +581,7 @@ int64_t av1_lowbd_pixel_proj_error_avx2(
const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
int32_t v = xq_active * (flt[k] - u);
const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -605,7 +605,7 @@ int64_t av1_lowbd_pixel_proj_error_avx2(
}
for (k = j; k < width; ++k) {
const int32_t e = (int32_t)(dat[k]) - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -711,7 +711,7 @@ int64_t av1_highbd_pixel_proj_error_avx2(
const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -788,7 +788,7 @@ int64_t av1_highbd_pixel_proj_error_avx2(
const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
int32_t v = xq_on * (flt[k] - u);
const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -828,7 +828,7 @@ int64_t av1_highbd_pixel_proj_error_avx2(
// Process remaining pixels (modulo 16)
for (k = j; k < width; ++k) {
const int32_t e = (int32_t)(dat[k]) - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
diff --git a/libaom/av1/encoder/x86/pickrst_sse4.c b/libaom/av1/encoder/x86/pickrst_sse4.c
index 2326736..a94e169 100644
--- a/libaom/av1/encoder/x86/pickrst_sse4.c
+++ b/libaom/av1/encoder/x86/pickrst_sse4.c
@@ -539,7 +539,7 @@ int64_t av1_lowbd_pixel_proj_error_sse4_1(
const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -578,7 +578,7 @@ int64_t av1_lowbd_pixel_proj_error_sse4_1(
const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
int32_t v = xq_active * (flt[k] - u);
const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -607,7 +607,7 @@ int64_t av1_lowbd_pixel_proj_error_sse4_1(
}
for (k = j; k < width; ++k) {
const int32_t e = (int32_t)(dat[k]) - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -709,7 +709,7 @@ int64_t av1_highbd_pixel_proj_error_sse4_1(
const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -777,7 +777,7 @@ int64_t av1_highbd_pixel_proj_error_sse4_1(
const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
int32_t v = xq_on * (flt[k] - u);
const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
@@ -814,7 +814,7 @@ int64_t av1_highbd_pixel_proj_error_sse4_1(
// Process remaining pixels (modulo 8)
for (k = j; k < width; ++k) {
const int32_t e = (int32_t)(dat[k]) - src[k];
- err += e * e;
+ err += ((int64_t)e * e);
}
dat += dat_stride;
src += src_stride;
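Editorial note on the "err" hunks above (pickrst_avx2.c and pickrst_sse4.c):
err is an int64_t accumulator, but without the cast the product e * e is
evaluated in 32-bit int arithmetic and can overflow (undefined behavior)
before being widened for the addition; casting one operand forces the
multiply itself into 64 bits. A minimal sketch:

    const int32_t e = 50000;        /* a plausible large per-pixel error */
    int64_t bad = e * e;            /* 2500000000 overflows int32_t: UB  */
    int64_t good = (int64_t)e * e;  /* product computed in 64 bits       */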
diff --git a/libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm
deleted file mode 100644
index 30983d1..0000000
--- a/libaom/av1/encoder/x86/temporal_filter_apply_sse2.asm
+++ /dev/null
@@ -1,217 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-SECTION .text
-
-; void av1_temporal_filter_apply_sse2 | arg
-; (unsigned char *frame1, | 0
-; unsigned int stride, | 1
-; unsigned char *frame2, | 2
-; unsigned int block_width, | 3
-; unsigned int block_height, | 4
-; int strength, | 5
-; int filter_weight, | 6
-; unsigned int *accumulator, | 7
-; unsigned short *count) | 8
-global sym(av1_temporal_filter_apply_sse2) PRIVATE
-sym(av1_temporal_filter_apply_sse2):
-
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ALIGN_STACK 16, rax
- %define block_width 0
- %define block_height 16
- %define strength 32
- %define filter_weight 48
- %define rounding_bit 64
- %define rbp_backup 80
- %define stack_size 96
- sub rsp, stack_size
- mov [rsp + rbp_backup], rbp
- ; end prolog
-
- mov edx, arg(3)
- mov [rsp + block_width], rdx
- mov edx, arg(4)
- mov [rsp + block_height], rdx
- movd xmm6, arg(5)
- movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
-
- ; calculate the rounding bit outside the loop
- ; 0x8000 >> (16 - strength)
- mov rdx, 16
- sub rdx, arg(5) ; 16 - strength
- movq xmm4, rdx ; can't use rdx w/ shift
- movdqa xmm5, [GLOBAL(_const_top_bit)]
- psrlw xmm5, xmm4
- movdqa [rsp + rounding_bit], xmm5
-
- mov rsi, arg(0) ; src/frame1
- mov rdx, arg(2) ; predictor frame
- mov rdi, arg(7) ; accumulator
- mov rax, arg(8) ; count
-
- ; dup the filter weight and store for later
- movd xmm0, arg(6) ; filter_weight
- pshuflw xmm0, xmm0, 0
- punpcklwd xmm0, xmm0
- movdqa [rsp + filter_weight], xmm0
-
- mov rbp, arg(1) ; stride
- pxor xmm7, xmm7 ; zero for extraction
-
- mov rcx, [rsp + block_width]
- imul rcx, [rsp + block_height]
- add rcx, rdx
- cmp dword ptr [rsp + block_width], 8
- jne .temporal_filter_apply_load_16
-
-.temporal_filter_apply_load_8:
- movq xmm0, [rsi] ; first row
- lea rsi, [rsi + rbp] ; += stride
- punpcklbw xmm0, xmm7 ; src[ 0- 7]
- movq xmm1, [rsi] ; second row
- lea rsi, [rsi + rbp] ; += stride
- punpcklbw xmm1, xmm7 ; src[ 8-15]
- jmp .temporal_filter_apply_load_finished
-
-.temporal_filter_apply_load_16:
- movdqa xmm0, [rsi] ; src (frame1)
- lea rsi, [rsi + rbp] ; += stride
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7 ; src[ 0- 7]
- punpckhbw xmm1, xmm7 ; src[ 8-15]
-
-.temporal_filter_apply_load_finished:
- movdqa xmm2, [rdx] ; predictor (frame2)
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm7 ; pred[ 0- 7]
- punpckhbw xmm3, xmm7 ; pred[ 8-15]
-
- ; modifier = src_byte - pixel_value
- psubw xmm0, xmm2 ; src - pred[ 0- 7]
- psubw xmm1, xmm3 ; src - pred[ 8-15]
-
- ; modifier *= modifier
- pmullw xmm0, xmm0 ; modifer[ 0- 7]^2
- pmullw xmm1, xmm1 ; modifer[ 8-15]^2
-
- ; modifier *= 3
- pmullw xmm0, [GLOBAL(_const_3w)]
- pmullw xmm1, [GLOBAL(_const_3w)]
-
- ; modifer += 0x8000 >> (16 - strength)
- paddw xmm0, [rsp + rounding_bit]
- paddw xmm1, [rsp + rounding_bit]
-
- ; modifier >>= strength
- psrlw xmm0, [rsp + strength]
- psrlw xmm1, [rsp + strength]
-
- ; modifier = 16 - modifier
- ; saturation takes care of modifier > 16
- movdqa xmm3, [GLOBAL(_const_16w)]
- movdqa xmm2, [GLOBAL(_const_16w)]
- psubusw xmm3, xmm1
- psubusw xmm2, xmm0
-
- ; modifier *= filter_weight
- pmullw xmm2, [rsp + filter_weight]
- pmullw xmm3, [rsp + filter_weight]
-
- ; count
- movdqa xmm4, [rax]
- movdqa xmm5, [rax+16]
- ; += modifier
- paddw xmm4, xmm2
- paddw xmm5, xmm3
- ; write back
- movdqa [rax], xmm4
- movdqa [rax+16], xmm5
- lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
-
- ; load and extract the predictor up to shorts
- pxor xmm7, xmm7
- movdqa xmm0, [rdx]
- lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7 ; pred[ 0- 7]
- punpckhbw xmm1, xmm7 ; pred[ 8-15]
-
- ; modifier *= pixel_value
- pmullw xmm0, xmm2
- pmullw xmm1, xmm3
-
- ; expand to double words
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm7 ; [ 0- 3]
- punpckhwd xmm2, xmm7 ; [ 4- 7]
- movdqa xmm3, xmm1
- punpcklwd xmm1, xmm7 ; [ 8-11]
- punpckhwd xmm3, xmm7 ; [12-15]
-
- ; accumulator
- movdqa xmm4, [rdi]
- movdqa xmm5, [rdi+16]
- movdqa xmm6, [rdi+32]
- movdqa xmm7, [rdi+48]
- ; += modifier
- paddd xmm4, xmm0
- paddd xmm5, xmm2
- paddd xmm6, xmm1
- paddd xmm7, xmm3
- ; write back
- movdqa [rdi], xmm4
- movdqa [rdi+16], xmm5
- movdqa [rdi+32], xmm6
- movdqa [rdi+48], xmm7
- lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
-
- cmp rdx, rcx
- je .temporal_filter_apply_epilog
- pxor xmm7, xmm7 ; zero for extraction
- cmp dword ptr [rsp + block_width], 16
- je .temporal_filter_apply_load_16
- jmp .temporal_filter_apply_load_8
-
-.temporal_filter_apply_epilog:
- ; begin epilog
- mov rbp, [rsp + rbp_backup]
- add rsp, stack_size
- pop rsp
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-_const_3w:
- times 8 dw 3
-align 16
-_const_top_bit:
- times 8 dw 1<<15
-align 16
-_const_16w:
- times 8 dw 16
diff --git a/libaom/av1/encoder/x86/temporal_filter_constants.h b/libaom/av1/encoder/x86/temporal_filter_constants.h
new file mode 100644
index 0000000..b3a10dd
--- /dev/null
+++ b/libaom/av1/encoder/x86/temporal_filter_constants.h
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
+#define AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
+
+// Division using multiplication and shifting. The C implementation does:
+// modifier *= 3;
+// modifier /= index;
+// where 'modifier' is a set of summed values and 'index' is the number of
+// summed values.
+//
+// This equation works out to (m * 3) / i, which for i = 4, 6 and 9 reduces
+// to:
+// m * 3/4
+// m * 1/2
+// m * 1/3
+//
+// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
+// m * C / 65536
+// we can choose a constant C that replicates the division.
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32768 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152
+#define NEIGHBOR_CONSTANT_5 (int16_t)39322
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768
+#define NEIGHBOR_CONSTANT_7 (int16_t)28087
+#define NEIGHBOR_CONSTANT_8 (int16_t)24576
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846
+#define NEIGHBOR_CONSTANT_10 (int16_t)19661
+#define NEIGHBOR_CONSTANT_11 (int16_t)17874
+#define NEIGHBOR_CONSTANT_13 (int16_t)15124
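+// (Editorial sketch, not part of the filter) A scalar model of the trick,
+// using NEIGHBOR_CONSTANT_9 = 21846 from above; 'm' stands for a summed
+// modifier value:
+//   uint16_t approx = (uint16_t)(((uint32_t)m * 21846) >> 16);
+// For m = 900 this gives (900 * 21846) >> 16 = 300, matching the exact
+// (900 * 3) / 9 = 300 from the C implementation.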
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+ NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+ NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = {
+ NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
+ NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
+};
+
+static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
+ TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4
+};
+
+static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
+ TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4
+};
+
+#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U
+#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U
+#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U
+#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U
+#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U
+#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U
+#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U
+#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U
+#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U
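+// (Editorial note) These are the same ratios scaled to 2^32 instead of 2^16;
+// e.g. HIGHBD_NEIGHBOR_CONSTANT_9 = 1431655766 ~= 2^32 / 3, so a 32-bit
+// multiply-high by it divides the wider high-bitdepth sums by 3.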
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7,
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
+ HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8,
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11,
+ HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
+ HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
+ HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13,
+ HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
+ HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
+ HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const uint32_t,
+ HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = {
+ HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
+ HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
+};
+
+static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1
+};
+
+static const uint32_t *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] =
+ { HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 };
+
+static const uint32_t
+ *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
+ };
+
+static const uint32_t *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] =
+ { HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2,
+ HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 };
+
+static const uint32_t *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] =
+ { HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4 };
+
+static const uint32_t
+ *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
+ HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4
+ };
+
+static const uint32_t *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] =
+ { HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4,
+ HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4 };
+
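+// (Editorial note) The distortion buffers appear to be padded by one zeroed
+// column on each side of the BW-wide block: callers index them starting at
+// offset 1, so the 3-wide horizontal neighbor sums can read one entry past
+// either edge without bounds checks.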
+#define DIST_STRIDE ((BW) + 2)
+#endif // AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
diff --git a/libaom/av1/encoder/x86/temporal_filter_sse4.c b/libaom/av1/encoder/x86/temporal_filter_sse4.c
new file mode 100644
index 0000000..556d00c
--- /dev/null
+++ b/libaom/av1/encoder/x86/temporal_filter_sse4.c
@@ -0,0 +1,1006 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/temporal_filter.h"
+#include "av1/encoder/x86/temporal_filter_constants.h"
+
+// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
+// squared differences, and store them as unsigned 16-bit integers in dst.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+ uint16_t *dst) {
+ const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a);
+ const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b);
+
+ const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
+ const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+
+ __m128i dist_first;
+
+ dist_first = _mm_sub_epi16(a_first, b_first);
+ dist_first = _mm_mullo_epi16(dist_first, dist_first);
+
+ _mm_storeu_si128((__m128i *)dst, dist_first);
+}
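+// (Editorial sketch) Scalar equivalent of store_dist_8:
+//   for (int i = 0; i < 8; ++i) {
+//     const int16_t d = (int16_t)(a[i] - b[i]);
+//     dst[i] = (uint16_t)(d * d);  // at most 255 * 255 = 65025, fits uint16_t
+//   }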
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+ uint16_t *dst) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+ const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+ const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
+ const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero);
+ const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
+ const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero);
+
+ __m128i dist_first, dist_second;
+
+ dist_first = _mm_sub_epi16(a_first, b_first);
+ dist_second = _mm_sub_epi16(a_second, b_second);
+ dist_first = _mm_mullo_epi16(dist_first, dist_first);
+ dist_second = _mm_mullo_epi16(dist_second, dist_second);
+
+ _mm_storeu_si128((__m128i *)dst, dist_first);
+ _mm_storeu_si128((__m128i *)(dst + 8), dist_second);
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) {
+ *dist_reg = _mm_loadu_si128((const __m128i *)dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first,
+ __m128i *reg_second) {
+ read_dist_8(dist, reg_first);
+ read_dist_8(dist + 8, reg_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, subtract the result from
+// 16, then multiply by the weight.
+static __m128i average_8(__m128i sum, const __m128i *mul_constants,
+ const int strength, const int rounding,
+ const int weight) {
+ // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+ const __m128i weight_u16 = _mm_set1_epi16(weight);
+ const __m128i sixteen = _mm_set1_epi16(16);
+
+ // modifier * 3 / index;
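+  // _mm_mulhi_epu16 returns (sum * mul_constants) >> 16; with mul_constants =
+  // (3 << 16) / index this evaluates sum * 3 / index, e.g. a corner pixel
+  // (index 4) uses NEIGHBOR_CONSTANT_4 = 49152 = (3 << 16) / 4.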
+ sum = _mm_mulhi_epu16(sum, *mul_constants);
+
+ sum = _mm_adds_epu16(sum, rounding_u16);
+ sum = _mm_srl_epi16(sum, strength_u128);
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16 (i.e. NEIGHBOR_CONSTANT_4 - 1), which is 49151 / 0xbfff / -16385
+  // when read as signed, so this needs the epu16 version of min, which was
+  // not available until SSE4.
+ sum = _mm_min_epu16(sum, sixteen);
+
+ sum = _mm_sub_epi16(sixteen, sum);
+
+ return _mm_mullo_epi16(sum, weight_u16);
+}
+
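+// Same as average_8, but with two weights: weight_0 applies to the low four
+// lanes and weight_1 to the high four, covering two 4-wide subblocks.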
+static __m128i average_4_4(__m128i sum, const __m128i *mul_constants,
+ const int strength, const int rounding,
+ const int weight_0, const int weight_1) {
+ // _mm_srl_epi16 uses the lower 64 bit value for the shift.
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+ const __m128i weight_u16 =
+ _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1,
+ weight_1, weight_1);
+ const __m128i sixteen = _mm_set1_epi16(16);
+
+ // modifier * 3 / index;
+ sum = _mm_mulhi_epu16(sum, *mul_constants);
+
+ sum = _mm_adds_epu16(sum, rounding_u16);
+ sum = _mm_srl_epi16(sum, strength_u128);
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16 (i.e. NEIGHBOR_CONSTANT_4 - 1), which is 49151 / 0xbfff / -16385
+  // when read as signed, so this needs the epu16 version of min, which was
+  // not available until SSE4.
+ sum = _mm_min_epu16(sum, sixteen);
+
+ sum = _mm_sub_epi16(sixteen, sum);
+
+ return _mm_mullo_epi16(sum, weight_u16);
+}
+
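+// Same as average_8, applied in place to two 8-lane sums, each with its own
+// multiplier constants.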
+static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
+ const __m128i *mul_constants_0,
+ const __m128i *mul_constants_1,
+ const int strength, const int rounding,
+ const int weight) {
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
+ const __m128i rounding_u16 = _mm_set1_epi16(rounding);
+ const __m128i weight_u16 = _mm_set1_epi16(weight);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ __m128i input_0, input_1;
+
+ input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0);
+ input_0 = _mm_adds_epu16(input_0, rounding_u16);
+
+ input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1);
+ input_1 = _mm_adds_epu16(input_1, rounding_u16);
+
+ input_0 = _mm_srl_epi16(input_0, strength_u128);
+ input_1 = _mm_srl_epi16(input_1, strength_u128);
+
+ input_0 = _mm_min_epu16(input_0, sixteen);
+ input_1 = _mm_min_epu16(input_1, sixteen);
+ input_0 = _mm_sub_epi16(sixteen, input_0);
+ input_1 = _mm_sub_epi16(sixteen, input_1);
+
+ *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
+ *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
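+// The 16-bit products cannot overflow: the modifier is at most 16 * 2 (the
+// maximum filter weight, asserted later in this file) and pred is at most
+// 255, so 32 * 255 = 8160 fits in 16 bits before widening to 32 bits.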
+static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
+ uint16_t *count, uint32_t *accumulator) {
+ const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
+ __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
+ __m128i pred_0_u32, pred_1_u32;
+ __m128i accum_0_u32, accum_1_u32;
+
+ count_u16 = _mm_adds_epu16(count_u16, sum_u16);
+ _mm_storeu_si128((__m128i *)count, count_u16);
+
+ pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+}
+
+static INLINE void accumulate_and_store_16(const __m128i sum_0_u16,
+ const __m128i sum_1_u16,
+ const uint8_t *pred, uint16_t *count,
+ uint32_t *accumulator) {
+ const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
+ count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
+ __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
+ pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
+ __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
+ __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+ count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
+ _mm_storeu_si128((__m128i *)count, count_0_u16);
+
+ count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
+ _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);
+
+ pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
+ pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
+
+ pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
+ pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
+ pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
+ pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
+
+ accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
+ accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
+ accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
+ accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
+
+ accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
+ accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
+ accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
+ accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
+
+ _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
+ _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
+}
+
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
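+// This is the horizontal 3-tap part of the 3x3 neighborhood sum; the vertical
+// part is built by adding three consecutive rows of these sums.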
+static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) {
+ __m128i dist_reg, dist_left, dist_right;
+
+ dist_reg = _mm_loadu_si128((const __m128i *)y_dist);
+ dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1));
+ dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1));
+
+ *sum = _mm_adds_epu16(dist_reg, dist_left);
+ *sum = _mm_adds_epu16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and
+// the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first,
+ __m128i *sum_second) {
+ get_sum_8(y_dist, sum_first);
+ get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in a row of chroma values that corresponds to a row of 16 luma values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+ const uint16_t *v_dist,
+ __m128i *u_first, __m128i *u_second,
+ __m128i *v_first,
+ __m128i *v_second) {
+ if (!ss_x) {
+ // If there is no chroma subsampling in the horizontal direction, then we
+ // need to load 16 entries from chroma.
+ read_dist_16(u_dist, u_first, u_second);
+ read_dist_16(v_dist, v_first, v_second);
+ } else { // ss_x == 1
+    // Otherwise, we only need to load 8 entries and duplicate each one so a
+    // single chroma value lines up with the two luma columns it covers.
+ __m128i u_reg, v_reg;
+
+ read_dist_8(u_dist, &u_reg);
+
+ *u_first = _mm_unpacklo_epi16(u_reg, u_reg);
+ *u_second = _mm_unpackhi_epi16(u_reg, u_reg);
+
+ read_dist_8(v_dist, &v_reg);
+
+ *v_first = _mm_unpacklo_epi16(v_reg, v_reg);
+ *v_second = _mm_unpackhi_epi16(v_reg, v_reg);
+ }
+}
+
+// Horizontally add adjacent pairs of unsigned 16-bit ints in src and store
+// them as signed 32-bit ints in dst.
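+// The blend mask 170 (0b10101010) zeroes the odd 16-bit lanes, so 'even' and
+// 'odd' hold the even- and odd-indexed inputs zero-extended to 32 bits, and
+// adding them yields four 32-bit sums of adjacent pairs.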
+static INLINE void hadd_epu16(__m128i *src, __m128i *dst) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i shift_right = _mm_srli_si128(*src, 2);
+
+ const __m128i odd = _mm_blend_epi16(shift_right, zero, 170);
+ const __m128i even = _mm_blend_epi16(*src, zero, 170);
+
+ *dst = _mm_add_epi32(even, odd);
+}
+
+// Add a row of luma distortion to 8 corresponding chroma mods.
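+// Each chroma sample covers 1, 2, or 4 luma samples depending on ss_x and
+// ss_y: ss_y adds in the matching second luma row, and ss_x pairwise-adds
+// adjacent luma columns via hadd_epu16.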
+static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
+ int ss_x, int ss_y,
+ __m128i *u_mod,
+ __m128i *v_mod) {
+ __m128i y_reg;
+ if (!ss_x) {
+ read_dist_8(y_dist, &y_reg);
+ if (ss_y == 1) {
+ __m128i y_tmp;
+ read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
+
+ y_reg = _mm_adds_epu16(y_reg, y_tmp);
+ }
+ } else {
+ __m128i y_first, y_second;
+ read_dist_16(y_dist, &y_first, &y_second);
+ if (ss_y == 1) {
+ __m128i y_tmp_0, y_tmp_1;
+ read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
+
+ y_first = _mm_adds_epu16(y_first, y_tmp_0);
+ y_second = _mm_adds_epu16(y_second, y_tmp_1);
+ }
+
+ hadd_epu16(&y_first, &y_first);
+ hadd_epu16(&y_second, &y_second);
+
+ y_reg = _mm_packus_epi32(y_first, y_second);
+ }
+
+ *u_mod = _mm_adds_epu16(*u_mod, y_reg);
+ *v_mod = _mm_adds_epu16(*v_mod, y_reg);
+}
+
+// Apply the temporal filter to the luma component. This performs temporal
+// filtering on a luma block of 16 x block_height. If blk_fw is not NULL, use
+// it as an array of size 4 holding the weights for each of the 4 subblocks;
+// otherwise use top_weight for the top half and bottom_weight for the bottom
+// half.
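+//
+// A three-row window of horizontal 3-tap sums (sum_row_1/2/3) is maintained;
+// each iteration shifts the window down one row and computes only the new
+// bottom row. The first and last rows use the neighbors[0] constants since
+// only two rows of sums are available there.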
+static void av1_apply_temporal_filter_luma_16(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
+ uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist,
+ const uint16_t *v_dist, const int16_t *const *neighbors_first,
+ const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
+ const int *blk_fw) {
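+  // (1 << strength) >> 1 is the round-to-nearest bias for the shift by
+  // strength; e.g. strength 6 gives rounding 32, so (x + 32) >> 6 rounds
+  // rather than truncates.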
+ const int rounding = (1 << strength) >> 1;
+ int weight = top_weight;
+
+ __m128i mul_first, mul_second;
+
+ __m128i sum_row_1_first, sum_row_1_second;
+ __m128i sum_row_2_first, sum_row_2_second;
+ __m128i sum_row_3_first, sum_row_3_second;
+
+ __m128i u_first, u_second;
+ __m128i v_first, v_second;
+
+ __m128i sum_row_first;
+ __m128i sum_row_second;
+
+  // Loop variable
+ unsigned int h;
+
+ assert(strength >= 0);
+ assert(strength <= 6);
+
+ assert(block_width == 16);
+
+ (void)block_width;
+
+ // First row
+ mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+ // Add luma values
+ get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
+ get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first);
+ sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second);
+
+ // Add chroma values
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+ &v_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
+ sum_row_second =
+ average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
+ } else {
+ average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
+ strength, rounding, weight);
+ }
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_src += y_src_stride;
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+
+ u_src += uv_src_stride;
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+
+ // Then all the rows except the last one
+ mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
+ mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
+
+ for (h = 1; h < block_height - 1; ++h) {
+    // Move the weight to the bottom half
+ if (!use_whole_blk && h == block_height / 2) {
+ if (blk_fw) {
+ blk_fw += 2;
+ } else {
+ weight = bottom_weight;
+ }
+ }
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+ sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+ get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0 || h % 2 == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
+ &v_first, &v_second);
+
+ u_src += uv_src_stride;
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ }
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+ sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
+ sum_row_second =
+ average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
+ } else {
+ average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
+ strength, rounding, weight);
+ }
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+
+ y_src += y_src_stride;
+ y_pre += y_pre_stride;
+ y_count += y_pre_stride;
+ y_accum += y_pre_stride;
+ y_dist += DIST_STRIDE;
+ }
+
+ // The last row
+ mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
+ mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
+
+ // Shift the rows up
+ sum_row_1_first = sum_row_2_first;
+ sum_row_1_second = sum_row_2_second;
+ sum_row_2_first = sum_row_3_first;
+ sum_row_2_second = sum_row_3_second;
+
+ // Add luma values to the modifier
+ sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
+ sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
+
+ // Add chroma values to the modifier
+ if (ss_y == 0) {
+ // Only calculate the new chroma distortion if we are at a pixel that
+ // corresponds to a new chroma row
+ read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
+ &v_second);
+ }
+
+ sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
+ sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
+ sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ sum_row_first =
+ average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
+ sum_row_second =
+ average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
+ } else {
+ average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
+ strength, rounding, weight);
+ }
+ accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
+ y_accum);
+}
+
+// Perform temporal filter for the luma component.
+static void av1_apply_temporal_filter_luma(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+ uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist,
+ const uint16_t *u_dist, const uint16_t *v_dist) {
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
+ const unsigned int mid_width = block_width >> 1,
+ last_width = block_width - blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const int16_t *const *neighbors_first;
+ const int16_t *const *neighbors_second;
+
+ if (block_width == 16) {
+    // Special case: the block width is 16, so there is only a single 16-pixel
+    // column and we can't use the usual left-middle-right pattern. We also
+    // don't support splitting now.
+ neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+ if (use_whole_blk) {
+ av1_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
+ block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+ y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+ v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+ bottom_weight, NULL);
+ } else {
+ av1_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
+ block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+ y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+ v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
+ // Left
+ neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
+ neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ av1_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
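+  // The remaining columns are processed 16 luma pixels at a time. The
+  // left-half weights apply up to mid_width; blk_fw[1] / blk_fw[3] take over
+  // for the right half when subblock weights are in use.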
+ // Middle First
+ neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
+ for (; blk_col < mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ av1_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
+ ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+ y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+ v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+ bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; blk_col < last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ av1_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
+ ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
+ y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
+ v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
+ bottom_weight, NULL);
+ }
+
+ // Right
+ neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
+ av1_apply_temporal_filter_luma_16(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+ use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+ neighbors_second, top_weight, bottom_weight, NULL);
+}
+
+// Apply the temporal filter to the chroma components. This performs temporal
+// filtering on a chroma block of 8 x uv_height. If blk_fw is not NULL, use it
+// as an array of size 4 holding the weights for each of the 4 subblocks;
+// otherwise use top_weight for the top half and bottom_weight for the bottom
+// half.
+static void av1_apply_temporal_filter_chroma_8(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int uv_block_width,
+ unsigned int uv_block_height, int ss_x, int ss_y, int strength,
+ uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+ const int16_t *const *neighbors, int top_weight, int bottom_weight,
+ const int *blk_fw) {
+ const int rounding = (1 << strength) >> 1;
+ int weight = top_weight;
+
+ __m128i mul;
+
+ __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3;
+ __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3;
+
+ __m128i u_sum_row, v_sum_row;
+
+ // Loop variable
+ unsigned int h;
+
+ (void)uv_block_width;
+
+ // First row
+ mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
+
+ // Add chroma values
+ get_sum_8(u_dist, &u_sum_row_2);
+ get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+
+ u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3);
+
+ get_sum_8(v_dist, &v_sum_row_2);
+ get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+
+ v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ u_sum_row =
+ average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+ v_sum_row =
+ average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+ } else {
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
+ }
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+ u_src += uv_src_stride;
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
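+  // Advance the luma pointers by one or two rows so they stay aligned with
+  // the next chroma row under vertical subsampling.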
+ y_src += y_src_stride * (1 + ss_y);
+ y_pre += y_pre_stride * (1 + ss_y);
+ y_dist += DIST_STRIDE * (1 + ss_y);
+
+ // Then all the rows except the last one
+ mul = _mm_loadu_si128((const __m128i *)neighbors[1]);
+
+ for (h = 1; h < uv_block_height - 1; ++h) {
+    // Move the weight pointer to the bottom half of the block
+ if (h == uv_block_height / 2) {
+ if (blk_fw) {
+ blk_fw += 2;
+ } else {
+ weight = bottom_weight;
+ }
+ }
+
+ // Shift the rows up
+ u_sum_row_1 = u_sum_row_2;
+ u_sum_row_2 = u_sum_row_3;
+
+ v_sum_row_1 = v_sum_row_2;
+ v_sum_row_2 = v_sum_row_3;
+
+ // Add chroma values
+ u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+ get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
+ u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3);
+
+ v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+ get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
+ v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0],
+ blk_fw[1]);
+ v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0],
+ blk_fw[1]);
+ } else {
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
+ }
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+
+ u_src += uv_src_stride;
+ u_pre += uv_pre_stride;
+ u_dist += DIST_STRIDE;
+ v_src += uv_src_stride;
+ v_pre += uv_pre_stride;
+ v_dist += DIST_STRIDE;
+ u_count += uv_pre_stride;
+ u_accum += uv_pre_stride;
+ v_count += uv_pre_stride;
+ v_accum += uv_pre_stride;
+
+ y_src += y_src_stride * (1 + ss_y);
+ y_pre += y_pre_stride * (1 + ss_y);
+ y_dist += DIST_STRIDE * (1 + ss_y);
+ }
+
+ // The last row
+ mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
+
+ // Shift the rows up
+ u_sum_row_1 = u_sum_row_2;
+ u_sum_row_2 = u_sum_row_3;
+
+ v_sum_row_1 = v_sum_row_2;
+ v_sum_row_2 = v_sum_row_3;
+
+ // Add chroma values
+ u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
+ v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
+
+ // Add luma values
+ add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
+
+ // Get modifier and store result
+ if (blk_fw) {
+ u_sum_row =
+ average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+ v_sum_row =
+ average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
+ } else {
+ u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
+ v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
+ }
+
+ accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
+ accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
+}
+
+// Perform temporal filter for the chroma components.
+static void av1_apply_temporal_filter_chroma(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+ uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
+ const unsigned int uv_width = block_width >> ss_x,
+ uv_height = block_height >> ss_y;
+
+ unsigned int blk_col = 0, uv_blk_col = 0;
+ const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
+ const unsigned int uv_mid_width = uv_width >> 1,
+ uv_last_width = uv_width - uv_blk_col_step;
+ int top_weight = blk_fw[0],
+ bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
+ const int16_t *const *neighbors;
+
+ if (uv_width == 8) {
+    // Special case: we are subsampling in the x direction on a 16x16 block.
+    // Since we are operating on a row of 8 chroma pixels, we can't use the
+    // usual left-middle-right pattern.
+ assert(ss_x);
+
+ if (ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
+ }
+
+ if (use_whole_blk) {
+ av1_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+ top_weight, bottom_weight, NULL);
+ } else {
+ av1_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+ 0, 0, blk_fw);
+ }
+
+ return;
+ }
+
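+  // Select the neighbor-count tables by how many directions are subsampled:
+  // both (double), exactly one (single), or neither.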
+ // Left
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
+ }
+
+ av1_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+ strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+
+ blk_col += blk_col_step;
+ uv_blk_col += uv_blk_col_step;
+
+ // Middle First
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
+ }
+
+ for (; uv_blk_col < uv_mid_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ av1_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+ top_weight, bottom_weight, NULL);
+ }
+
+ if (!use_whole_blk) {
+ top_weight = blk_fw[1];
+ bottom_weight = blk_fw[3];
+ }
+
+ // Middle Second
+ for (; uv_blk_col < uv_last_width;
+ blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
+ av1_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
+ top_weight, bottom_weight, NULL);
+ }
+
+ // Right
+ if (ss_x && ss_y) {
+ neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else if (ss_x || ss_y) {
+ neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
+ } else {
+ neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
+ }
+
+ av1_apply_temporal_filter_chroma_8(
+ y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
+ u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
+ v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
+ strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+ v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+ u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+ bottom_weight, NULL);
+}
+
+void av1_apply_temporal_filter_sse4_1(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
+ uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
+ uint32_t *v_accum, uint16_t *v_count) {
+ const unsigned int chroma_height = block_height >> ss_y,
+ chroma_width = block_width >> ss_x;
+
+ DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
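+  // The dist buffers are zero-initialized and indexed from column 1, so the
+  // padding column on each side contributes nothing when the edge sums read
+  // one entry outside the block.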
+ const int *blk_fw_ptr = blk_fw;
+
+ uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
+ *v_dist_ptr = v_dist + 1;
+ const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
+ const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
+
+ // Loop variables
+ unsigned int row, blk_col;
+
+ assert(block_width <= BW && "block width too large");
+ assert(block_height <= BH && "block height too large");
+ assert(block_width % 16 == 0 && "block width must be multiple of 16");
+ assert(block_height % 2 == 0 && "block height must be even");
+ assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
+ "invalid chroma subsampling");
+ assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
+  assert(blk_fw[0] >= 0 && "filter weight must be non-negative");
+  assert(
+      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
+      "subblock filter weight must be non-negative");
+  assert(blk_fw[0] <= 2 && "filter weight must not exceed 2");
+  assert(
+      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
+      "subblock filter weight must not exceed 2");
+
+  // Precompute the squared differences.
+ for (row = 0; row < block_height; row++) {
+ for (blk_col = 0; blk_col < block_width; blk_col += 16) {
+ store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
+ y_dist_ptr + blk_col);
+ }
+ y_src_ptr += y_src_stride;
+ y_pre_ptr += y_pre_stride;
+ y_dist_ptr += DIST_STRIDE;
+ }
+
+ for (row = 0; row < chroma_height; row++) {
+ for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
+ store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
+ u_dist_ptr + blk_col);
+ store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
+ v_dist_ptr + blk_col);
+ }
+
+ u_src_ptr += uv_src_stride;
+ u_pre_ptr += uv_pre_stride;
+ u_dist_ptr += DIST_STRIDE;
+ v_src_ptr += uv_src_stride;
+ v_pre_ptr += uv_pre_stride;
+ v_dist_ptr += DIST_STRIDE;
+ }
+
+ y_dist_ptr = y_dist + 1;
+ u_dist_ptr = u_dist + 1;
+ v_dist_ptr = v_dist + 1;
+
+ av1_apply_temporal_filter_luma(
+ y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
+ u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+ strength, blk_fw_ptr, use_whole_blk, y_accum, y_count, y_dist_ptr,
+ u_dist_ptr, v_dist_ptr);
+
+ av1_apply_temporal_filter_chroma(
+ y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
+ u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
+ strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
+ y_dist_ptr, u_dist_ptr, v_dist_ptr);
+}
diff --git a/libaom/build/cmake/aom_config_defaults.cmake b/libaom/build/cmake/aom_config_defaults.cmake
index feb9b5e..f498acd 100644
--- a/libaom/build/cmake/aom_config_defaults.cmake
+++ b/libaom/build/cmake/aom_config_defaults.cmake
@@ -101,8 +101,6 @@ set_aom_config_var(CONFIG_DENOISE 1 NUMBER
"Denoise/noise modeling support in encoder.")
set_aom_config_var(CONFIG_FILEOPTIONS 1 NUMBER
"Enables encoder config file support.")
-set_aom_config_var(CONFIG_FIX_GF_LENGTH 1 NUMBER
- "Fix the GF length if possible")
set_aom_config_var(CONFIG_INSPECTION 0 NUMBER "Enables bitstream inspection.")
set_aom_config_var(CONFIG_INTERNAL_STATS 0 NUMBER
"Enables internal encoder stats.")
@@ -112,34 +110,29 @@ set_aom_config_var(CONFIG_MAX_DECODE_PROFILE 2 NUMBER
"Max profile to support decoding.")
set_aom_config_var(CONFIG_NORMAL_TILE_MODE 0 NUMBER
"Only enables normal tile mode.")
-set_aom_config_var(
- CONFIG_REDUCED_ENCODER_BORDER 0 NUMBER
- "Enable reduced border extention for encoder. \
- Disables superres and resize support."
- )
set_aom_config_var(CONFIG_SIZE_LIMIT 0 NUMBER "Limit max decode width/height.")
set_aom_config_var(CONFIG_SPATIAL_RESAMPLING 1 NUMBER "Spatial resampling.")
set_aom_config_var(DECODE_HEIGHT_LIMIT 0 NUMBER "Set limit for decode height.")
set_aom_config_var(DECODE_WIDTH_LIMIT 0 NUMBER "Set limit for decode width.")
-set_aom_config_var(CONFIG_GLOBAL_MOTION_SEARCH 1 NUMBER
- "Global motion search flag.")
# AV1 experiment flags.
-set_aom_config_var(CONFIG_COLLECT_INTER_MODE_RD_STATS 1 NUMBER
- "AV1 experiment flag.")
+set_aom_config_var(CONFIG_SPEED_STATS 0 NUMBER "AV1 experiment flag.")
set_aom_config_var(CONFIG_COLLECT_RD_STATS 0 NUMBER "AV1 experiment flag.")
set_aom_config_var(CONFIG_DIST_8X8 0 NUMBER "AV1 experiment flag.")
set_aom_config_var(CONFIG_ENTROPY_STATS 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_FP_MB_STATS 0 NUMBER "AV1 experiment flag.")
set_aom_config_var(CONFIG_INTER_STATS_ONLY 0 NUMBER "AV1 experiment flag.")
set_aom_config_var(CONFIG_RD_DEBUG 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL 1 NUMBER
+set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL_START 1 NUMBER
+ "AV1 experiment flag.")
+set_aom_config_var(CONFIG_2PASS_PARTITION_SEARCH_LVL_END 3 NUMBER
"AV1 experiment flag.")
set_aom_config_var(CONFIG_SHARP_SETTINGS 0 NUMBER "AV1 experiment flag.")
-set_aom_config_var(CONFIG_ONE_PASS_SVM 0 NUMBER "AV1 experiment flag.")
set_aom_config_var(CONFIG_DISABLE_FULL_PIXEL_SPLIT_8X8 1 NUMBER
- "Disable full_pixel_motion_search_based_split on BLOCK_8X8")
-
+ "Disable full_pixel_motion_search_based_split on BLOCK_8X8.")
+set_aom_config_var(CONFIG_COLLECT_PARTITION_STATS 0 NUMBER
+ "Collect stats on partition decisions.")
+set_aom_config_var(CONFIG_COLLECT_COMPONENT_TIMING 0 NUMBER
+ "Collect encoding component timing information.")
#
# Variables in this section control optional features of the build system.
#
diff --git a/libaom/build/cmake/aom_experiment_deps.cmake b/libaom/build/cmake/aom_experiment_deps.cmake
index 0688704..2e36157 100644
--- a/libaom/build/cmake/aom_experiment_deps.cmake
+++ b/libaom/build/cmake/aom_experiment_deps.cmake
@@ -21,10 +21,6 @@ macro(fix_experiment_configs)
change_config_and_warn(CONFIG_INSPECTION 1 CONFIG_ANALYZER)
endif()
- if(CONFIG_RD_DEBUG)
- change_config_and_warn(CONFIG_RD_DEBUG 0 CONFIG_JNT_COMP)
- endif()
-
if(CONFIG_DIST_8X8 AND CONFIG_MULTITHREAD)
change_config_and_warn(CONFIG_DIST_8X8 0 CONFIG_MULTITHREAD)
endif()
diff --git a/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake b/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake
index b5b2ff1..bfeac92 100644
--- a/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake
+++ b/libaom/build/cmake/toolchains/arm64-mingw-gcc.cmake
@@ -27,6 +27,3 @@ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
# No runtime cpu detect for arm64-mingw-gcc.
set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
-
-# Disable the use of the gtest's CMake support.
-set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake b/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake
index 7d3d630..6cbc2a8 100644
--- a/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake
+++ b/libaom/build/cmake/toolchains/armv7-linux-gcc.cmake
@@ -28,16 +28,13 @@ endif()
set(CMAKE_C_COMPILER ${CROSS}gcc)
set(CMAKE_CXX_COMPILER ${CROSS}g++)
set(AS_EXECUTABLE ${CROSS}as)
-set(CMAKE_C_COMPILER_ARG1
- "-march=armv7-a -mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
-set(CMAKE_CXX_COMPILER_ARG1
- "-march=armv7-a -mfpu=neon ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
+set(CMAKE_C_COMPILER_ARG1 "-march=armv7-a ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
+set(CMAKE_CXX_COMPILER_ARG1 "-march=armv7-a ${AOM_EXTRA_TOOLCHAIN_FLAGS}")
set(AOM_AS_FLAGS --defsym ARCHITECTURE=7 -march=armv7-a -mfpu=neon
${AOM_EXTRA_TOOLCHAIN_FLAGS})
set(CMAKE_SYSTEM_PROCESSOR "armv7")
-# No intrinsics flag required for armv7-linux-gcc.
-set(AOM_NEON_INTRIN_FLAG "")
+set(AOM_NEON_INTRIN_FLAG "-mfpu=neon")
# No runtime cpu detect for armv7-linux-gcc.
set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
diff --git a/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake b/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake
index cf06a11..eb488ec 100644
--- a/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake
+++ b/libaom/build/cmake/toolchains/armv7-mingw-gcc.cmake
@@ -27,6 +27,3 @@ set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
# No runtime cpu detect for armv7-mingw-gcc.
set(CONFIG_RUNTIME_CPU_DETECT 0 CACHE NUMBER "")
-
-# Disable the use of the gtest's CMake support.
-set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake b/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake
index c986c4e..4839c9d 100644
--- a/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake
+++ b/libaom/build/cmake/toolchains/x86-mingw-gcc.cmake
@@ -26,6 +26,3 @@ set(CMAKE_C_COMPILER ${CROSS}gcc)
set(CMAKE_CXX_COMPILER ${CROSS}g++)
set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
-
-# Disable the use of the gtest's CMake support.
-set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake b/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
index 00d94d5..4b2d28d 100644
--- a/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
+++ b/libaom/build/cmake/toolchains/x86_64-mingw-gcc.cmake
@@ -24,6 +24,3 @@ set(CMAKE_C_COMPILER ${CROSS}gcc)
set(CMAKE_CXX_COMPILER ${CROSS}g++)
set(CMAKE_AR ${CROSS}ar CACHE FILEPATH Archiver)
set(CMAKE_RANLIB ${CROSS}ranlib CACHE FILEPATH Indexer)
-
-# Disable the use of the gtest's CMake support.
-set(AOM_DISABLE_GTEST_CMAKE 1)
diff --git a/libaom/common/av1_config.c b/libaom/common/av1_config.c
index e8decf7..90955fb 100644
--- a/libaom/common/av1_config.c
+++ b/libaom/common/av1_config.c
@@ -322,7 +322,7 @@ static int parse_sequence_header(const uint8_t *const buffer, size_t length,
AV1C_READ_BITS_OR_RETURN_ERROR(max_frame_height_minus_1,
frame_height_bits_minus_1 + 1);
- int frame_id_numbers_present = 0;
+ uint8_t frame_id_numbers_present = 0;
if (!reduced_still_picture_header) {
AV1C_READ_BIT_OR_RETURN_ERROR(frame_id_numbers_present_flag);
frame_id_numbers_present = frame_id_numbers_present_flag;
@@ -345,7 +345,7 @@ static int parse_sequence_header(const uint8_t *const buffer, size_t length,
AV1C_READ_BIT_OR_RETURN_ERROR(enable_order_hint);
if (enable_order_hint) {
- AV1C_READ_BIT_OR_RETURN_ERROR(enable_jnt_comp);
+ AV1C_READ_BIT_OR_RETURN_ERROR(enable_dist_wtd_comp);
AV1C_READ_BIT_OR_RETURN_ERROR(enable_ref_frame_mvs);
}
diff --git a/libaom/common/rawenc.c b/libaom/common/rawenc.c
index 5a2731d..b72132c 100644
--- a/libaom/common/rawenc.c
+++ b/libaom/common/rawenc.c
@@ -9,36 +9,88 @@
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
+#include <stdbool.h>
#include "common/rawenc.h"
-void raw_write_image_file(const aom_image_t *img, const int *planes,
- const int num_planes, FILE *file) {
- const int bytes_per_sample = ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
- for (int i = 0; i < num_planes; ++i) {
- const int plane = planes[i];
- const unsigned char *buf = img->planes[plane];
- const int stride = img->stride[plane];
- const int w = aom_img_plane_width(img, plane);
- const int h = aom_img_plane_height(img, plane);
- for (int y = 0; y < h; ++y) {
- fwrite(buf, bytes_per_sample, w, file);
- buf += stride;
+#define BATCH_SIZE 8
+// When writing greyscale color, batch the writes: each batch covers 8 samples
+// for low bit-depth and 4 samples for high bit-depth.
+static const uint8_t batched[BATCH_SIZE] = { 128, 128, 128, 128,
+ 128, 128, 128, 128 };
+static const uint8_t batched_hbd[BATCH_SIZE] = {
+ 0, 128, 0, 128, 0, 128, 0, 128
+};
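+// Both arrays are BATCH_SIZE bytes; a high bit-depth sample occupies two
+// bytes, so one batch covers BATCH_SIZE / 2 samples in that case.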
+
+// Interface to writing to either a file or MD5Context. Takes a pointer to
+// either the file or MD5Context, the buffer, the size of each element, and
+// number of elements to write. Note that size and nmemb (last two args) must
+// be unsigned int, as the interface to MD5Update requires that.
+typedef void (*WRITER)(void *, const uint8_t *, unsigned int, unsigned int);
+
+static void write_file(void *fp, const uint8_t *buffer, unsigned int size,
+ unsigned int nmemb) {
+ fwrite(buffer, size, nmemb, (FILE *)fp);
+}
+
+static void write_md5(void *md5, const uint8_t *buffer, unsigned int size,
+ unsigned int nmemb) {
+ MD5Update((MD5Context *)md5, buffer, size * nmemb);
+}
+
+// Writes out n greyscale values.
+static void write_greyscale(const bool high_bitdepth, int n, WRITER writer_func,
+ void *file_or_md5) {
+ const uint8_t *b = batched;
+ if (high_bitdepth) {
+ b = batched_hbd;
+ }
+ const int num_batched_writes =
+ high_bitdepth ? n / (BATCH_SIZE / 2) : n / BATCH_SIZE;
+ for (int i = 0; i < num_batched_writes; ++i) {
+ writer_func(file_or_md5, b, sizeof(uint8_t), BATCH_SIZE);
+ }
+ const int remaining = high_bitdepth ? n % (BATCH_SIZE / 2) : n % BATCH_SIZE;
+ for (int i = 0; i < remaining; ++i) {
+ if (high_bitdepth) {
+ writer_func(file_or_md5, batched_hbd, sizeof(uint8_t), 2);
+ } else {
+ writer_func(file_or_md5, batched, sizeof(uint8_t), 1);
}
}
}
-void raw_update_image_md5(const aom_image_t *img, const int *planes,
- const int num_planes, MD5Context *md5) {
+// Encapsulates the logic for writing raw data to either an image file or
+// to an MD5 context.
+static void raw_write_image_file_or_md5(const aom_image_t *img,
+ const int *planes, const int num_planes,
+ void *file_or_md5, WRITER writer_func) {
+ const bool high_bitdepth = img->fmt & AOM_IMG_FMT_HIGHBITDEPTH;
+ const int bytes_per_sample = high_bitdepth ? 2 : 1;
for (int i = 0; i < num_planes; ++i) {
const int plane = planes[i];
+ const int w = aom_img_plane_width(img, plane);
+ const int h = aom_img_plane_height(img, plane);
+ // If we're on a color plane and the output is monochrome, write a greyscale
+ // value. Since there are only YUV planes, compare against Y.
+ if (img->monochrome && plane != AOM_PLANE_Y) {
+ write_greyscale(high_bitdepth, w * h, writer_func, file_or_md5);
+ continue;
+ }
const unsigned char *buf = img->planes[plane];
const int stride = img->stride[plane];
- const int w = aom_img_plane_width(img, plane) *
- ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1);
- const int h = aom_img_plane_height(img, plane);
for (int y = 0; y < h; ++y) {
- MD5Update(md5, buf, w);
+ writer_func(file_or_md5, buf, bytes_per_sample, w);
buf += stride;
}
}
}
+
+void raw_write_image_file(const aom_image_t *img, const int *planes,
+ const int num_planes, FILE *file) {
+ raw_write_image_file_or_md5(img, planes, num_planes, file, write_file);
+}
+
+void raw_update_image_md5(const aom_image_t *img, const int *planes,
+ const int num_planes, MD5Context *md5) {
+ raw_write_image_file_or_md5(img, planes, num_planes, md5, write_md5);
+}
diff --git a/libaom/common/tools_common.c b/libaom/common/tools_common.c
index 2e32f61..51c1c52 100644
--- a/libaom/common/tools_common.c
+++ b/libaom/common/tools_common.c
@@ -149,6 +149,11 @@ const AvxInterface *get_aom_encoder_by_name(const char *name) {
return NULL;
}
+
+// large scale tile encoding
+static const AvxInterface aom_lst_encoder = { "av1", LST_FOURCC,
+ &aom_codec_av1_cx };
+const AvxInterface *get_aom_lst_encoder(void) { return &aom_lst_encoder; }
#endif // CONFIG_AV1_ENCODER
#if CONFIG_AV1_DECODER
diff --git a/libaom/common/tools_common.h b/libaom/common/tools_common.h
index df3b62b..d9a68f0 100644
--- a/libaom/common/tools_common.h
+++ b/libaom/common/tools_common.h
@@ -18,6 +18,7 @@
#include "aom/aom_codec.h"
#include "aom/aom_image.h"
#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
#include "aom_ports/msvc.h"
#if CONFIG_AV1_ENCODER
@@ -78,11 +79,14 @@ enum VideoFileType {
};
// Used in lightfield example.
-typedef enum OUTPUT_FORMAT {
+enum {
YUV1D, // 1D tile output for conformance test.
YUV, // Tile output in YUV format.
NV12, // Tile output in NV12 format.
-} OUTPUT_FORMAT;
+} UENUM1BYTE(OUTPUT_FORMAT);
+
+// The fourcc for large_scale_tile encoding is "LSTC".
+#define LST_FOURCC 0x4354534c
struct FileTypeDetectionBuffer {
char buf[4];
@@ -149,6 +153,7 @@ typedef struct AvxInterface {
int get_aom_encoder_count(void);
const AvxInterface *get_aom_encoder_by_index(int i);
const AvxInterface *get_aom_encoder_by_name(const char *name);
+const AvxInterface *get_aom_lst_encoder(void);
int get_aom_decoder_count(void);
const AvxInterface *get_aom_decoder_by_index(int i);
diff --git a/libaom/common/video_reader.c b/libaom/common/video_reader.c
index 47ad6e1..7b021bc 100644
--- a/libaom/common/video_reader.c
+++ b/libaom/common/video_reader.c
@@ -121,3 +121,7 @@ FILE *aom_video_reader_get_file(AvxVideoReader *reader) {
const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader) {
return &reader->info;
}
+
+void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc) {
+ reader->info.codec_fourcc = fourcc;
+}
diff --git a/libaom/common/video_reader.h b/libaom/common/video_reader.h
index 903deae..9ab439e 100644
--- a/libaom/common/video_reader.h
+++ b/libaom/common/video_reader.h
@@ -50,6 +50,9 @@ FILE *aom_video_reader_get_file(AvxVideoReader *reader);
// Fills AvxVideoInfo with information from opened video file.
const AvxVideoInfo *aom_video_reader_get_info(AvxVideoReader *reader);
+// Set fourcc.
+void aom_video_reader_set_fourcc(AvxVideoReader *reader, uint32_t fourcc);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/libaom/common/video_writer.c b/libaom/common/video_writer.c
index a7ec309..2b42e36 100644
--- a/libaom/common/video_writer.c
+++ b/libaom/common/video_writer.c
@@ -75,3 +75,7 @@ int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,
return 1;
}
+
+void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc) {
+ writer->info.codec_fourcc = fourcc;
+}
diff --git a/libaom/common/video_writer.h b/libaom/common/video_writer.h
index 3e2b655..8712d47 100644
--- a/libaom/common/video_writer.h
+++ b/libaom/common/video_writer.h
@@ -14,7 +14,7 @@
#include "common/video_common.h"
-typedef enum { kContainerIVF } AvxContainer;
+enum { kContainerIVF } UENUM1BYTE(AvxContainer);
struct AvxVideoWriterStruct;
typedef struct AvxVideoWriterStruct AvxVideoWriter;
@@ -37,6 +37,8 @@ void aom_video_writer_close(AvxVideoWriter *writer);
// Writes frame bytes to the file.
int aom_video_writer_write_frame(AvxVideoWriter *writer, const uint8_t *buffer,
size_t size, int64_t pts);
+// Set fourcc.
+void aom_video_writer_set_fourcc(AvxVideoWriter *writer, uint32_t fourcc);
#ifdef __cplusplus
} // extern "C"
diff --git a/libaom/common/webmenc.h b/libaom/common/webmenc.h
index 4cdfd68..a4aa992 100644
--- a/libaom/common/webmenc.h
+++ b/libaom/common/webmenc.h
@@ -30,13 +30,13 @@ struct WebmOutputContext {
};
/* Stereo 3D packed frame format */
-typedef enum stereo_format {
+enum {
STEREO_FORMAT_MONO = 0,
STEREO_FORMAT_LEFT_RIGHT = 1,
STEREO_FORMAT_BOTTOM_TOP = 2,
STEREO_FORMAT_TOP_BOTTOM = 3,
STEREO_FORMAT_RIGHT_LEFT = 11
-} stereo_format_t;
+} UENUM1BYTE(stereo_format_t);
// The following functions wrap libwebm's mkvmuxer. All functions return 0 upon
// success, or -1 upon failure.
diff --git a/libaom/examples/analyzer.cc b/libaom/examples/analyzer.cc
index 6a42eca..261d085 100644
--- a/libaom/examples/analyzer.cc
+++ b/libaom/examples/analyzer.cc
@@ -162,7 +162,7 @@ bool AV1Decoder::setInspectionCallback() {
void AV1Decoder::inspect(void *pbi, void *data) {
AV1Decoder *decoder = (AV1Decoder *)data;
- ifd_inspect(&decoder->frame_data, pbi);
+ ifd_inspect(&decoder->frame_data, pbi, 0);
}
#define MIN_ZOOM (1)
diff --git a/libaom/examples/av1_dec_fuzzer.cc b/libaom/examples/av1_dec_fuzzer.cc
new file mode 100644
index 0000000..96d16a8
--- /dev/null
+++ b/libaom/examples/av1_dec_fuzzer.cc
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*
+ * See build_av1_dec_fuzzer.sh for building instructions.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory>
+
+#include "config/aom_config.h"
+#include "aom/aom_decoder.h"
+#include "aom/aomdx.h"
+#include "aom_ports/mem_ops.h"
+#include "common/ivfdec.h"
+
+static void close_file(FILE *file) { fclose(file); }
+
+extern "C" void usage_exit(void) { exit(EXIT_FAILURE); }
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
+ std::unique_ptr<FILE, decltype(&close_file)> file(
+ fmemopen((void *)data, size, "rb"), &close_file);
+ if (file == nullptr) {
+ return 0;
+ }
+
+ char header[32];
+ if (fread(header, 1, 32, file.get()) != 32) {
+ return 0;
+ }
+ const AvxInterface *decoder = get_aom_decoder_by_name("av1");
+ if (decoder == nullptr) {
+ return 0;
+ }
+
+ aom_codec_ctx_t codec;
+ // Set thread count in the range [1, 64].
+ const unsigned int threads = (header[0] & 0x3f) + 1;
+ aom_codec_dec_cfg_t cfg = { threads, 0, 0, CONFIG_LOWBITDEPTH };
+ if (aom_codec_dec_init(&codec, decoder->codec_interface(), &cfg, 0)) {
+ return 0;
+ }
+
+ uint8_t *buffer = nullptr;
+ size_t buffer_size = 0;
+ size_t frame_size = 0;
+ while (!ivf_read_frame(file.get(), &buffer, &frame_size, &buffer_size,
+ nullptr)) {
+ const aom_codec_err_t err =
+ aom_codec_decode(&codec, buffer, frame_size, nullptr);
+ static_cast<void>(err);
+ aom_codec_iter_t iter = nullptr;
+ aom_image_t *img = nullptr;
+ while ((img = aom_codec_get_frame(&codec, &iter)) != nullptr) {
+ }
+ }
+ aom_codec_destroy(&codec);
+ free(buffer);
+ return 0;
+}
diff --git a/libaom/examples/build_av1_dec_fuzzer.sh b/libaom/examples/build_av1_dec_fuzzer.sh
new file mode 100755
index 0000000..86992a0
--- /dev/null
+++ b/libaom/examples/build_av1_dec_fuzzer.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#
+# Copyright (c) 2019, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and
+# the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+# was not distributed with this source code in the LICENSE file, you can
+# obtain it at www.aomedia.org/license/software. If the Alliance for Open
+# Media Patent License 1.0 was not distributed with this source code in the
+# PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+#
+###############################################################################
+# Fuzzer for libaom decoder.
+# ==========================
+# Requirements
+# ---------------------
+# Clang6.0 or above (must support -fsanitize=fuzzer)
+#
+# References:
+# ---------------------
+# http://llvm.org/docs/LibFuzzer.html
+# https://github.com/google/oss-fuzz
+#
+# Steps to build / run
+# ---------------------
+
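+# For example (paths illustrative):
+#   CC=clang CXX=clang++ ./build_av1_dec_fuzzer.sh /path/to/aom /path/to/build
+#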
+set -eu
+
+# Have a copy of AOM and a build directory ready.
+if [[ $# -ne 2 ]]; then
+ echo "Pass in the AOM source tree as first argument, and a build directory "
+ echo "as the second argument. The AOM source tree can be obtained via: "
+ echo " git clone https://aomedia.googlesource.com/aom"
+ exit 2
+fi
+if [[ -z "$CC" ]]; then
+ echo "Set the CC environment variable to point to your C compiler."
+ exit 2
+fi
+if [[ -z "$CXX" ]]; then
+ echo "Set the CXX environment variable to point to your C++ compiler."
+ exit 2
+fi
+
+AOM_DIR=$1
+BUILD_DIR=$2
+# Configure the build with CMake, enabling the address sanitizer.
+# Enable DO_RANGE_CHECK_CLAMP to suppress the noise of integer overflows
+# in the transform functions. Also set memory limits.
+EXTRA_C_FLAGS='-DDO_RANGE_CHECK_CLAMP=1 -DAOM_MAX_ALLOCABLE_MEMORY=1073741824'
+cd "${BUILD_DIR}"
+cmake "${AOM_DIR}" -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCONFIG_PIC=1 \
+ -DCONFIG_SCALABILITY=0 -DCONFIG_LOWBITDEPTH=1 -DCONFIG_AV1_ENCODER=0 \
+ -DENABLE_EXAMPLES=0 -DENABLE_DOCS=0 -DENABLE_TESTS=0 -DCONFIG_SIZE_LIMIT=1 \
+ -DDECODE_HEIGHT_LIMIT=12288 -DDECODE_WIDTH_LIMIT=12288 \
+ -DAOM_EXTRA_C_FLAGS="${EXTRA_C_FLAGS}" \
+ -DAOM_EXTRA_CXX_FLAGS="${EXTRA_C_FLAGS}" -DSANITIZE=address
+
+# Build the codec.
+make -j$(nproc)
+
+# Build some libaom utils that are not part of the core lib.
+$CC -std=c99 -c -I${AOM_DIR} -I${BUILD_DIR} \
+ ${AOM_DIR}/common/ivfdec.c -o ${BUILD_DIR}/ivfdec.o
+
+$CC -std=c99 -c -I${AOM_DIR} -I${BUILD_DIR} \
+ ${AOM_DIR}/common/tools_common.c -o ${BUILD_DIR}/tools_common.o
+
+# Build the av1 fuzzer
+$CXX -std=c++11 -DDECODER=av1 -I${AOM_DIR} -I${BUILD_DIR} \
+ -fsanitize=fuzzer -Wl,--start-group \
+ ${AOM_DIR}/examples/av1_dec_fuzzer.cc -o ${BUILD_DIR}/av1_dec_fuzzer \
+ ${BUILD_DIR}/libaom.a ${BUILD_DIR}/ivfdec.o ${BUILD_DIR}/tools_common.o \
+ -Wl,--end-group
+
+echo "Fuzzer built at ${BUILD_DIR}/av1_dec_fuzzer."
+echo "Create a corpus directory, copy IVF files in there, and run:"
+echo " av1_dec_fuzzer CORPUS_DIR"
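
An illustrative invocation (not part of the patch); note that because the script cd's into the build directory before calling CMake, both paths should be absolute:

    CC=clang CXX=clang++ ./build_av1_dec_fuzzer.sh /abs/path/to/aom /abs/path/to/build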
diff --git a/libaom/examples/inspect.c b/libaom/examples/inspect.c
index 7b7b3cd..9ca2a02 100644
--- a/libaom/examples/inspect.c
+++ b/libaom/examples/inspect.c
@@ -62,7 +62,10 @@ typedef enum {
SEGMENT_ID_LAYER = 1 << 14,
MOTION_MODE_LAYER = 1 << 15,
COMPOUND_TYPE_LAYER = 1 << 16,
- ALL_LAYERS = (1 << 17) - 1
+ INTRABC_LAYER = 1 << 17,
+ PALETTE_LAYER = 1 << 18,
+ UV_PALETTE_LAYER = 1 << 19,
+ ALL_LAYERS = (1 << 20) - 1
} LayerType;
static LayerType layers = 0;
@@ -106,7 +109,20 @@ static const arg_def_t dump_delta_q_arg =
ARG_DEF("dq", "delta_q", 0, "Dump QIndex");
static const arg_def_t dump_seg_id_arg =
ARG_DEF("si", "seg_id", 0, "Dump Segment ID");
+static const arg_def_t dump_intrabc_arg =
+    ARG_DEF("ibc", "intrabc", 0, "Dump if IntraBC is used");
+static const arg_def_t dump_palette_arg =
+ ARG_DEF("plt", "palette", 0, "Dump Palette Size");
+static const arg_def_t dump_uv_palette_arg =
+ ARG_DEF("uvp", "uv_palette", 0, "Dump UV Palette Size");
static const arg_def_t usage_arg = ARG_DEF("h", "help", 0, "Help");
+static const arg_def_t skip_non_transform_arg = ARG_DEF(
+    "snt", "skip_non_transform", 1, "Count skipped blocks as non-transform.");
+static const arg_def_t combined_arg =
+    ARG_DEF("comb", "combined", 1, "Combine parameters into one output.");
+
+int combined_parm_list[15];
+int combined_parm_count = 0;
static const arg_def_t *main_args[] = { &limit_arg,
&dump_all_arg,
@@ -130,7 +146,12 @@ static const arg_def_t *main_args[] = { &limit_arg,
&dump_motion_vectors_arg,
&dump_delta_q_arg,
&dump_seg_id_arg,
+ &dump_intrabc_arg,
+ &dump_palette_arg,
+ &dump_uv_palette_arg,
&usage_arg,
+ &skip_non_transform_arg,
+ &combined_arg,
NULL };
#define ENUM(name) \
{ #name, name }
@@ -158,6 +179,8 @@ const map_entry block_size_map[] = {
ENUM(BLOCK_64X16), LAST_ENUM
};
+#define TX_SKIP -1
+
const map_entry tx_size_map[] = {
ENUM(TX_4X4), ENUM(TX_8X8), ENUM(TX_16X16), ENUM(TX_32X32),
ENUM(TX_64X64), ENUM(TX_4X8), ENUM(TX_8X4), ENUM(TX_8X16),
@@ -225,10 +248,57 @@ const map_entry uv_prediction_mode_map[] = {
const map_entry skip_map[] = { ENUM(SKIP), ENUM(NO_SKIP), LAST_ENUM };
+const map_entry intrabc_map[] = {
+ { "INTRABC", 1 }, { "NO_INTRABC", 0 }, LAST_ENUM
+};
+
+const map_entry palette_map[] = {
+ { "ZERO_COLORS", 0 }, { "TWO_COLORS", 2 }, { "THREE_COLORS", 3 },
+ { "FOUR_COLORS", 4 }, { "FIVE_COLORS", 5 }, { "SIX_COLORS", 6 },
+ { "SEVEN_COLORS", 7 }, { "EIGHT_COLORS", 8 }, LAST_ENUM
+};
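+// Note: AV1 palettes hold between 2 and 8 colors; the 0 entry above marks a
+// block that does not use a palette (a 1-color palette does not exist).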
+
const map_entry config_map[] = { ENUM(MI_SIZE), LAST_ENUM };
static const char *exec_name;
+struct parm_offset {
+ char parm[60];
+ char offset;
+};
+struct parm_offset parm_offsets[] = {
+ { "blockSize", offsetof(insp_mi_data, sb_type) },
+ { "transformSize", offsetof(insp_mi_data, tx_size) },
+ { "transformType", offsetof(insp_mi_data, tx_type) },
+ { "dualFilterType", offsetof(insp_mi_data, dual_filter_type) },
+ { "mode", offsetof(insp_mi_data, mode) },
+ { "uv_mode", offsetof(insp_mi_data, uv_mode) },
+ { "motion_mode", offsetof(insp_mi_data, motion_mode) },
+ { "compound_type", offsetof(insp_mi_data, compound_type) },
+ { "referenceFrame", offsetof(insp_mi_data, ref_frame) },
+ { "skip", offsetof(insp_mi_data, skip) },
+};
+int parm_count = sizeof(parm_offsets) / sizeof(parm_offsets[0]);
+
+int convert_to_indices(char *str, int *indices, int maxCount, int *count) {
+ *count = 0;
+ do {
+ char *comma = strchr(str, ',');
+ int length = (comma ? (int)(comma - str) : (int)strlen(str));
+ int i;
+ for (i = 0; i < parm_count; ++i) {
+ if (!strncmp(str, parm_offsets[i].parm, length)) {
+ break;
+ }
+ }
+    if (i == parm_count) return 0;
+    if (*count >= maxCount) return 0;  // bound check before the write
+    indices[(*count)++] = i;
+    if (!comma) break;  // last token: don't step past the terminating NUL
+    str = comma + 1;
+  } while (strlen(str) > 0);
+ return 1;
+}
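+// Illustration (not part of the tool): with the parm_offsets table above,
+// convert_to_indices("blockSize,mode", idx, 15, &n) returns 1 with n == 2
+// and idx == { 0, 4 }, selecting sb_type and mode from insp_mi_data.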
+
insp_frame_data frame_data;
int frame_count = 0;
int decoded_frame_count = 0;
@@ -399,6 +469,38 @@ int put_motion_vectors(char *buffer) {
return (int)(buf - buffer);
}
+int put_combined(char *buffer) {
+ const int mi_rows = frame_data.mi_rows;
+ const int mi_cols = frame_data.mi_cols;
+ char *buf = buffer;
+ int r, c, p;
+ buf += put_str(buf, " \"");
+ for (p = 0; p < combined_parm_count; ++p) {
+ if (p) buf += put_str(buf, "&");
+ buf += put_str(buf, parm_offsets[combined_parm_list[p]].parm);
+ }
+ buf += put_str(buf, "\": [");
+ for (r = 0; r < mi_rows; ++r) {
+ *(buf++) = '[';
+ for (c = 0; c < mi_cols; ++c) {
+ insp_mi_data *mi = &frame_data.mi_grid[r * mi_cols + c];
+ *(buf++) = '[';
+ for (p = 0; p < combined_parm_count; ++p) {
+ if (p) *(buf++) = ',';
+ int16_t *v = (int16_t *)(((int8_t *)mi) +
+ parm_offsets[combined_parm_list[p]].offset);
+ buf += put_num(buf, 0, v[0], 0);
+ }
+ *(buf++) = ']';
+ if (c < mi_cols - 1) *(buf++) = ',';
+ }
+ *(buf++) = ']';
+ if (r < mi_rows - 1) *(buf++) = ',';
+ }
+ buf += put_str(buf, "],\n");
+ return (int)(buf - buffer);
+}
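+// Illustrative output shape: for --combined=blockSize,mode and a 1x2 mi grid
+// the emitted fragment looks like
+//   "blockSize&mode": [[[3,0],[3,1]]],
+// i.e. rows of columns, with one inner array of values per mi unit.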
+
int put_block_info(char *buffer, const map_entry *map, const char *name,
size_t offset, int len) {
const int mi_rows = frame_data.mi_rows;
@@ -507,9 +609,11 @@ int put_accounting(char *buffer) {
}
#endif
+int skip_non_transform = 0;
+
void inspect(void *pbi, void *data) {
/* Fetch frame data. */
- ifd_inspect(&frame_data, pbi);
+ ifd_inspect(&frame_data, pbi, skip_non_transform);
// Show existing frames just show a reference buffer we've already decoded.
// There's no information to show.
@@ -584,6 +688,19 @@ void inspect(void *pbi, void *data) {
if (layers & MOTION_VECTORS_LAYER) {
buf += put_motion_vectors(buf);
}
+ if (layers & INTRABC_LAYER) {
+ buf += put_block_info(buf, intrabc_map, "intrabc",
+ offsetof(insp_mi_data, intrabc), 0);
+ }
+ if (layers & PALETTE_LAYER) {
+ buf += put_block_info(buf, palette_map, "palette",
+ offsetof(insp_mi_data, palette), 0);
+ }
+ if (layers & UV_PALETTE_LAYER) {
+ buf += put_block_info(buf, palette_map, "uv_palette",
+ offsetof(insp_mi_data, uv_palette), 0);
+ }
+ if (combined_parm_count > 0) buf += put_combined(buf);
if (layers & REFERENCE_FRAME_LAYER) {
buf += put_block_info(buf, refs_map, "referenceFrame",
offsetof(insp_mi_data, ref_frame), 2);
@@ -775,6 +892,12 @@ static void parse_args(char **argv) {
layers |= Q_INDEX_LAYER;
else if (arg_match(&arg, &dump_seg_id_arg, argi))
layers |= SEGMENT_ID_LAYER;
+ else if (arg_match(&arg, &dump_intrabc_arg, argi))
+ layers |= INTRABC_LAYER;
+ else if (arg_match(&arg, &dump_palette_arg, argi))
+ layers |= PALETTE_LAYER;
+ else if (arg_match(&arg, &dump_uv_palette_arg, argi))
+ layers |= UV_PALETTE_LAYER;
else if (arg_match(&arg, &dump_all_arg, argi))
layers |= ALL_LAYERS;
else if (arg_match(&arg, &compress_arg, argi))
@@ -783,6 +906,13 @@ static void parse_args(char **argv) {
usage_exit();
else if (arg_match(&arg, &limit_arg, argi))
stop_after = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &skip_non_transform_arg, argi))
+ skip_non_transform = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &combined_arg, argi))
+ convert_to_indices(
+ (char *)arg.val, combined_parm_list,
+ sizeof(combined_parm_list) / sizeof(combined_parm_list[0]),
+ &combined_parm_count);
else
argj++;
}
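
An illustrative invocation of the extended tool (not part of the patch): "inspect --combined=blockSize,mode --skip_non_transform=1 input.ivf" emits one combined "blockSize&mode" layer per frame; with the skip flag set, ifd_inspect() can presumably report a skipped block's transform size as the new TX_SKIP (-1) sentinel defined above.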
diff --git a/libaom/examples/lightfield_bitstream_parsing.c b/libaom/examples/lightfield_bitstream_parsing.c
index 9c90671..afacf44 100644
--- a/libaom/examples/lightfield_bitstream_parsing.c
+++ b/libaom/examples/lightfield_bitstream_parsing.c
@@ -211,6 +211,8 @@ int main(int argc, char **argv) {
num_references = (int)strtol(argv[3], NULL, 0);
info = aom_video_reader_get_info(reader);
+ aom_video_reader_set_fourcc(reader, AV1_FOURCC);
+
// The writer to write out ivf file in tile list OBU, which can be decoded by
// AV1 decoder.
writer = aom_video_writer_open(argv[2], kContainerIVF, info);
diff --git a/libaom/examples/lightfield_decoder.c b/libaom/examples/lightfield_decoder.c
index 23dac98..7a445f0 100644
--- a/libaom/examples/lightfield_decoder.c
+++ b/libaom/examples/lightfield_decoder.c
@@ -188,8 +188,10 @@ int main(int argc, char **argv) {
info = aom_video_reader_get_info(reader);
- decoder = get_aom_decoder_by_fourcc(info->codec_fourcc);
- if (!decoder) die("Unknown input codec.");
+ if (info->codec_fourcc == LST_FOURCC)
+ decoder = get_aom_decoder_by_fourcc(AV1_FOURCC);
+ else
+ die("Unknown input codec.");
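+  // Lightfield streams are tagged LST_FOURCC by the encoder (see
+  // lightfield_encoder.c below); they are decoded with the ordinary AV1
+  // interface, and any other fourcc is rejected.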
printf("Using %s\n", aom_codec_iface_name(decoder->codec_interface()));
if (aom_codec_dec_init(&codec, decoder->codec_interface(), NULL, 0))
@@ -218,7 +220,7 @@ int main(int argc, char **argv) {
// Allocate memory to store decoded references. Allocate memory with the
// border so that it can be used as a reference.
for (j = 0; j < num_references; j++) {
- unsigned int border = AOM_BORDER_IN_PIXELS;
+ unsigned int border = AOM_DEC_BORDER_IN_PIXELS;
if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
frame_res[0], frame_res[1], 32, 8,
border)) {
diff --git a/libaom/examples/lightfield_encoder.c b/libaom/examples/lightfield_encoder.c
index e55cd5c..4dd71ca 100644
--- a/libaom/examples/lightfield_encoder.c
+++ b/libaom/examples/lightfield_encoder.c
@@ -275,9 +275,13 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
aom_img_fmt_t ref_fmt = AOM_IMG_FMT_I420;
if (!CONFIG_LOWBITDEPTH) ref_fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
// Allocate memory with the border so that it can be used as a reference.
+ int border_in_pixels =
+ (codec.config.enc->rc_resize_mode || codec.config.enc->rc_superres_mode)
+ ? AOM_BORDER_IN_PIXELS
+ : AOM_ENC_NO_SCALE_BORDER;
for (i = 0; i < reference_image_num; i++) {
if (!aom_img_alloc_with_border(&reference_images[i], ref_fmt, cfg->g_w,
- cfg->g_h, 32, 8, AOM_BORDER_IN_PIXELS)) {
+ cfg->g_h, 32, 8, border_in_pixels)) {
die("Failed to allocate image.");
}
}
@@ -393,6 +397,10 @@ static void pass1(aom_image_t *raw, FILE *infile, const char *outfile_name,
for (i = 0; i < reference_image_num; i++) aom_img_free(&reference_images[i]);
if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec.");
+
+  // Tag the large-scale tile output file with the LST fourcc.
+ if (cfg->large_scale_tile == 1)
+ aom_video_writer_set_fourcc(writer, LST_FOURCC);
aom_video_writer_close(writer);
printf("\nSecond pass complete. Processed %d frames.\n", frame_count);
diff --git a/libaom/examples/lightfield_tile_list_decoder.c b/libaom/examples/lightfield_tile_list_decoder.c
index 4aabde1..87a8b43 100644
--- a/libaom/examples/lightfield_tile_list_decoder.c
+++ b/libaom/examples/lightfield_tile_list_decoder.c
@@ -160,7 +160,7 @@ int main(int argc, char **argv) {
// Allocate memory to store decoded references. Allocate memory with the
// border so that it can be used as a reference.
for (j = 0; j < num_references; j++) {
- unsigned int border = AOM_BORDER_IN_PIXELS;
+ unsigned int border = AOM_DEC_BORDER_IN_PIXELS;
if (!aom_img_alloc_with_border(&reference_images[j], ref_fmt,
frame_res[0], frame_res[1], 32, 8,
border)) {
diff --git a/libaom/test/av1_convolve_2d_test.cc b/libaom/test/av1_convolve_2d_test.cc
index 825cef2..b0cef81 100644
--- a/libaom/test/av1_convolve_2d_test.cc
+++ b/libaom/test/av1_convolve_2d_test.cc
@@ -19,6 +19,7 @@ using libaom_test::AV1HighbdConvolve2D::AV1HighbdConvolve2DSrTest;
using libaom_test::AV1HighbdConvolve2D::AV1HighbdJntConvolve2DTest;
using ::testing::make_tuple;
using ::testing::tuple;
+
namespace {
TEST_P(AV1Convolve2DSrTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
@@ -89,72 +90,72 @@ INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1Convolve2DSrTest,
TEST_P(AV1JntConvolve2DTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
TEST_P(AV1JntConvolve2DTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
-INSTANTIATE_TEST_CASE_P(
- C_COPY, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_copy_c, 0, 0));
+INSTANTIATE_TEST_CASE_P(C_COPY, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_dist_wtd_convolve_2d_copy_c, 0, 0));
INSTANTIATE_TEST_CASE_P(
C_X, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_c, 1, 0));
+ libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_x_c, 1, 0));
INSTANTIATE_TEST_CASE_P(
C_Y, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_c, 0, 1));
+ libaom_test::AV1Convolve2D::BuildParams(av1_dist_wtd_convolve_y_c, 0, 1));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(SSE2_COPY, AV1JntConvolve2DTest,
libaom_test::AV1Convolve2D::BuildParams(
- av1_jnt_convolve_2d_copy_sse2, 0, 0));
-INSTANTIATE_TEST_CASE_P(
- SSE2, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_sse2, 1, 1));
+ av1_dist_wtd_convolve_2d_copy_sse2, 0, 0));
+INSTANTIATE_TEST_CASE_P(SSE2, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_dist_wtd_convolve_2d_sse2, 1, 1));
-INSTANTIATE_TEST_CASE_P(
- SSE2_X, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_sse2, 1, 0));
+INSTANTIATE_TEST_CASE_P(SSE2_X, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_dist_wtd_convolve_x_sse2, 1, 0));
-INSTANTIATE_TEST_CASE_P(
- SSE2_Y, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_sse2, 0, 1));
+INSTANTIATE_TEST_CASE_P(SSE2_Y, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_dist_wtd_convolve_y_sse2, 0, 1));
#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
- SSSE3, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_ssse3, 1, 1));
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_dist_wtd_convolve_2d_ssse3, 1, 1));
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1JntConvolve2DTest,
libaom_test::AV1Convolve2D::BuildParams(
- av1_jnt_convolve_2d_copy_avx2, 0, 0));
-INSTANTIATE_TEST_CASE_P(
- AVX2_X, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_avx2, 1, 0));
+ av1_dist_wtd_convolve_2d_copy_avx2, 0, 0));
+INSTANTIATE_TEST_CASE_P(AVX2_X, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_dist_wtd_convolve_x_avx2, 1, 0));
-INSTANTIATE_TEST_CASE_P(
- AVX2_Y, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_avx2, 0, 1));
+INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_dist_wtd_convolve_y_avx2, 0, 1));
-INSTANTIATE_TEST_CASE_P(
- AVX2, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_avx2, 1, 1));
+INSTANTIATE_TEST_CASE_P(AVX2, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_dist_wtd_convolve_2d_avx2, 1, 1));
#endif // HAVE_AVX2
#endif // HAVE_SSSE3
#endif // HAVE_SSE2
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(NEON_COPY, AV1JntConvolve2DTest,
libaom_test::AV1Convolve2D::BuildParams(
- av1_jnt_convolve_2d_copy_neon, 0, 0));
+ av1_dist_wtd_convolve_2d_copy_neon, 0, 0));
-INSTANTIATE_TEST_CASE_P(
- NEON, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_2d_neon, 1, 1));
-INSTANTIATE_TEST_CASE_P(
- NEON_X, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_x_neon, 1, 0));
+INSTANTIATE_TEST_CASE_P(NEON, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_dist_wtd_convolve_2d_neon, 1, 1));
+INSTANTIATE_TEST_CASE_P(NEON_X, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_dist_wtd_convolve_x_neon, 1, 0));
-INSTANTIATE_TEST_CASE_P(
- NEON_Y, AV1JntConvolve2DTest,
- libaom_test::AV1Convolve2D::BuildParams(av1_jnt_convolve_y_neon, 0, 1));
+INSTANTIATE_TEST_CASE_P(NEON_Y, AV1JntConvolve2DTest,
+ libaom_test::AV1Convolve2D::BuildParams(
+ av1_dist_wtd_convolve_y_neon, 0, 1));
#endif // HAVE_NEON
TEST_P(AV1HighbdConvolve2DSrTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
@@ -213,41 +214,41 @@ TEST_P(AV1HighbdJntConvolve2DTest, DISABLED_Speed) {
INSTANTIATE_TEST_CASE_P(C_X, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_jnt_convolve_x_c, 1, 0));
+ av1_highbd_dist_wtd_convolve_x_c, 1, 0));
INSTANTIATE_TEST_CASE_P(C_Y, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_jnt_convolve_y_c, 0, 1));
+ av1_highbd_dist_wtd_convolve_y_c, 0, 1));
INSTANTIATE_TEST_CASE_P(C_COPY, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_jnt_convolve_2d_copy_c, 0, 0));
+ av1_highbd_dist_wtd_convolve_2d_copy_c, 0, 0));
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(SSE4_1_COPY, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_jnt_convolve_2d_copy_sse4_1, 0, 0));
+ av1_highbd_dist_wtd_convolve_2d_copy_sse4_1, 0, 0));
INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_jnt_convolve_2d_sse4_1, 1, 1));
+ av1_highbd_dist_wtd_convolve_2d_sse4_1, 1, 1));
INSTANTIATE_TEST_CASE_P(SSE4_1_X, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_jnt_convolve_x_sse4_1, 1, 0));
+ av1_highbd_dist_wtd_convolve_x_sse4_1, 1, 0));
INSTANTIATE_TEST_CASE_P(SSE4_1_Y, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_jnt_convolve_y_sse4_1, 0, 1));
+ av1_highbd_dist_wtd_convolve_y_sse4_1, 0, 1));
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2_COPY, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_jnt_convolve_2d_copy_avx2, 0, 0));
+ av1_highbd_dist_wtd_convolve_2d_copy_avx2, 0, 0));
INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_jnt_convolve_2d_avx2, 1, 1));
+ av1_highbd_dist_wtd_convolve_2d_avx2, 1, 1));
INSTANTIATE_TEST_CASE_P(AVX2_X, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_jnt_convolve_x_avx2, 1, 0));
+ av1_highbd_dist_wtd_convolve_x_avx2, 1, 0));
INSTANTIATE_TEST_CASE_P(AVX2_Y, AV1HighbdJntConvolve2DTest,
libaom_test::AV1HighbdConvolve2D::BuildParams(
- av1_highbd_jnt_convolve_y_avx2, 0, 1));
+ av1_highbd_dist_wtd_convolve_y_avx2, 0, 1));
#endif // HAVE_AVX2
#endif // HAVE_SSE4_1
} // namespace
diff --git a/libaom/test/av1_convolve_2d_test_util.cc b/libaom/test/av1_convolve_2d_test_util.cc
index 409fd23..9cfe3e6 100644
--- a/libaom/test/av1_convolve_2d_test_util.cc
+++ b/libaom/test/av1_convolve_2d_test_util.cc
@@ -200,9 +200,9 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
ConvolveParams conv_params2 =
get_conv_params_no_round(do_average, 0, output2, MAX_SB_SIZE, 1, 8);
- // Test special case where jnt_comp_avg is not used
- conv_params1.use_jnt_comp_avg = 0;
- conv_params2.use_jnt_comp_avg = 0;
+ // Test special case where dist_wtd_comp_avg is not used
+ conv_params1.use_dist_wtd_comp_avg = 0;
+ conv_params2.use_dist_wtd_comp_avg = 0;
const int subx_range = has_subx ? 16 : 1;
const int suby_range = has_suby ? 16 : 1;
@@ -211,9 +211,10 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
// Choose random locations within the source block
const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
- av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w, output8_1,
- MAX_SB_SIZE, out_w, out_h, filter_params_x,
- filter_params_y, subx, suby, &conv_params1);
+ av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w,
+ output8_1, MAX_SB_SIZE, out_w, out_h,
+ filter_params_x, filter_params_y, subx,
+ suby, &conv_params1);
test_impl(input + offset_r * w + offset_c, w, output8_2,
MAX_SB_SIZE, out_w, out_h, filter_params_x,
filter_params_y, subx, suby, &conv_params2);
@@ -222,7 +223,7 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
for (int j = 0; j < out_w; ++j) {
int idx = i * MAX_SB_SIZE + j;
ASSERT_EQ(output1[idx], output2[idx])
- << "Mismatch at unit tests for av1_jnt_convolve_2d\n"
+ << "Mismatch at unit tests for av1_dist_wtd_convolve_2d\n"
<< out_w << "x" << out_h << " Pixel mismatch at index "
<< idx << " = (" << i << ", " << j
<< "), sub pixel offset = (" << suby << ", " << subx << ")";
@@ -247,8 +248,8 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
// Test different combination of fwd and bck offset weights
for (int k = 0; k < 2; ++k) {
for (int l = 0; l < 4; ++l) {
- conv_params1.use_jnt_comp_avg = 1;
- conv_params2.use_jnt_comp_avg = 1;
+ conv_params1.use_dist_wtd_comp_avg = 1;
+ conv_params2.use_dist_wtd_comp_avg = 1;
conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
@@ -259,10 +260,10 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
// Choose random locations within the source block
const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
- av1_jnt_convolve_2d_c(input + offset_r * w + offset_c, w,
- output8_1, MAX_SB_SIZE, out_w, out_h,
- filter_params_x, filter_params_y, subx,
- suby, &conv_params1);
+ av1_dist_wtd_convolve_2d_c(input + offset_r * w + offset_c, w,
+ output8_1, MAX_SB_SIZE, out_w, out_h,
+ filter_params_x, filter_params_y,
+ subx, suby, &conv_params1);
test_impl(input + offset_r * w + offset_c, w, output8_2,
MAX_SB_SIZE, out_w, out_h, filter_params_x,
filter_params_y, subx, suby, &conv_params2);
@@ -272,7 +273,7 @@ void AV1JntConvolve2DTest::RunCheckOutput(convolve_2d_func test_impl) {
int idx = i * MAX_SB_SIZE + j;
ASSERT_EQ(output1[idx], output2[idx])
<< "Mismatch at unit tests for "
- "av1_jnt_convolve_2d\n"
+ "av1_dist_wtd_convolve_2d\n"
<< out_w << "x" << out_h << " Pixel mismatch at index "
<< idx << " = (" << i << ", " << j
<< "), sub pixel offset = (" << suby << ", " << subx
@@ -333,7 +334,7 @@ void AV1JntConvolve2DTest::RunSpeedTest(convolve_2d_func test_impl) {
ConvolveParams conv_params =
get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, 8);
- conv_params.use_jnt_comp_avg = 0;
+ conv_params.use_dist_wtd_comp_avg = 0;
// Choose random locations within the source block
const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
@@ -540,8 +541,8 @@ void AV1HighbdJntConvolve2DTest::RunSpeedTest(
ConvolveParams conv_params =
get_conv_params_no_round(do_average, 0, output, MAX_SB_SIZE, 1, bd);
- // Test special case where jnt_comp_avg is not used
- conv_params.use_jnt_comp_avg = 0;
+ // Test special case where dist_wtd_comp_avg is not used
+ conv_params.use_dist_wtd_comp_avg = 0;
subx = 0;
suby = 0;
@@ -601,9 +602,9 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
ConvolveParams conv_params2 = get_conv_params_no_round(
do_average, 0, output2, MAX_SB_SIZE, 1, bd);
- // Test special case where jnt_comp_avg is not used
- conv_params1.use_jnt_comp_avg = 0;
- conv_params2.use_jnt_comp_avg = 0;
+ // Test special case where dist_wtd_comp_avg is not used
+ conv_params1.use_dist_wtd_comp_avg = 0;
+ conv_params2.use_dist_wtd_comp_avg = 0;
const int subx_range = has_subx ? 16 : 1;
const int suby_range = has_suby ? 16 : 1;
@@ -612,10 +613,10 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
// Choose random locations within the source block
const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
- av1_highbd_jnt_convolve_2d_c(input + offset_r * w + offset_c, w,
- output16_1, MAX_SB_SIZE, out_w, out_h,
- filter_params_x, filter_params_y, subx,
- suby, &conv_params1, bd);
+ av1_highbd_dist_wtd_convolve_2d_c(
+ input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE,
+ out_w, out_h, filter_params_x, filter_params_y, subx, suby,
+ &conv_params1, bd);
test_impl(input + offset_r * w + offset_c, w, output16_2,
MAX_SB_SIZE, out_w, out_h, filter_params_x,
filter_params_y, subx, suby, &conv_params2, bd);
@@ -648,8 +649,8 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
// Test different combination of fwd and bck offset weights
for (int k = 0; k < 2; ++k) {
for (int l = 0; l < 4; ++l) {
- conv_params1.use_jnt_comp_avg = 1;
- conv_params2.use_jnt_comp_avg = 1;
+ conv_params1.use_dist_wtd_comp_avg = 1;
+ conv_params2.use_dist_wtd_comp_avg = 1;
conv_params1.fwd_offset = quant_dist_lookup_table[k][l][0];
conv_params1.bck_offset = quant_dist_lookup_table[k][l][1];
conv_params2.fwd_offset = quant_dist_lookup_table[k][l][0];
@@ -662,7 +663,7 @@ void AV1HighbdJntConvolve2DTest::RunCheckOutput(
// Choose random locations within the source block
const int offset_r = 3 + rnd_.PseudoUniform(h - out_h - 7);
const int offset_c = 3 + rnd_.PseudoUniform(w - out_w - 7);
- av1_highbd_jnt_convolve_2d_c(
+ av1_highbd_dist_wtd_convolve_2d_c(
input + offset_r * w + offset_c, w, output16_1, MAX_SB_SIZE,
out_w, out_h, filter_params_x, filter_params_y, subx, suby,
&conv_params1, bd);
diff --git a/libaom/test/av1_convolve_scale_test.cc b/libaom/test/av1_convolve_scale_test.cc
index 1929c49..a933fc9 100644
--- a/libaom/test/av1_convolve_scale_test.cc
+++ b/libaom/test/av1_convolve_scale_test.cc
@@ -286,13 +286,13 @@ class ConvolveScaleTestBase : public ::testing::Test {
}
void SetConvParamOffset(int i, int j, int is_compound, int do_average,
- int use_jnt_comp_avg) {
+ int use_dist_wtd_comp_avg) {
if (i == -1 && j == -1) {
- convolve_params_.use_jnt_comp_avg = use_jnt_comp_avg;
+ convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg;
convolve_params_.is_compound = is_compound;
convolve_params_.do_average = do_average;
} else {
- convolve_params_.use_jnt_comp_avg = use_jnt_comp_avg;
+ convolve_params_.use_dist_wtd_comp_avg = use_dist_wtd_comp_avg;
convolve_params_.fwd_offset = quant_dist_lookup_table[i][j][0];
convolve_params_.bck_offset = quant_dist_lookup_table[i][j][1];
convolve_params_.is_compound = is_compound;
@@ -312,12 +312,12 @@ class ConvolveScaleTestBase : public ::testing::Test {
is_compound = 1;
for (int do_average = 0; do_average < 2; do_average++) {
- for (int use_jnt_comp_avg = 0; use_jnt_comp_avg < 2;
- use_jnt_comp_avg++) {
+ for (int use_dist_wtd_comp_avg = 0; use_dist_wtd_comp_avg < 2;
+ use_dist_wtd_comp_avg++) {
for (int j = 0; j < 2; ++j) {
for (int k = 0; k < 4; ++k) {
SetConvParamOffset(j, k, is_compound, do_average,
- use_jnt_comp_avg);
+ use_dist_wtd_comp_avg);
Prep(&rnd);
RunOne(true);
RunOne(false);
diff --git a/libaom/test/av1_fwd_txfm2d_test.cc b/libaom/test/av1_fwd_txfm2d_test.cc
index c1b97f7..eb09cb1 100644
--- a/libaom/test/av1_fwd_txfm2d_test.cc
+++ b/libaom/test/av1_fwd_txfm2d_test.cc
@@ -288,6 +288,68 @@ void AV1FwdTxfm2dMatchTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
}
}
+void AV1FwdTxfm2dSpeedTest(TX_SIZE tx_size, lowbd_fwd_txfm_func target_func) {
+ TxfmParam param;
+ memset(&param, 0, sizeof(param));
+ const int rows = tx_size_high[tx_size];
+ const int cols = tx_size_wide[tx_size];
+ const int num_loops = 1000000 / (rows * cols);
+
+ for (int i = 0; i < 2; ++i) {
+ const int bd = 8;
+ for (int tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+ if (libaom_test::IsTxSizeTypeValid(
+ tx_size, static_cast<TX_TYPE>(tx_type)) == false) {
+ continue;
+ }
+
+ FwdTxfm2dFunc ref_func = libaom_test::fwd_txfm_func_ls[tx_size];
+ if (ref_func != NULL) {
+ DECLARE_ALIGNED(32, int16_t, input[64 * 64]) = { 0 };
+ DECLARE_ALIGNED(32, int32_t, output[64 * 64]);
+ DECLARE_ALIGNED(32, int32_t, ref_output[64 * 64]);
+ int input_stride = 64;
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+ for (int r = 0; r < rows; ++r) {
+ for (int c = 0; c < cols; ++c) {
+ input[r * input_stride + c] = rnd.Rand16() % (1 << bd);
+ }
+ }
+
+ param.tx_type = (TX_TYPE)tx_type;
+ param.tx_size = (TX_SIZE)tx_size;
+ param.tx_set_type = EXT_TX_SET_ALL16;
+ param.bd = bd;
+
+ aom_usec_timer ref_timer, test_timer;
+
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < num_loops; ++i) {
+ ref_func(input, ref_output, input_stride, (TX_TYPE)tx_type, bd);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int i = 0; i < num_loops; ++i) {
+ target_func(input, output, input_stride, &param);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf(
+ "txfm_size[%d] \t txfm_type[%d] \t c_time=%d \t simd_time=%d \t "
+ "gain=%d \n",
+ tx_size, tx_type, elapsed_time_c, elapsed_time_simd,
+ (elapsed_time_c / elapsed_time_simd));
+ }
+ }
+ }
+}
+
typedef ::testing::tuple<TX_SIZE, lowbd_fwd_txfm_func> LbdFwdTxfm2dParam;
class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {};
@@ -295,7 +357,9 @@ class AV1FwdTxfm2dTest : public ::testing::TestWithParam<LbdFwdTxfm2dParam> {};
TEST_P(AV1FwdTxfm2dTest, match) {
AV1FwdTxfm2dMatchTest(GET_PARAM(0), GET_PARAM(1));
}
-
+TEST_P(AV1FwdTxfm2dTest, DISABLED_Speed) {
+ AV1FwdTxfm2dSpeedTest(GET_PARAM(0), GET_PARAM(1));
+}
using ::testing::Combine;
using ::testing::Values;
using ::testing::ValuesIn;
@@ -507,5 +571,12 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdFwdTxfm2dTest,
Combine(ValuesIn(Highbd_fwd_txfm_for_sse4_1),
Values(av1_highbd_fwd_txfm)));
#endif // HAVE_SSE4_1
+#if HAVE_AVX2
+static TX_SIZE Highbd_fwd_txfm_for_avx2[] = { TX_8X8, TX_16X16, TX_32X32,
+ TX_64X64, TX_8X16, TX_16X8 };
+INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdFwdTxfm2dTest,
+ Combine(ValuesIn(Highbd_fwd_txfm_for_avx2),
+ Values(av1_highbd_fwd_txfm)));
+#endif // HAVE_AVX2
} // namespace
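
The new speed test above is compiled but disabled by default, like the other DISABLED_ perf tests in this suite; an illustrative way to run it with standard gtest flags (not specific to this patch):

    ./test_libaom --gtest_filter='AV1FwdTxfm2dTest*' --gtest_also_run_disabled_tests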
diff --git a/libaom/test/av1_highbd_iht_test.cc b/libaom/test/av1_highbd_iht_test.cc
index 7f077b6..6d77cbf 100644
--- a/libaom/test/av1_highbd_iht_test.cc
+++ b/libaom/test/av1_highbd_iht_test.cc
@@ -308,7 +308,8 @@ INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdInvTxfm2d,
::testing::Values(av1_highbd_inv_txfm_add_sse4_1));
#endif
-#if HAVE_AVX2
+// TODO(http://crbug.com/aomedia/2350): these cause test vector mismatches.
+#if 0 // HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdInvTxfm2d,
::testing::Values(av1_highbd_inv_txfm_add_avx2));
#endif
diff --git a/libaom/test/av1_round_shift_array_test.cc b/libaom/test/av1_round_shift_array_test.cc
index 181a394..61dbed5 100644
--- a/libaom/test/av1_round_shift_array_test.cc
+++ b/libaom/test/av1_round_shift_array_test.cc
@@ -13,7 +13,7 @@
#include <stdio.h>
#include <stdlib.h>
-#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/aom_timer.h"
diff --git a/libaom/test/av1_txfm_test.h b/libaom/test/av1_txfm_test.h
index a181647..5a56d28 100644
--- a/libaom/test/av1_txfm_test.h
+++ b/libaom/test/av1_txfm_test.h
@@ -29,14 +29,14 @@
#include "av1/common/enums.h"
namespace libaom_test {
-typedef enum {
+enum {
TYPE_DCT = 0,
TYPE_ADST,
TYPE_IDTX,
TYPE_IDCT,
TYPE_IADST,
TYPE_LAST
-} TYPE_TXFM;
+} UENUM1BYTE(TYPE_TXFM);
int get_txfm1d_size(TX_SIZE tx_size);
diff --git a/libaom/test/comp_avg_pred_test.cc b/libaom/test/comp_avg_pred_test.cc
index 9c6ed90..3e5632e 100644
--- a/libaom/test/comp_avg_pred_test.cc
+++ b/libaom/test/comp_avg_pred_test.cc
@@ -12,61 +12,65 @@
#include "test/comp_avg_pred_test.h"
using libaom_test::ACMRandom;
-using libaom_test::AV1JNTCOMPAVG::AV1HighBDJNTCOMPAVGTest;
-using libaom_test::AV1JNTCOMPAVG::AV1HighBDJNTCOMPAVGUPSAMPLEDTest;
-using libaom_test::AV1JNTCOMPAVG::AV1JNTCOMPAVGTest;
-using libaom_test::AV1JNTCOMPAVG::AV1JNTCOMPAVGUPSAMPLEDTest;
+using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGTest;
+using libaom_test::AV1DISTWTDCOMPAVG::AV1DISTWTDCOMPAVGUPSAMPLEDTest;
+using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGTest;
+using libaom_test::AV1DISTWTDCOMPAVG::AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest;
using ::testing::make_tuple;
using ::testing::tuple;
namespace {
-TEST_P(AV1JNTCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+TEST_P(AV1DISTWTDCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
-TEST_P(AV1JNTCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
+TEST_P(AV1DISTWTDCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(0)); }
#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(
- SSSE3, AV1JNTCOMPAVGTest,
- libaom_test::AV1JNTCOMPAVG::BuildParams(aom_jnt_comp_avg_pred_ssse3));
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1DISTWTDCOMPAVGTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_dist_wtd_comp_avg_pred_ssse3));
#endif
-TEST_P(AV1JNTCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
+TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
RunSpeedTest(GET_PARAM(0));
}
-TEST_P(AV1JNTCOMPAVGUPSAMPLEDTest, CheckOutput) {
+TEST_P(AV1DISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) {
RunCheckOutput(GET_PARAM(0));
}
#if HAVE_SSSE3
-INSTANTIATE_TEST_CASE_P(SSSE3, AV1JNTCOMPAVGUPSAMPLEDTest,
- libaom_test::AV1JNTCOMPAVG::BuildParams(
- aom_jnt_comp_avg_upsampled_pred_ssse3));
+INSTANTIATE_TEST_CASE_P(SSSE3, AV1DISTWTDCOMPAVGUPSAMPLEDTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_dist_wtd_comp_avg_upsampled_pred_ssse3));
#endif
-TEST_P(AV1HighBDJNTCOMPAVGTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(1)); }
+TEST_P(AV1HighBDDISTWTDCOMPAVGTest, DISABLED_Speed) {
+ RunSpeedTest(GET_PARAM(1));
+}
-TEST_P(AV1HighBDJNTCOMPAVGTest, CheckOutput) { RunCheckOutput(GET_PARAM(1)); }
+TEST_P(AV1HighBDDISTWTDCOMPAVGTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(1));
+}
#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDJNTCOMPAVGTest,
- libaom_test::AV1JNTCOMPAVG::BuildParams(
- aom_highbd_jnt_comp_avg_pred_sse2, 1));
+INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDDISTWTDCOMPAVGTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_highbd_dist_wtd_comp_avg_pred_sse2, 1));
#endif
-TEST_P(AV1HighBDJNTCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
+TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, DISABLED_Speed) {
RunSpeedTest(GET_PARAM(1));
}
-TEST_P(AV1HighBDJNTCOMPAVGUPSAMPLEDTest, CheckOutput) {
+TEST_P(AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest, CheckOutput) {
RunCheckOutput(GET_PARAM(1));
}
#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDJNTCOMPAVGUPSAMPLEDTest,
- libaom_test::AV1JNTCOMPAVG::BuildParams(
- aom_highbd_jnt_comp_avg_upsampled_pred_sse2));
+INSTANTIATE_TEST_CASE_P(SSE2, AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest,
+ libaom_test::AV1DISTWTDCOMPAVG::BuildParams(
+ aom_highbd_dist_wtd_comp_avg_upsampled_pred_sse2));
#endif
} // namespace
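
A note on the rename sweeping through these tests: "jnt" (joint) and "dist_wtd" (distance-weighted) name the same AV1 compound-prediction tool. The per-pixel operation being exercised is, conceptually (a sketch assuming libaom's DIST_PRECISION_BITS == 4, so each fwd/bck weight pair in quant_dist_lookup_table sums to 16):

    // Conceptual distance-weighted compound average; illustrative only.
    static inline int dist_wtd_avg(int pred0, int pred1, int fwd_offset,
                                   int bck_offset) {
      // fwd_offset + bck_offset == 16; shift with round-to-nearest.
      return (pred0 * fwd_offset + pred1 * bck_offset + 8) >> 4;
    }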
diff --git a/libaom/test/comp_avg_pred_test.h b/libaom/test/comp_avg_pred_test.h
index 65a0153..01ea35d 100644
--- a/libaom/test/comp_avg_pred_test.h
+++ b/libaom/test/comp_avg_pred_test.h
@@ -25,72 +25,73 @@
namespace libaom_test {
const int kMaxSize = 128 + 32; // padding
-namespace AV1JNTCOMPAVG {
+namespace AV1DISTWTDCOMPAVG {
-typedef void (*jntcompavg_func)(uint8_t *comp_pred, const uint8_t *pred,
- int width, int height, const uint8_t *ref,
- int ref_stride,
- const JNT_COMP_PARAMS *jcp_param);
+typedef void (*distwtdcompavg_func)(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param);
-typedef void (*jntcompavgupsampled_func)(
+typedef void (*distwtdcompavgupsampled_func)(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search);
+ int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param, int subpel_search);
-typedef void (*highbdjntcompavgupsampled_func)(
+typedef void (*highbddistwtdcompavgupsampled_func)(
MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
+ int ref_stride, int bd, const DIST_WTD_COMP_PARAMS *jcp_param,
int subpel_search);
-typedef ::testing::tuple<jntcompavg_func, BLOCK_SIZE> JNTCOMPAVGParam;
+typedef ::testing::tuple<distwtdcompavg_func, BLOCK_SIZE> DISTWTDCOMPAVGParam;
-typedef ::testing::tuple<jntcompavgupsampled_func, BLOCK_SIZE>
- JNTCOMPAVGUPSAMPLEDParam;
+typedef ::testing::tuple<distwtdcompavgupsampled_func, BLOCK_SIZE>
+ DISTWTDCOMPAVGUPSAMPLEDParam;
-typedef ::testing::tuple<int, jntcompavg_func, BLOCK_SIZE>
- HighbdJNTCOMPAVGParam;
+typedef ::testing::tuple<int, distwtdcompavg_func, BLOCK_SIZE>
+ HighbdDISTWTDCOMPAVGParam;
-typedef ::testing::tuple<int, highbdjntcompavgupsampled_func, BLOCK_SIZE>
- HighbdJNTCOMPAVGUPSAMPLEDParam;
+typedef ::testing::tuple<int, highbddistwtdcompavgupsampled_func, BLOCK_SIZE>
+ HighbdDISTWTDCOMPAVGUPSAMPLEDParam;
-::testing::internal::ParamGenerator<JNTCOMPAVGParam> BuildParams(
- jntcompavg_func filter) {
+::testing::internal::ParamGenerator<DISTWTDCOMPAVGParam> BuildParams(
+ distwtdcompavg_func filter) {
return ::testing::Combine(::testing::Values(filter),
::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
}
-::testing::internal::ParamGenerator<JNTCOMPAVGUPSAMPLEDParam> BuildParams(
- jntcompavgupsampled_func filter) {
+::testing::internal::ParamGenerator<DISTWTDCOMPAVGUPSAMPLEDParam> BuildParams(
+ distwtdcompavgupsampled_func filter) {
return ::testing::Combine(::testing::Values(filter),
::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
}
-::testing::internal::ParamGenerator<HighbdJNTCOMPAVGParam> BuildParams(
- jntcompavg_func filter, int is_hbd) {
+::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGParam> BuildParams(
+ distwtdcompavg_func filter, int is_hbd) {
(void)is_hbd;
return ::testing::Combine(::testing::Range(8, 13, 2),
::testing::Values(filter),
::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
}
-::testing::internal::ParamGenerator<HighbdJNTCOMPAVGUPSAMPLEDParam> BuildParams(
- highbdjntcompavgupsampled_func filter) {
+::testing::internal::ParamGenerator<HighbdDISTWTDCOMPAVGUPSAMPLEDParam>
+BuildParams(highbddistwtdcompavgupsampled_func filter) {
return ::testing::Combine(::testing::Range(8, 13, 2),
::testing::Values(filter),
::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
}
-class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> {
+class AV1DISTWTDCOMPAVGTest
+ : public ::testing::TestWithParam<DISTWTDCOMPAVGParam> {
public:
- ~AV1JNTCOMPAVGTest() {}
+ ~AV1DISTWTDCOMPAVGTest() {}
void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
void TearDown() { libaom_test::ClearSystemState(); }
protected:
- void RunCheckOutput(jntcompavg_func test_impl) {
+ void RunCheckOutput(distwtdcompavg_func test_impl) {
const int w = kMaxSize, h = kMaxSize;
const int block_idx = GET_PARAM(1);
@@ -107,27 +108,27 @@ class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> {
const int in_w = block_size_wide[block_idx];
const int in_h = block_size_high[block_idx];
- JNT_COMP_PARAMS jnt_comp_params;
- jnt_comp_params.use_jnt_comp_avg = 1;
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
for (int ii = 0; ii < 2; ii++) {
for (int jj = 0; jj < 4; jj++) {
- jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
- jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+ dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
- aom_jnt_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c, in_w,
- in_h, ref8 + offset_r * w + offset_c, in_w,
- &jnt_comp_params);
+ aom_dist_wtd_comp_avg_pred_c(output, pred8 + offset_r * w + offset_c,
+ in_w, in_h, ref8 + offset_r * w + offset_c,
+ in_w, &dist_wtd_comp_params);
test_impl(output2, pred8 + offset_r * w + offset_c, in_w, in_h,
- ref8 + offset_r * w + offset_c, in_w, &jnt_comp_params);
+ ref8 + offset_r * w + offset_c, in_w, &dist_wtd_comp_params);
for (int i = 0; i < in_h; ++i) {
for (int j = 0; j < in_w; ++j) {
int idx = i * in_w + j;
ASSERT_EQ(output[idx], output2[idx])
- << "Mismatch at unit tests for AV1JNTCOMPAVGTest\n"
+ << "Mismatch at unit tests for AV1DISTWTDCOMPAVGTest\n"
<< in_w << "x" << in_h << " Pixel mismatch at index " << idx
<< " = (" << i << ", " << j << ")";
}
@@ -135,7 +136,7 @@ class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> {
}
}
}
- void RunSpeedTest(jntcompavg_func test_impl) {
+ void RunSpeedTest(distwtdcompavg_func test_impl) {
const int w = kMaxSize, h = kMaxSize;
const int block_idx = GET_PARAM(1);
@@ -152,49 +153,49 @@ class AV1JNTCOMPAVGTest : public ::testing::TestWithParam<JNTCOMPAVGParam> {
const int in_w = block_size_wide[block_idx];
const int in_h = block_size_high[block_idx];
- JNT_COMP_PARAMS jnt_comp_params;
- jnt_comp_params.use_jnt_comp_avg = 1;
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
- jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
- jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+ dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+ dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
const int num_loops = 1000000000 / (in_w + in_h);
aom_usec_timer timer;
aom_usec_timer_start(&timer);
for (int i = 0; i < num_loops; ++i)
- aom_jnt_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w,
- &jnt_comp_params);
+ aom_dist_wtd_comp_avg_pred_c(output, pred8, in_w, in_h, ref8, in_w,
+ &dist_wtd_comp_params);
aom_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- printf("jntcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ printf("distwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
1000.0 * elapsed_time / num_loops);
aom_usec_timer timer1;
aom_usec_timer_start(&timer1);
for (int i = 0; i < num_loops; ++i)
- test_impl(output2, pred8, in_w, in_h, ref8, in_w, &jnt_comp_params);
+ test_impl(output2, pred8, in_w, in_h, ref8, in_w, &dist_wtd_comp_params);
aom_usec_timer_mark(&timer1);
const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
- printf("jntcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ printf("distwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
1000.0 * elapsed_time1 / num_loops);
}
libaom_test::ACMRandom rnd_;
-}; // class AV1JNTCOMPAVGTest
+}; // class AV1DISTWTDCOMPAVGTest
-class AV1JNTCOMPAVGUPSAMPLEDTest
- : public ::testing::TestWithParam<JNTCOMPAVGUPSAMPLEDParam> {
+class AV1DISTWTDCOMPAVGUPSAMPLEDTest
+ : public ::testing::TestWithParam<DISTWTDCOMPAVGUPSAMPLEDParam> {
public:
- ~AV1JNTCOMPAVGUPSAMPLEDTest() {}
+ ~AV1DISTWTDCOMPAVGUPSAMPLEDTest() {}
void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
void TearDown() { libaom_test::ClearSystemState(); }
protected:
- void RunCheckOutput(jntcompavgupsampled_func test_impl) {
+ void RunCheckOutput(distwtdcompavgupsampled_func test_impl) {
const int w = kMaxSize, h = kMaxSize;
const int block_idx = GET_PARAM(1);
@@ -211,8 +212,8 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
const int in_w = block_size_wide[block_idx];
const int in_h = block_size_high[block_idx];
- JNT_COMP_PARAMS jnt_comp_params;
- jnt_comp_params.use_jnt_comp_avg = 1;
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
int sub_x_q3, sub_y_q3;
int subpel_search;
for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
@@ -221,28 +222,30 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
for (int ii = 0; ii < 2; ii++) {
for (int jj = 0; jj < 4; jj++) {
- jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
- jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+ dist_wtd_comp_params.fwd_offset =
+ quant_dist_lookup_table[ii][jj][0];
+ dist_wtd_comp_params.bck_offset =
+ quant_dist_lookup_table[ii][jj][1];
const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
- aom_jnt_comp_avg_upsampled_pred_c(
+ aom_dist_wtd_comp_avg_upsampled_pred_c(
NULL, NULL, 0, 0, NULL, output,
pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
- &jnt_comp_params, subpel_search);
+ &dist_wtd_comp_params, subpel_search);
test_impl(NULL, NULL, 0, 0, NULL, output2,
pred8 + offset_r * w + offset_c, in_w, in_h, sub_x_q3,
sub_y_q3, ref8 + offset_r * w + offset_c, in_w,
- &jnt_comp_params, subpel_search);
+ &dist_wtd_comp_params, subpel_search);
for (int i = 0; i < in_h; ++i) {
for (int j = 0; j < in_w; ++j) {
int idx = i * in_w + j;
ASSERT_EQ(output[idx], output2[idx])
<< "Mismatch at unit tests for "
- "AV1JNTCOMPAVGUPSAMPLEDTest\n"
+ "AV1DISTWTDCOMPAVGUPSAMPLEDTest\n"
<< in_w << "x" << in_h << " Pixel mismatch at index "
<< idx << " = (" << i << ", " << j
<< "), sub pixel offset = (" << sub_y_q3 << ", "
@@ -255,7 +258,7 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
}
}
}
- void RunSpeedTest(jntcompavgupsampled_func test_impl) {
+ void RunSpeedTest(distwtdcompavgupsampled_func test_impl) {
const int w = kMaxSize, h = kMaxSize;
const int block_idx = GET_PARAM(1);
@@ -272,11 +275,11 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
const int in_w = block_size_wide[block_idx];
const int in_h = block_size_high[block_idx];
- JNT_COMP_PARAMS jnt_comp_params;
- jnt_comp_params.use_jnt_comp_avg = 1;
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
- jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
- jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+ dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+ dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
int sub_x_q3 = 0;
int sub_y_q3 = 0;
@@ -287,13 +290,13 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter.
for (int i = 0; i < num_loops; ++i)
- aom_jnt_comp_avg_upsampled_pred_c(NULL, NULL, 0, 0, NULL, output, pred8,
- in_w, in_h, sub_x_q3, sub_y_q3, ref8,
- in_w, &jnt_comp_params, subpel_search);
+ aom_dist_wtd_comp_avg_upsampled_pred_c(
+ NULL, NULL, 0, 0, NULL, output, pred8, in_w, in_h, sub_x_q3, sub_y_q3,
+ ref8, in_w, &dist_wtd_comp_params, subpel_search);
aom_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- printf("jntcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ printf("distwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
1000.0 * elapsed_time / num_loops);
aom_usec_timer timer1;
@@ -301,27 +304,27 @@ class AV1JNTCOMPAVGUPSAMPLEDTest
for (int i = 0; i < num_loops; ++i)
test_impl(NULL, NULL, 0, 0, NULL, output2, pred8, in_w, in_h, sub_x_q3,
- sub_y_q3, ref8, in_w, &jnt_comp_params, subpel_search);
+ sub_y_q3, ref8, in_w, &dist_wtd_comp_params, subpel_search);
aom_usec_timer_mark(&timer1);
const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
- printf("jntcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ printf("distwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
1000.0 * elapsed_time1 / num_loops);
}
libaom_test::ACMRandom rnd_;
-}; // class AV1JNTCOMPAVGUPSAMPLEDTest
+}; // class AV1DISTWTDCOMPAVGUPSAMPLEDTest
-class AV1HighBDJNTCOMPAVGTest
- : public ::testing::TestWithParam<HighbdJNTCOMPAVGParam> {
+class AV1HighBDDISTWTDCOMPAVGTest
+ : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGParam> {
public:
- ~AV1HighBDJNTCOMPAVGTest() {}
+ ~AV1HighBDDISTWTDCOMPAVGTest() {}
void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
void TearDown() { libaom_test::ClearSystemState(); }
protected:
- void RunCheckOutput(jntcompavg_func test_impl) {
+ void RunCheckOutput(distwtdcompavg_func test_impl) {
const int w = kMaxSize, h = kMaxSize;
const int block_idx = GET_PARAM(2);
const int bd = GET_PARAM(0);
@@ -338,31 +341,31 @@ class AV1HighBDJNTCOMPAVGTest
const int in_w = block_size_wide[block_idx];
const int in_h = block_size_high[block_idx];
- JNT_COMP_PARAMS jnt_comp_params;
- jnt_comp_params.use_jnt_comp_avg = 1;
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
for (int ii = 0; ii < 2; ii++) {
for (int jj = 0; jj < 4; jj++) {
- jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
- jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+ dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
+ dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
- aom_highbd_jnt_comp_avg_pred_c(
+ aom_highbd_dist_wtd_comp_avg_pred_c(
CONVERT_TO_BYTEPTR(output),
CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w, in_h,
CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w,
- &jnt_comp_params);
+ &dist_wtd_comp_params);
test_impl(CONVERT_TO_BYTEPTR(output2),
CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
in_h, CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c,
- in_w, &jnt_comp_params);
+ in_w, &dist_wtd_comp_params);
for (int i = 0; i < in_h; ++i) {
for (int j = 0; j < in_w; ++j) {
int idx = i * in_w + j;
ASSERT_EQ(output[idx], output2[idx])
- << "Mismatch at unit tests for AV1HighBDJNTCOMPAVGTest\n"
+ << "Mismatch at unit tests for AV1HighBDDISTWTDCOMPAVGTest\n"
<< in_w << "x" << in_h << " Pixel mismatch at index " << idx
<< " = (" << i << ", " << j << ")";
}
@@ -370,7 +373,7 @@ class AV1HighBDJNTCOMPAVGTest
}
}
}
- void RunSpeedTest(jntcompavg_func test_impl) {
+ void RunSpeedTest(distwtdcompavg_func test_impl) {
const int w = kMaxSize, h = kMaxSize;
const int block_idx = GET_PARAM(2);
const int bd = GET_PARAM(0);
@@ -387,24 +390,24 @@ class AV1HighBDJNTCOMPAVGTest
const int in_w = block_size_wide[block_idx];
const int in_h = block_size_high[block_idx];
- JNT_COMP_PARAMS jnt_comp_params;
- jnt_comp_params.use_jnt_comp_avg = 1;
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
- jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
- jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+ dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+ dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
const int num_loops = 1000000000 / (in_w + in_h);
aom_usec_timer timer;
aom_usec_timer_start(&timer);
for (int i = 0; i < num_loops; ++i)
- aom_highbd_jnt_comp_avg_pred_c(
+ aom_highbd_dist_wtd_comp_avg_pred_c(
CONVERT_TO_BYTEPTR(output), CONVERT_TO_BYTEPTR(pred8), in_w, in_h,
- CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params);
+ CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params);
aom_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- printf("highbdjntcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ printf("highbddistwtdcompavg c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
1000.0 * elapsed_time / num_loops);
aom_usec_timer timer1;
@@ -412,26 +415,26 @@ class AV1HighBDJNTCOMPAVGTest
for (int i = 0; i < num_loops; ++i)
test_impl(CONVERT_TO_BYTEPTR(output2), CONVERT_TO_BYTEPTR(pred8), in_w,
- in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &jnt_comp_params);
+ in_h, CONVERT_TO_BYTEPTR(ref8), in_w, &dist_wtd_comp_params);
aom_usec_timer_mark(&timer1);
const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
- printf("highbdjntcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
+ printf("highbddistwtdcompavg test_code %3dx%-3d: %7.2f us\n", in_w, in_h,
1000.0 * elapsed_time1 / num_loops);
}
libaom_test::ACMRandom rnd_;
-}; // class AV1HighBDJNTCOMPAVGTest
+}; // class AV1HighBDDISTWTDCOMPAVGTest
-class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
- : public ::testing::TestWithParam<HighbdJNTCOMPAVGUPSAMPLEDParam> {
+class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
+ : public ::testing::TestWithParam<HighbdDISTWTDCOMPAVGUPSAMPLEDParam> {
public:
- ~AV1HighBDJNTCOMPAVGUPSAMPLEDTest() {}
+ ~AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest() {}
void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
void TearDown() { libaom_test::ClearSystemState(); }
protected:
- void RunCheckOutput(highbdjntcompavgupsampled_func test_impl) {
+ void RunCheckOutput(highbddistwtdcompavgupsampled_func test_impl) {
const int w = kMaxSize, h = kMaxSize;
const int block_idx = GET_PARAM(2);
const int bd = GET_PARAM(0);
@@ -448,8 +451,8 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
const int in_w = block_size_wide[block_idx];
const int in_h = block_size_high[block_idx];
- JNT_COMP_PARAMS jnt_comp_params;
- jnt_comp_params.use_jnt_comp_avg = 1;
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
int sub_x_q3, sub_y_q3;
int subpel_search;
for (subpel_search = USE_4_TAPS; subpel_search <= USE_8_TAPS;
@@ -458,30 +461,32 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
for (sub_y_q3 = 0; sub_y_q3 < 8; ++sub_y_q3) {
for (int ii = 0; ii < 2; ii++) {
for (int jj = 0; jj < 4; jj++) {
- jnt_comp_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
- jnt_comp_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
+ dist_wtd_comp_params.fwd_offset =
+ quant_dist_lookup_table[ii][jj][0];
+ dist_wtd_comp_params.bck_offset =
+ quant_dist_lookup_table[ii][jj][1];
const int offset_r = 3 + rnd_.PseudoUniform(h - in_h - 7);
const int offset_c = 3 + rnd_.PseudoUniform(w - in_w - 7);
- aom_highbd_jnt_comp_avg_upsampled_pred_c(
+ aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output),
CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c, in_w,
in_h, sub_x_q3, sub_y_q3,
CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c, in_w, bd,
- &jnt_comp_params, subpel_search);
+ &dist_wtd_comp_params, subpel_search);
test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2),
CONVERT_TO_BYTEPTR(pred8) + offset_r * w + offset_c,
in_w, in_h, sub_x_q3, sub_y_q3,
CONVERT_TO_BYTEPTR(ref8) + offset_r * w + offset_c,
- in_w, bd, &jnt_comp_params, subpel_search);
+ in_w, bd, &dist_wtd_comp_params, subpel_search);
for (int i = 0; i < in_h; ++i) {
for (int j = 0; j < in_w; ++j) {
int idx = i * in_w + j;
ASSERT_EQ(output[idx], output2[idx])
<< "Mismatch at unit tests for "
- "AV1HighBDJNTCOMPAVGUPSAMPLEDTest\n"
+ "AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest\n"
<< in_w << "x" << in_h << " Pixel mismatch at index "
<< idx << " = (" << i << ", " << j
<< "), sub pixel offset = (" << sub_y_q3 << ", "
@@ -494,7 +499,7 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
}
}
}
- void RunSpeedTest(highbdjntcompavgupsampled_func test_impl) {
+ void RunSpeedTest(highbddistwtdcompavgupsampled_func test_impl) {
const int w = kMaxSize, h = kMaxSize;
const int block_idx = GET_PARAM(2);
const int bd = GET_PARAM(0);
@@ -511,11 +516,11 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
const int in_w = block_size_wide[block_idx];
const int in_h = block_size_high[block_idx];
- JNT_COMP_PARAMS jnt_comp_params;
- jnt_comp_params.use_jnt_comp_avg = 1;
+ DIST_WTD_COMP_PARAMS dist_wtd_comp_params;
+ dist_wtd_comp_params.use_dist_wtd_comp_avg = 1;
- jnt_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
- jnt_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
+ dist_wtd_comp_params.fwd_offset = quant_dist_lookup_table[0][0][0];
+ dist_wtd_comp_params.bck_offset = quant_dist_lookup_table[0][0][1];
int sub_x_q3 = 0;
int sub_y_q3 = 0;
const int num_loops = 1000000000 / (in_w + in_h);
@@ -523,15 +528,16 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
aom_usec_timer_start(&timer);
int subpel_search = USE_8_TAPS; // set to USE_4_TAPS to test 4-tap filter.
for (int i = 0; i < num_loops; ++i)
- aom_highbd_jnt_comp_avg_upsampled_pred_c(
+ aom_highbd_dist_wtd_comp_avg_upsampled_pred_c(
NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output),
CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3,
- CONVERT_TO_BYTEPTR(ref8), in_w, bd, &jnt_comp_params, subpel_search);
+ CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params,
+ subpel_search);
aom_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- printf("highbdjntcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w, in_h,
- 1000.0 * elapsed_time / num_loops);
+ printf("highbddistwtdcompavgupsampled c_code %3dx%-3d: %7.2f us\n", in_w,
+ in_h, 1000.0 * elapsed_time / num_loops);
aom_usec_timer timer1;
aom_usec_timer_start(&timer1);
@@ -539,19 +545,19 @@ class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
for (int i = 0; i < num_loops; ++i)
test_impl(NULL, NULL, 0, 0, NULL, CONVERT_TO_BYTEPTR(output2),
CONVERT_TO_BYTEPTR(pred8), in_w, in_h, sub_x_q3, sub_y_q3,
- CONVERT_TO_BYTEPTR(ref8), in_w, bd, &jnt_comp_params,
+ CONVERT_TO_BYTEPTR(ref8), in_w, bd, &dist_wtd_comp_params,
subpel_search);
aom_usec_timer_mark(&timer1);
const int elapsed_time1 = static_cast<int>(aom_usec_timer_elapsed(&timer1));
- printf("highbdjntcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w,
+ printf("highbddistwtdcompavgupsampled test_code %3dx%-3d: %7.2f us\n", in_w,
in_h, 1000.0 * elapsed_time1 / num_loops);
}
libaom_test::ACMRandom rnd_;
-}; // class AV1HighBDJNTCOMPAVGUPSAMPLEDTest
+}; // class AV1HighBDDISTWTDCOMPAVGUPSAMPLEDTest
-} // namespace AV1JNTCOMPAVG
+} // namespace AV1DISTWTDCOMPAVG
} // namespace libaom_test
#endif // AOM_TEST_COMP_AVG_PRED_TEST_H_
diff --git a/libaom/test/corner_match_test.cc b/libaom/test/corner_match_test.cc
index 58e3139..af2baa7 100644
--- a/libaom/test/corner_match_test.cc
+++ b/libaom/test/corner_match_test.cc
@@ -24,9 +24,13 @@ namespace AV1CornerMatch {
using libaom_test::ACMRandom;
+typedef double (*ComputeCrossCorrFunc)(unsigned char *im1, int stride1, int x1,
+ int y1, unsigned char *im2, int stride2,
+ int x2, int y2);
+
using ::testing::make_tuple;
using ::testing::tuple;
-typedef tuple<int> CornerMatchParam;
+typedef tuple<int, ComputeCrossCorrFunc> CornerMatchParam;
class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
public:
@@ -36,19 +40,24 @@ class AV1CornerMatchTest : public ::testing::TestWithParam<CornerMatchParam> {
virtual void TearDown();
protected:
- void RunCheckOutput();
+ void RunCheckOutput(int run_times);
+ ComputeCrossCorrFunc target_func;
libaom_test::ACMRandom rnd_;
};
AV1CornerMatchTest::~AV1CornerMatchTest() {}
-void AV1CornerMatchTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); }
+void AV1CornerMatchTest::SetUp() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ target_func = GET_PARAM(1);
+}
void AV1CornerMatchTest::TearDown() { libaom_test::ClearSystemState(); }
-void AV1CornerMatchTest::RunCheckOutput() {
+void AV1CornerMatchTest::RunCheckOutput(int run_times) {
const int w = 128, h = 128;
const int num_iters = 10000;
int i, j;
+ aom_usec_timer ref_timer, test_timer;
uint8_t *input1 = new uint8_t[w * h];
uint8_t *input2 = new uint8_t[w * h];
@@ -80,21 +89,54 @@ void AV1CornerMatchTest::RunCheckOutput() {
double res_c =
compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
- double res_sse4 =
- compute_cross_correlation_sse4_1(input1, w, x1, y1, input2, w, x2, y2);
+ double res_simd = target_func(input1, w, x1, y1, input2, w, x2, y2);
- ASSERT_EQ(res_sse4, res_c);
- }
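+    // With run_times > 1 this doubles as a speed test: the C reference and
+    // the SIMD candidate are timed instead of compared for bit-exactness.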
+ if (run_times > 1) {
+ aom_usec_timer_start(&ref_timer);
+ for (j = 0; j < run_times; j++) {
+ compute_cross_correlation_c(input1, w, x1, y1, input2, w, x2, y2);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+ aom_usec_timer_start(&test_timer);
+ for (j = 0; j < run_times; j++) {
+ target_func(input1, w, x1, y1, input2, w, x2, y2);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf(
+ "c_time=%d \t simd_time=%d \t "
+ "gain=%d\n",
+ elapsed_time_c, elapsed_time_simd,
+ (elapsed_time_c / elapsed_time_simd));
+ } else {
+ ASSERT_EQ(res_simd, res_c);
+ }
+ }
delete[] input1;
delete[] input2;
}
-TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(); }
-
-INSTANTIATE_TEST_CASE_P(SSE4_1, AV1CornerMatchTest,
- ::testing::Values(make_tuple(0), make_tuple(1)));
-
+TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(1); }
+TEST_P(AV1CornerMatchTest, DISABLED_Speed) { RunCheckOutput(100000); }
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AV1CornerMatchTest,
+ ::testing::Values(make_tuple(0, compute_cross_correlation_sse4_1),
+ make_tuple(1, compute_cross_correlation_sse4_1)));
+#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, AV1CornerMatchTest,
+ ::testing::Values(make_tuple(0, compute_cross_correlation_avx2),
+ make_tuple(1, compute_cross_correlation_avx2)));
+#endif
} // namespace AV1CornerMatch
} // namespace test_libaom
diff --git a/libaom/test/dr_prediction_test.cc b/libaom/test/dr_prediction_test.cc
index a64d39b..4be8489 100644
--- a/libaom/test/dr_prediction_test.cc
+++ b/libaom/test/dr_prediction_test.cc
@@ -59,7 +59,9 @@ typedef void (*Z1_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
template <Z1_Lbd fn>
void z1_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
const uint8_t *above, const uint8_t *left, int upsample_above,
- int /*upsample_left*/, int dx, int dy, int /*bd*/) {
+ int upsample_left, int dx, int dy, int bd) {
+ (void)bd;
+ (void)upsample_left;
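+  // The wrappers share one signature so the lowbd and highbd prediction
+  // functions can run through the same test harness; the casts above just
+  // silence unused-parameter warnings.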
fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy);
}
@@ -69,7 +71,9 @@ typedef void (*Z2_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
template <Z2_Lbd fn>
void z2_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
const uint8_t *above, const uint8_t *left, int upsample_above,
- int upsample_left, int dx, int dy, int /*bd*/) {
+ int upsample_left, int dx, int dy, int bd) {
+ (void)bd;
+ (void)upsample_left;
fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy);
}
@@ -78,9 +82,10 @@ typedef void (*Z3_Lbd)(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
int upsample_left, int dx, int dy);
template <Z3_Lbd fn>
void z3_wrapper(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
- const uint8_t *above, const uint8_t *left,
- int /*upsample_above*/, int upsample_left, int dx, int dy,
- int /*bd*/) {
+ const uint8_t *above, const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy, int bd) {
+ (void)bd;
+ (void)upsample_above;
fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy);
}
@@ -90,8 +95,10 @@ typedef void (*Z1_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
template <Z1_Hbd fn>
void z1_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
const uint16_t *above, const uint16_t *left,
- int upsample_above, int /*upsample_left*/, int dx, int dy,
+ int upsample_above, int upsample_left, int dx, int dy,
int bd) {
+ (void)bd;
+ (void)upsample_left;
fn(dst, stride, bw, bh, above, left, upsample_above, dx, dy, bd);
}
@@ -104,6 +111,7 @@ void z2_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
const uint16_t *above, const uint16_t *left,
int upsample_above, int upsample_left, int dx, int dy,
int bd) {
+ (void)bd;
fn(dst, stride, bw, bh, above, left, upsample_above, upsample_left, dx, dy,
bd);
}
@@ -114,8 +122,10 @@ typedef void (*Z3_Hbd)(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
template <Z3_Hbd fn>
void z3_wrapper_hbd(uint16_t *dst, ptrdiff_t stride, int bw, int bh,
const uint16_t *above, const uint16_t *left,
- int /*upsample_above*/, int upsample_left, int dx, int dy,
+ int upsample_above, int upsample_left, int dx, int dy,
int bd) {
+ (void)bd;
+ (void)upsample_above;
fn(dst, stride, bw, bh, above, left, upsample_left, dx, dy, bd);
}
@@ -135,7 +145,7 @@ struct DrPredFunc {
template <typename Pixel, typename FuncType>
class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
protected:
- static const int kMaxNumTests = 100000;
+ static const int kMaxNumTests = 10000;
static const int kIterations = 10;
static const int kDstStride = 64;
static const int kDstSize = kDstStride * kDstStride;
@@ -171,6 +181,9 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
void Predict(bool speedtest, int tx) {
const int kNumTests = speedtest ? kMaxNumTests : 1;
aom_usec_timer timer;
+ int tst_time = 0;
+
+ bd_ = params_.bit_depth;
aom_usec_timer_start(&timer);
for (int k = 0; k < kNumTests; ++k) {
@@ -180,25 +193,27 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
aom_usec_timer_mark(&timer);
const int ref_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- aom_usec_timer_start(&timer);
if (params_.tst_fn) {
+ aom_usec_timer_start(&timer);
for (int k = 0; k < kNumTests; ++k) {
ASM_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_,
above_, left_, upsample_above_,
upsample_left_, dx_, dy_, bd_));
}
+ aom_usec_timer_mark(&timer);
+ tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
} else {
for (int i = 0; i < kDstSize; ++i) {
dst_ref_[i] = dst_tst_[i];
}
}
- aom_usec_timer_mark(&timer);
- const int tst_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
OutputTimes(kNumTests, ref_time, tst_time, tx);
}
void RunTest(bool speedtest, bool needsaturation, int p_angle) {
+ bd_ = params_.bit_depth;
+
if (needsaturation) {
for (int i = 0; i < kBufSize; ++i) {
above_data_[i] = left_data_[i] = (1 << bd_) - 1;
@@ -290,8 +305,7 @@ class DrPredTest : public ::testing::TestWithParam<DrPredFunc<FuncType> > {
class LowbdDrPredTest : public DrPredTest<uint8_t, DrPred> {};
TEST_P(LowbdDrPredTest, SaturatedValues) {
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
- enable_upsample_ = iter & 1;
+ for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
for (int angle = start_angle_; angle < stop_angle_; ++angle) {
dx_ = av1_get_dx(angle);
dy_ = av1_get_dy(angle);
@@ -300,20 +314,6 @@ TEST_P(LowbdDrPredTest, SaturatedValues) {
}
}
-TEST_P(LowbdDrPredTest, DISABLED_Speed) {
- const int angles[] = { 3, 45, 87 };
- for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
- for (int i = 0; i < 3; ++i) {
- const int angle = angles[i] + start_angle_;
- dx_ = av1_get_dx(angle);
- dy_ = av1_get_dy(angle);
- printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
- enable_upsample_, angle);
- if (dx_ && dy_) RunTest(true, false, angle);
- }
- }
-}
-
using ::testing::make_tuple;
INSTANTIATE_TEST_CASE_P(
@@ -328,8 +328,7 @@ INSTANTIATE_TEST_CASE_P(
class HighbdDrPredTest : public DrPredTest<uint16_t, DrPred_Hbd> {};
TEST_P(HighbdDrPredTest, SaturatedValues) {
- for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
- enable_upsample_ = iter & 1;
+ for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
for (int angle = start_angle_; angle < stop_angle_; ++angle) {
dx_ = av1_get_dx(angle);
dy_ = av1_get_dy(angle);
@@ -362,6 +361,46 @@ INSTANTIATE_TEST_CASE_P(
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(
+ AVX2, LowbdDrPredTest,
+ ::testing::Values(DrPredFunc<DrPred>(&z1_wrapper<av1_dr_prediction_z1_c>,
+ &z1_wrapper<av1_dr_prediction_z1_avx2>,
+ AOM_BITS_8, kZ1Start),
+ /* TODO(niva213@gmail.com): Re-enable this test after
+ fixing valgrind issue: https://crbug.com/aomedia/2316
+ DrPredFunc<DrPred>(&z2_wrapper<av1_dr_prediction_z2_c>,
+ &z2_wrapper<av1_dr_prediction_z2_avx2>,
+ AOM_BITS_8, kZ2Start), */
+ DrPredFunc<DrPred>(&z3_wrapper<av1_dr_prediction_z3_c>,
+ &z3_wrapper<av1_dr_prediction_z3_avx2>,
+ AOM_BITS_8, kZ3Start)));
+
+TEST_P(LowbdDrPredTest, DISABLED_Speed) {
+ const int angles[] = { 3, 45, 87 };
+ for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+ for (int i = 0; i < 3; ++i) {
+ const int angle = angles[i] + start_angle_;
+ dx_ = av1_get_dx(angle);
+ dy_ = av1_get_dy(angle);
+ printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n",
+ enable_upsample_, angle);
+ if (dx_ && dy_) RunTest(true, false, angle);
+ }
+ }
+}
+
+TEST_P(LowbdDrPredTest, OperationCheck) {
+ if (params_.tst_fn == NULL) return;
+ // const int angles[] = { 3, 45, 81, 87, 93, 100, 145, 187, 199, 260 };
+ for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) {
+ for (int angle = start_angle_; angle < stop_angle_; ++angle) {
+ dx_ = av1_get_dx(angle);
+ dy_ = av1_get_dy(angle);
+ if (dx_ && dy_) RunTest(false, false, angle);
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
AVX2, HighbdDrPredTest,
::testing::Values(DrPredFunc<DrPred_Hbd>(
&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
@@ -375,7 +414,9 @@ INSTANTIATE_TEST_CASE_P(
&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_c>,
&z1_wrapper_hbd<av1_highbd_dr_prediction_z1_avx2>,
AOM_BITS_12, kZ1Start),
- /*DrPredFunc<DrPred_Hbd>(
+ /* TODO(niva213@gmail.com): Re-enable these tests after
+ fixing valgrind issue: https://crbug.com/aomedia/2316
+ DrPredFunc<DrPred_Hbd>(
&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_c>,
&z2_wrapper_hbd<av1_highbd_dr_prediction_z2_avx2>,
AOM_BITS_8, kZ2Start),
diff --git a/libaom/test/edge_detect_test.cc b/libaom/test/edge_detect_test.cc
index 47466cb..77a731f 100644
--- a/libaom/test/edge_detect_test.cc
+++ b/libaom/test/edge_detect_test.cc
@@ -185,8 +185,9 @@ TEST_P(EdgeDetectBrightnessTest, DetectUniformBrightness) {
const bool high_bd = GET_PARAM(3);
const int bd = GET_PARAM(4);
- ASSERT_EQ(0, av1_edge_exists(input_, stride_8tap(width), width, height,
- high_bd, bd));
+ ASSERT_EQ(
+ 0, av1_edge_exists(input_, stride_8tap(width), width, height, high_bd, bd)
+ .magnitude);
}
INSTANTIATE_TEST_CASE_P(ImageBrightnessTests, EdgeDetectBrightnessTest,
@@ -245,9 +246,11 @@ TEST_P(EdgeDetectImageTest, BlackWhite) {
free(orig);
// Value should be between 556 and 560.
ASSERT_LE(556, av1_edge_exists(padded, stride_8tap(width), width, height,
- high_bd, bd));
+ high_bd, bd)
+ .magnitude);
ASSERT_GE(560, av1_edge_exists(padded, stride_8tap(width), width, height,
- high_bd, bd));
+ high_bd, bd)
+ .magnitude);
free_pad_8tap(padded, width, high_bd);
}
diff --git a/libaom/test/encode_api_test.cc b/libaom/test/encode_api_test.cc
index c26f572..235480a 100644
--- a/libaom/test/encode_api_test.cc
+++ b/libaom/test/encode_api_test.cc
@@ -50,7 +50,7 @@ TEST(EncodeAPI, InvalidParams) {
EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
aom_codec_enc_init(&enc, kCodecs[i], NULL, 0));
EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
- aom_codec_enc_config_default(kCodecs[i], &cfg, 1));
+ aom_codec_enc_config_default(kCodecs[i], &cfg, 2));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(kCodecs[i], &cfg, 0));
EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
diff --git a/libaom/test/end_to_end_test.cc b/libaom/test/end_to_end_test.cc
index 9aa44c6..6ea09a6 100644
--- a/libaom/test/end_to_end_test.cc
+++ b/libaom/test/end_to_end_test.cc
@@ -53,6 +53,13 @@ typedef struct {
unsigned int profile;
} TestVideoParam;
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+ return os << "TestVideoParam { filename:" << test_arg.filename
+ << " input_bit_depth:" << test_arg.input_bit_depth
+ << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+ << " profile:" << test_arg.profile << "}";
+}
+
const TestVideoParam kTestVectors[] = {
{ "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
{ "park_joy_90p_8_422.y4m", 8, AOM_IMG_FMT_I422, AOM_BITS_8, 2 },
diff --git a/libaom/test/error_block_test.cc b/libaom/test/error_block_test.cc
index 353947c..3664ccf 100644
--- a/libaom/test/error_block_test.cc
+++ b/libaom/test/error_block_test.cc
@@ -156,6 +156,70 @@ TEST_P(ErrorBlockTest, ExtremeValues) {
<< "First failed at test case " << first_failure;
}
+TEST_P(ErrorBlockTest, DISABLED_Speed) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ DECLARE_ALIGNED(16, tran_low_t, coeff[4096]);
+ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[4096]);
+ intptr_t block_size;
+ int64_t ssz;
+ int num_iters = 100000;
+ int64_t ref_ssz;
+ int k;
+ const int msb = bit_depth_ + 8 - 1;
+ for (int i = 0; i < 9; ++i) {
+    block_size = 16 << (i % 9); // All block sizes from 4x4, 8x4 ... 64x64
+ for (k = 0; k < 9; k++) {
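+      // The first five passes draw full-range (msb-bit) coefficients; the
+      // remaining passes stay within 14 bits, presumably to exercise any
+      // narrower fast path in the SIMD code.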
+ for (int j = 0; j < block_size; j++) {
+ if (k < 5) {
+ if (rnd(2)) {
+ // Positive number
+ coeff[j] = rnd(1 << msb);
+ dqcoeff[j] = rnd(1 << msb);
+ } else {
+ // Negative number
+ coeff[j] = -rnd(1 << msb);
+ dqcoeff[j] = -rnd(1 << msb);
+ }
+ } else {
+ if (rnd(2)) {
+ // Positive number
+ coeff[j] = rnd(1 << 14);
+ dqcoeff[j] = rnd(1 << 14);
+ } else {
+ // Negative number
+ coeff[j] = -rnd(1 << 14);
+ dqcoeff[j] = -rnd(1 << 14);
+ }
+ }
+ }
+ aom_usec_timer ref_timer, test_timer;
+
+ aom_usec_timer_start(&ref_timer);
+ for (int i = 0; i < num_iters; ++i) {
+ ref_error_block_op_(coeff, dqcoeff, block_size, &ref_ssz, bit_depth_);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+ aom_usec_timer_start(&test_timer);
+ for (int i = 0; i < num_iters; ++i) {
+ error_block_op_(coeff, dqcoeff, block_size, &ssz, bit_depth_);
+ }
+ aom_usec_timer_mark(&test_timer);
+
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf(
+ " c_time=%d \t simd_time=%d \t "
+ "gain=%d \n",
+ elapsed_time_c, elapsed_time_simd,
+ (elapsed_time_c / elapsed_time_simd));
+ }
+ }
+}
+
#if (HAVE_SSE2 || HAVE_AVX)
using ::testing::make_tuple;
@@ -168,4 +232,17 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&av1_highbd_block_error_sse2,
&av1_highbd_block_error_c, AOM_BITS_8)));
#endif // HAVE_SSE2
+
+#if (HAVE_AVX2)
+using ::testing::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+ AVX2, ErrorBlockTest,
+ ::testing::Values(make_tuple(&av1_highbd_block_error_avx2,
+ &av1_highbd_block_error_c, AOM_BITS_10),
+ make_tuple(&av1_highbd_block_error_avx2,
+ &av1_highbd_block_error_c, AOM_BITS_12),
+ make_tuple(&av1_highbd_block_error_avx2,
+ &av1_highbd_block_error_c, AOM_BITS_8)));
+#endif // HAVE_AVX2
} // namespace
diff --git a/libaom/test/external_frame_buffer_test.cc b/libaom/test/external_frame_buffer_test.cc
index 6fcd9e7..4938a64 100644
--- a/libaom/test/external_frame_buffer_test.cc
+++ b/libaom/test/external_frame_buffer_test.cc
@@ -58,7 +58,7 @@ class ExternalFrameBufferList {
// Searches the frame buffer list for a free frame buffer. Makes sure
// that the frame buffer is at least |min_size| in bytes. Marks that the
- // frame buffer is in use by libvpx. Finally sets |fb| to point to the
+ // frame buffer is in use by libaom. Finally sets |fb| to point to the
// external frame buffer. Returns < 0 on an error.
int GetFreeFrameBuffer(size_t min_size, aom_codec_frame_buffer_t *fb) {
EXPECT_TRUE(fb != NULL);
@@ -114,9 +114,9 @@ class ExternalFrameBufferList {
return 0;
}
- // Checks that the ximage data is contained within the external frame buffer
- // private data passed back in the ximage.
- void CheckXImageFrameBuffer(const aom_image_t *img) {
+ // Checks that the aom_image_t data is contained within the external frame
+ // buffer private data passed back in the aom_image_t.
+ void CheckImageFrameBuffer(const aom_image_t *img) {
if (img->fb_priv != NULL) {
const struct ExternalFrameBuffer *const ext_fb =
reinterpret_cast<ExternalFrameBuffer *>(img->fb_priv);
@@ -158,7 +158,7 @@ class ExternalFrameBufferList {
#if CONFIG_WEBM_IO
-// Callback used by libvpx to request the application to return a frame
+// Callback used by libaom to request the application to return a frame
// buffer of at least |min_size| in bytes.
int get_aom_frame_buffer(void *user_priv, size_t min_size,
aom_codec_frame_buffer_t *fb) {
@@ -167,7 +167,7 @@ int get_aom_frame_buffer(void *user_priv, size_t min_size,
return fb_list->GetFreeFrameBuffer(min_size, fb);
}
-// Callback used by libvpx to tell the application that |fb| is not needed
+// Callback used by libaom to tell the application that |fb| is not needed
// anymore.
int release_aom_frame_buffer(void *user_priv, aom_codec_frame_buffer_t *fb) {
ExternalFrameBufferList *const fb_list =
@@ -218,7 +218,7 @@ class ExternalFrameBufferMD5Test
const libaom_test::CompressedVideoSource &video,
libaom_test::Decoder *decoder) {
if (num_buffers_ > 0 && video.frame_number() == 0) {
- // Have libvpx use frame buffers we create.
+ // Have libaom use frame buffers we create.
ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_));
ASSERT_EQ(AOM_CODEC_OK,
decoder->SetFrameBufferFunctions(GetAV1FrameBuffer,
@@ -299,7 +299,7 @@ class ExternalFrameBufferMD5Test
const char kAV1TestFile[] = "av1-1-b8-03-sizeup.mkv";
const char kAV1NonRefTestFile[] = "av1-1-b8-01-size-226x226.ivf";
-// Class for testing passing in external frame buffers to libvpx.
+// Class for testing passing in external frame buffers to libaom.
class ExternalFrameBufferTest : public ::testing::Test {
protected:
ExternalFrameBufferTest() : video_(NULL), decoder_(NULL), num_buffers_(0) {}
@@ -322,7 +322,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
video_ = NULL;
}
- // Passes the external frame buffer information to libvpx.
+ // Passes the external frame buffer information to libaom.
aom_codec_err_t SetFrameBufferFunctions(
int num_buffers, aom_get_frame_buffer_cb_fn_t cb_get,
aom_release_frame_buffer_cb_fn_t cb_release) {
@@ -359,7 +359,7 @@ class ExternalFrameBufferTest : public ::testing::Test {
// Get decompressed data
while ((img = dec_iter.Next()) != NULL) {
- fb_list_.CheckXImageFrameBuffer(img);
+ fb_list_.CheckImageFrameBuffer(img);
}
}
@@ -390,7 +390,7 @@ class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest {
#endif // CONFIG_WEBM_IO
// This test runs through the set of test vectors, and decodes them.
-// Libvpx will call into the application to allocate a frame buffer when
+// Libaom will call into the application to allocate a frame buffer when
// needed. The md5 checksums are computed for each frame in the video file.
// If the md5 checksums match the expected md5 data, the test passes.
// Otherwise, it fails.
diff --git a/libaom/test/fwd_kf_test.cc b/libaom/test/fwd_kf_test.cc
new file mode 100644
index 0000000..6c428d9
--- /dev/null
+++ b/libaom/test/fwd_kf_test.cc
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+typedef struct {
+ const int max_kf_dist;
+ const double psnr_thresh;
+} FwdKfTestParam;
+
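+// Each kf-max-dist is paired with a PSNR floor; the floors (presumably found
+// by experiment) loosen as the keyframe interval grows.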
+const FwdKfTestParam kTestParams[] = {
+ { 4, 37.3 }, { 6, 36.5 }, { 8, 35.8 },
+ { 12, 34.3 }, { 16, 34.3 }, { 18, 33.7 }
+};
+
+// Params: encoding mode and index into the kTestParams array, which controls
+// kf-max-dist and the corresponding PSNR threshold.
+class ForwardKeyTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ ForwardKeyTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ kf_max_dist_ind_(GET_PARAM(2)) {}
+ virtual ~ForwardKeyTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cpu_used_ = 2;
+ kf_max_dist_ = kTestParams[kf_max_dist_ind_].max_kf_dist;
+ psnr_threshold_ = kTestParams[kf_max_dist_ind_].psnr_thresh;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.rc_target_bitrate = 200;
+ cfg_.g_lag_in_frames = 10;
+ cfg_.fwd_kf_enabled = 1;
+ cfg_.kf_max_dist = kf_max_dist_;
+ cfg_.g_threads = 0;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ }
+
+ virtual void BeginPassHook(unsigned int) {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ double GetPsnrThreshold() { return psnr_threshold_; }
+
+ ::libaom_test::TestMode encoding_mode_;
+ const int kf_max_dist_ind_;
+ double psnr_threshold_;
+ int kf_max_dist_;
+ int cpu_used_;
+ int nframes_;
+ double psnr_;
+};
+
+TEST_P(ForwardKeyTest, ForwardKeyEncodeTest) {
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 20);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // TODO(sarahparker) Add functionality to assert that the minimum number of
+  // keyframes was placed.
+ EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold())
+ << "kf max dist = " << kf_max_dist_;
+}
+
+AV1_INSTANTIATE_TEST_CASE(
+ ForwardKeyTest, ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::Range(0, static_cast<int>(GTEST_ARRAY_SIZE_(kTestParams))));
+} // namespace
diff --git a/libaom/test/gf_max_pyr_height_test.cc b/libaom/test/gf_max_pyr_height_test.cc
new file mode 100644
index 0000000..2d78493
--- /dev/null
+++ b/libaom/test/gf_max_pyr_height_test.cc
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+
+namespace {
+
+static const struct GFMaxPyrHeightTestParam {
+ int gf_max_pyr_height;
+ double psnr_thresh;
+} kTestParams[] = {
+ { 0, 34.75 }, { 1, 34.75 }, { 2, 35.25 }, { 3, 35.50 }, { 4, 35.50 },
+};
+
+// The compiler may add padding to the struct above for alignment, which
+// gtest may try to print (on error, for example). This would cause
+// valgrind to complain that the padding is uninitialized. To avoid that, we
+// provide our own function to print the struct.
+// This also makes '--gtest_list_tests' output more understandable.
+std::ostream &operator<<(std::ostream &os, const GFMaxPyrHeightTestParam &p) {
+ os << "GFMaxPyrHeightTestParam { "
+ << "gf_max_pyr_height = " << p.gf_max_pyr_height << ", "
+ << "psnr_thresh = " << p.psnr_thresh << " }";
+ return os;
+}
+
+// Params: encoding mode and GFMaxPyrHeightTestParam object.
+class GFMaxPyrHeightTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode,
+ GFMaxPyrHeightTestParam>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ GFMaxPyrHeightTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)) {
+ gf_max_pyr_height_ = GET_PARAM(2).gf_max_pyr_height;
+ psnr_threshold_ = GET_PARAM(2).psnr_thresh;
+ }
+ virtual ~GFMaxPyrHeightTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ const aom_rational timebase = { 1, 30 };
+ cfg_.g_timebase = timebase;
+ cpu_used_ = 4;
+ cfg_.rc_end_usage = AOM_VBR;
+ cfg_.rc_target_bitrate = 200;
+ cfg_.g_lag_in_frames = 19;
+ cfg_.g_threads = 0;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ }
+
+ virtual void BeginPassHook(unsigned int) {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ encoder->Control(AV1E_SET_GF_MAX_PYRAMID_HEIGHT, gf_max_pyr_height_);
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ double GetPsnrThreshold() { return psnr_threshold_; }
+
+ ::libaom_test::TestMode encoding_mode_;
+ double psnr_threshold_;
+ int gf_max_pyr_height_;
+ int cpu_used_;
+ int nframes_;
+ double psnr_;
+};
+
+TEST_P(GFMaxPyrHeightTest, EncodeAndVerifyPSNR) {
+ libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ cfg_.g_timebase.den, cfg_.g_timebase.num,
+ 0, 32);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ EXPECT_GT(GetAveragePsnr(), GetPsnrThreshold())
+ << "GF Max Pyramid Height = " << gf_max_pyr_height_;
+}
+
+AV1_INSTANTIATE_TEST_CASE(GFMaxPyrHeightTest,
+ ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::ValuesIn(kTestParams));
+} // namespace
diff --git a/libaom/test/hiprec_convolve_test_util.cc b/libaom/test/hiprec_convolve_test_util.cc
index f5bf56e..2672bce 100644
--- a/libaom/test/hiprec_convolve_test_util.cc
+++ b/libaom/test/hiprec_convolve_test_util.cc
@@ -31,7 +31,7 @@ static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
hkernel[2] = hkernel[4] =
WIENER_FILT_TAP2_MINV +
rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
- hkernel[3] = -2 * (hkernel[0] + hkernel[1] + hkernel[2]);
+ hkernel[3] = -(hkernel[0] + hkernel[1] + hkernel[2]);
hkernel[7] = 0;
vkernel[0] = vkernel[6] =
@@ -43,7 +43,7 @@ static void generate_kernels(ACMRandom *rnd, InterpKernel hkernel,
vkernel[2] = vkernel[4] =
WIENER_FILT_TAP2_MINV +
rnd->PseudoUniform(WIENER_FILT_TAP2_MAXV + 1 - WIENER_FILT_TAP2_MINV);
- vkernel[3] = -2 * (vkernel[0] + vkernel[1] + vkernel[2]);
+ vkernel[3] = -(vkernel[0] + vkernel[1] + vkernel[2]);
vkernel[7] = 0;
}
diff --git a/libaom/test/horz_superres_test.cc b/libaom/test/horz_superres_test.cc
index 1627684..f2c2115 100644
--- a/libaom/test/horz_superres_test.cc
+++ b/libaom/test/horz_superres_test.cc
@@ -28,13 +28,8 @@ using ::testing::tuple;
/* TESTING PARAMETERS */
-#define NUM_TEST_VIDEOS 3
-
const int kBitrate = 40;
-// PSNR thresholds found by experiment
-const double kPSNRThresholds[] = { 26.0, 28.0, 20.0 };
-
typedef struct {
const char *filename;
aom_img_fmt fmt;
@@ -42,18 +37,20 @@ typedef struct {
unsigned int profile;
unsigned int limit;
unsigned int screen_content;
+ double psnr_threshold;
} TestVideoParam;
const TestVideoParam kTestVideoVectors[] = {
- { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0 },
- { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0 },
- { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1 },
+ { "park_joy_90p_8_420.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 5, 0, 26.0 },
+ { "park_joy_90p_10_444.y4m", AOM_IMG_FMT_I44416, AOM_BITS_10, 1, 5, 0, 28.0 },
+ { "screendata.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 4, 1, 20.0 },
+ // Image coding (single frame).
+ { "niklas_1280_720_30.y4m", AOM_IMG_FMT_I420, AOM_BITS_8, 0, 1, 0, 32.0 },
};
-// Superres modes tested
-// SUPERRES_QTHRESH is not included, as it has its own test
-const SUPERRES_MODE kSuperresModesNotQThresh[] = { SUPERRES_FIXED,
- SUPERRES_RANDOM };
+// Modes with extra params have their own tests.
+const SUPERRES_MODE kSuperresModesWithoutParams[] = { SUPERRES_RANDOM,
+ SUPERRES_AUTO };
// Superres denominators and superres kf denominators to be tested
typedef tuple<int, int> SuperresDenominatorPair;
@@ -74,10 +71,8 @@ const SuperresQThresholdPair kSuperresQThresholds[] = {
/* END (TESTING PARAMETERS) */
// Test parameter list:
-// <[needed for EncoderTest], test_video_idx_, superres_mode_,
-// tuple(superres_denom_, superres_kf_denom_)>
-typedef tuple<const libaom_test::CodecFactory *, int, SUPERRES_MODE,
- SuperresDenominatorPair>
+// <[needed for EncoderTest], test_video_param_, superres_mode_>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam, SUPERRES_MODE>
HorzSuperresTestParam;
class HorzSuperresEndToEndTest
@@ -85,16 +80,113 @@ class HorzSuperresEndToEndTest
public ::libaom_test::EncoderTest {
protected:
HorzSuperresEndToEndTest()
- : EncoderTest(GET_PARAM(0)), test_video_idx_(GET_PARAM(1)),
- superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {
- test_video_param_ = kTestVideoVectors[test_video_idx_];
+ : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+ superres_mode_(GET_PARAM(2)), psnr_(0.0), frame_count_(0) {}
+
+ virtual ~HorzSuperresEndToEndTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(::libaom_test::kTwoPassGood);
+ cfg_.g_lag_in_frames = 5;
+ cfg_.rc_end_usage = AOM_Q;
+ cfg_.rc_target_bitrate = kBitrate;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = (unsigned int)test_video_param_.bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ // Set superres parameters
+ cfg_.rc_superres_mode = superres_mode_;
+ }
+
+ virtual void BeginPassHook(unsigned int) {
+ psnr_ = 0.0;
+ frame_count_ = 0;
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ psnr_ += pkt->data.psnr.psnr[0];
+ frame_count_++;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 4);
+
+ // Set cpu-used = 8 for speed
+ encoder->Control(AOME_SET_CPUUSED, 8);
+
+ // Test screen coding tools
+ if (test_video_param_.screen_content)
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN);
+ else
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
- SuperresDenominatorPair denoms = GET_PARAM(3);
+ double GetAveragePsnr() const {
+ if (frame_count_) return psnr_ / frame_count_;
+ return 0.0;
+ }
+
+ void DoTest() {
+ std::unique_ptr<libaom_test::VideoSource> video;
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ test_video_param_.limit));
+ ASSERT_TRUE(video.get() != NULL);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ const double psnr = GetAveragePsnr();
+ EXPECT_GT(psnr, test_video_param_.psnr_threshold)
+ << "superres_mode_ = " << superres_mode_;
+
+ EXPECT_EQ(test_video_param_.limit, frame_count_)
+ << "superres_mode_ = " << superres_mode_;
+ }
+
+ TestVideoParam test_video_param_;
+ SUPERRES_MODE superres_mode_;
+
+ private:
+ double psnr_;
+ unsigned int frame_count_;
+};
+
+TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest,
+ ::testing::ValuesIn(kTestVideoVectors),
+ ::testing::ValuesIn(kSuperresModesWithoutParams));
+
+// Test parameter list:
+// <[needed for EncoderTest], test_video_param_, tuple(superres_denom_,
+// superres_kf_denom_)>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
+ SuperresDenominatorPair>
+ HorzSuperresFixedTestParam;
+
+class HorzSuperresFixedEndToEndTest
+ : public ::testing::TestWithParam<HorzSuperresFixedTestParam>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ HorzSuperresFixedEndToEndTest()
+ : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+ superres_mode_(SUPERRES_FIXED), psnr_(0.0), frame_count_(0) {
+ SuperresDenominatorPair denoms = GET_PARAM(2);
superres_denom_ = ::testing::get<0>(denoms);
superres_kf_denom_ = ::testing::get<1>(denoms);
}
- virtual ~HorzSuperresEndToEndTest() {}
+ virtual ~HorzSuperresFixedEndToEndTest() {}
virtual void SetUp() {
InitializeConfig();
@@ -151,8 +243,6 @@ class HorzSuperresEndToEndTest
return 0.0;
}
- double GetPsnrThreshold() { return kPSNRThresholds[test_video_idx_]; }
-
void DoTest() {
std::unique_ptr<libaom_test::VideoSource> video;
video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
@@ -161,7 +251,7 @@ class HorzSuperresEndToEndTest
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
const double psnr = GetAveragePsnr();
- EXPECT_GT(psnr, GetPsnrThreshold())
+ EXPECT_GT(psnr, test_video_param_.psnr_threshold)
<< "superres_mode_ = " << superres_mode_
<< ", superres_denom_ = " << superres_denom_
<< ", superres_kf_denom_ = " << superres_kf_denom_;
@@ -172,7 +262,6 @@ class HorzSuperresEndToEndTest
<< ", superres_kf_denom_ = " << superres_kf_denom_;
}
- int test_video_idx_;
TestVideoParam test_video_param_;
SUPERRES_MODE superres_mode_;
int superres_denom_;
@@ -183,17 +272,16 @@ class HorzSuperresEndToEndTest
unsigned int frame_count_;
};
-TEST_P(HorzSuperresEndToEndTest, HorzSuperresEndToEndPSNRTest) { DoTest(); }
+TEST_P(HorzSuperresFixedEndToEndTest, HorzSuperresFixedTestParam) { DoTest(); }
-AV1_INSTANTIATE_TEST_CASE(HorzSuperresEndToEndTest,
- ::testing::Range(0, NUM_TEST_VIDEOS),
- ::testing::ValuesIn(kSuperresModesNotQThresh),
+AV1_INSTANTIATE_TEST_CASE(HorzSuperresFixedEndToEndTest,
+ ::testing::ValuesIn(kTestVideoVectors),
::testing::ValuesIn(kSuperresDenominators));
// Test parameter list:
-// <[needed for EncoderTest], test_video_idx_, tuple(superres_denom_,
-// superres_kf_denom_), tuple(superres_qthresh_,superres_kf_qthresh_)>
-typedef tuple<const libaom_test::CodecFactory *, int, SuperresDenominatorPair,
+// <[needed for EncoderTest], test_video_param_,
+// tuple(superres_qthresh_,superres_kf_qthresh_)>
+typedef tuple<const libaom_test::CodecFactory *, TestVideoParam,
SuperresQThresholdPair>
HorzSuperresQThreshTestParam;
@@ -202,15 +290,9 @@ class HorzSuperresQThreshEndToEndTest
public ::libaom_test::EncoderTest {
protected:
HorzSuperresQThreshEndToEndTest()
- : EncoderTest(GET_PARAM(0)), test_video_idx_(GET_PARAM(1)),
+ : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
superres_mode_(SUPERRES_QTHRESH), psnr_(0.0), frame_count_(0) {
- test_video_param_ = kTestVideoVectors[test_video_idx_];
-
- SuperresDenominatorPair denoms = GET_PARAM(2);
- superres_denom_ = ::testing::get<0>(denoms);
- superres_kf_denom_ = ::testing::get<1>(denoms);
-
- SuperresQThresholdPair qthresholds = GET_PARAM(3);
+ SuperresQThresholdPair qthresholds = GET_PARAM(2);
superres_qthresh_ = ::testing::get<0>(qthresholds);
superres_kf_qthresh_ = ::testing::get<1>(qthresholds);
}
@@ -232,8 +314,6 @@ class HorzSuperresQThreshEndToEndTest
// Set superres parameters
cfg_.rc_superres_mode = superres_mode_;
- cfg_.rc_superres_denominator = superres_denom_;
- cfg_.rc_superres_kf_denominator = superres_kf_denom_;
cfg_.rc_superres_qthresh = superres_qthresh_;
cfg_.rc_superres_kf_qthresh = superres_kf_qthresh_;
}
@@ -274,8 +354,6 @@ class HorzSuperresQThreshEndToEndTest
return 0.0;
}
- double GetPsnrThreshold() { return kPSNRThresholds[test_video_idx_]; }
-
void DoTest() {
std::unique_ptr<libaom_test::VideoSource> video;
video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
@@ -284,26 +362,19 @@ class HorzSuperresQThreshEndToEndTest
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
const double psnr = GetAveragePsnr();
- EXPECT_GT(psnr, GetPsnrThreshold())
+ EXPECT_GT(psnr, test_video_param_.psnr_threshold)
<< "superres_mode_ = " << superres_mode_
- << ", superres_denom_ = " << superres_denom_
- << ", superres_kf_denom_ = " << superres_kf_denom_
<< ", superres_qthresh_ = " << superres_qthresh_
<< ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
EXPECT_EQ(test_video_param_.limit, frame_count_)
<< "superres_mode_ = " << superres_mode_
- << ", superres_denom_ = " << superres_denom_
- << ", superres_kf_denom_ = " << superres_kf_denom_
<< ", superres_qthresh_ = " << superres_qthresh_
<< ", superres_kf_qthresh_ = " << superres_kf_qthresh_;
}
- int test_video_idx_;
TestVideoParam test_video_param_;
SUPERRES_MODE superres_mode_;
- int superres_denom_;
- int superres_kf_denom_;
int superres_qthresh_;
int superres_kf_qthresh_;
@@ -317,8 +388,7 @@ TEST_P(HorzSuperresQThreshEndToEndTest, HorzSuperresQThreshEndToEndPSNRTest) {
}
AV1_INSTANTIATE_TEST_CASE(HorzSuperresQThreshEndToEndTest,
- ::testing::Range(0, NUM_TEST_VIDEOS),
- ::testing::ValuesIn(kSuperresDenominators),
+ ::testing::ValuesIn(kTestVideoVectors),
::testing::ValuesIn(kSuperresQThresholds));
} // namespace
diff --git a/libaom/test/level_test.cc b/libaom/test/level_test.cc
new file mode 100644
index 0000000..e3b0ef1
--- /dev/null
+++ b/libaom/test/level_test.cc
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+// Speed settings tested
+static const int kCpuUsedVectors[] = {
+ 1,
+ 2,
+ 3,
+ 4,
+};
+
+class LevelTest
+ : public ::libaom_test::CodecTestWith2Params<libaom_test::TestMode, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ LevelTest()
+ : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
+ cpu_used_(GET_PARAM(2)), target_level_(31) {}
+
+ virtual ~LevelTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(encoding_mode_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ cfg_.g_lag_in_frames = 5;
+ cfg_.rc_end_usage = AOM_VBR;
+ } else {
+ cfg_.g_lag_in_frames = 0;
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ }
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AV1E_SET_TARGET_SEQ_LEVEL_IDX, target_level_);
+ if (encoding_mode_ != ::libaom_test::kRealTime) {
+ encoder->Control(AOME_SET_ENABLEAUTOALTREF, 1);
+ encoder->Control(AOME_SET_ARNR_MAXFRAMES, 7);
+ encoder->Control(AOME_SET_ARNR_STRENGTH, 5);
+ }
+ }
+ }
+
+ libaom_test::TestMode encoding_mode_;
+ int cpu_used_;
+ int target_level_;
+};
+
+TEST_P(LevelTest, TestTargetLevelApi) {
+ static const aom_codec_iface_t *codec = &aom_codec_av1_cx_algo;
+ aom_codec_ctx_t enc;
+ aom_codec_enc_cfg_t cfg;
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_config_default(codec, &cfg, 0));
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_enc_init(&enc, codec, &cfg, 0));
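+  // The control packs its argument as operating_point * 100 + level_index.
+  // Level indices 0..23 and the special value 31 (maximum parameters) are
+  // accepted; indices for out-of-range operating points (> 31) appear to be
+  // ignored rather than rejected.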
+ for (int operating_point = 0; operating_point <= 32; ++operating_point) {
+ for (int level = 0; level <= 32; ++level) {
+ const int target_level = operating_point * 100 + level;
+ if ((level >= 0 && level <= 23) || level == 31 || operating_point > 31) {
+ EXPECT_EQ(AOM_CODEC_OK,
+ aom_codec_control(&enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX,
+ target_level));
+ } else {
+ EXPECT_EQ(AOM_CODEC_INVALID_PARAM,
+ aom_codec_control(&enc, AV1E_SET_TARGET_SEQ_LEVEL_IDX,
+ target_level));
+ }
+ }
+ }
+ EXPECT_EQ(AOM_CODEC_OK, aom_codec_destroy(&enc));
+}
+
+TEST_P(LevelTest, TestTargetLevel19) {
+ std::unique_ptr<libaom_test::VideoSource> video;
+ video.reset(new libaom_test::Y4mVideoSource("park_joy_90p_8_420.y4m", 0, 10));
+ ASSERT_TRUE(video.get() != NULL);
+  // Level index 19 corresponds to level 6.3.
+ target_level_ = 19;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+AV1_INSTANTIATE_TEST_CASE(LevelTest,
+ ::testing::Values(::libaom_test::kTwoPassGood),
+ ::testing::ValuesIn(kCpuUsedVectors));
+} // namespace
diff --git a/libaom/test/quantize_func_test.cc b/libaom/test/quantize_func_test.cc
index 8dee864..067a981 100644
--- a/libaom/test/quantize_func_test.cc
+++ b/libaom/test/quantize_func_test.cc
@@ -63,7 +63,7 @@ void highbd_quan64x64_wrapper(QUAN_PARAM_LIST) {
HBD_QUAN_FUNC;
}
-typedef enum { TYPE_B, TYPE_DC, TYPE_FP } QuantType;
+enum { TYPE_B, TYPE_DC, TYPE_FP } UENUM1BYTE(QuantType);
using ::testing::tuple;
typedef tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, QuantType, aom_bit_depth_t>
@@ -191,6 +191,13 @@ class QuantizeTest : public ::testing::TestWithParam<QuantizeParam> {
}
}
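+  // Zero the block, then fill the first |num| coefficients with random
+  // values, so callers can control how far into the block nonzero
+  // coefficients extend.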
+ void FillCoeffRandomRows(int num) {
+ FillCoeffZero();
+ for (int i = 0; i < num; ++i) {
+ coeff_[i] = GetRandomCoeff();
+ }
+ }
+
void FillCoeffZero() { FillCoeff(0); }
void FillCoeffConstant() {
@@ -287,28 +294,31 @@ TEST_P(QuantizeTest, DISABLED_Speed) {
const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
const int kNumTests = 5000000;
aom_usec_timer timer, simd_timer;
+ int rows = tx_size_high[tx_size_];
+ int cols = tx_size_wide[tx_size_];
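+  // Sweep the count of populated coefficient rows from empty to full block,
+  // timing the reference and SIMD quantizers at each step; throughput can
+  // depend on how early the nonzero coefficients run out.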
+ for (int cnt = 0; cnt <= rows; cnt++) {
+ FillCoeffRandomRows(cnt * cols);
+
+ aom_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift,
+ qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan);
+ }
+ aom_usec_timer_mark(&timer);
- FillCoeffRandom();
-
- aom_usec_timer_start(&timer);
- for (int n = 0; n < kNumTests; ++n) {
- quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift,
- qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan);
- }
- aom_usec_timer_mark(&timer);
+ aom_usec_timer_start(&simd_timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
+ dqcoeff, dequant, eob, sc->scan, sc->iscan);
+ }
+ aom_usec_timer_mark(&simd_timer);
- aom_usec_timer_start(&simd_timer);
- for (int n = 0; n < kNumTests; ++n) {
- quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
- dqcoeff, dequant, eob, sc->scan, sc->iscan);
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ const int simd_elapsed_time =
+ static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
+ printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time,
+ simd_elapsed_time, (elapsed_time / simd_elapsed_time));
}
- aom_usec_timer_mark(&simd_timer);
-
- const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- const int simd_elapsed_time =
- static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
- printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time,
- simd_elapsed_time, (elapsed_time / simd_elapsed_time));
}
using ::testing::make_tuple;
@@ -398,6 +408,24 @@ const QuantizeParam kQParamArraySSE2[] = {
TX_32X32, TYPE_B, AOM_BITS_10),
make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
TX_32X32, TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+ TX_64X64, TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+ TX_64X64, TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+ TX_64X64, TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2,
+ TX_16X16, TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, TX_8X8,
+ TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, TX_4X4,
+ TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_adaptive_c,
+ &aom_quantize_b_32x32_adaptive_sse2, TX_32X16, TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_adaptive_c,
+ &aom_quantize_b_32x32_adaptive_sse2, TX_16X32, TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_32x32_adaptive_c,
+ &aom_quantize_b_32x32_adaptive_sse2, TX_32X32, TYPE_B, AOM_BITS_8)
};
INSTANTIATE_TEST_CASE_P(SSE2, QuantizeTest,
@@ -411,6 +439,9 @@ INSTANTIATE_TEST_CASE_P(
TX_16X16, TYPE_B, AOM_BITS_8),
make_tuple(&aom_quantize_b_32x32_c,
&aom_quantize_b_32x32_ssse3, TX_32X32, TYPE_B,
+ AOM_BITS_8),
+ make_tuple(&aom_quantize_b_64x64_c,
+ &aom_quantize_b_64x64_ssse3, TX_64X64, TYPE_B,
AOM_BITS_8)));
#endif // HAVE_SSSE3 && ARCH_X86_64
diff --git a/libaom/test/resize_test.cc b/libaom/test/resize_test.cc
index b270b83..39e7d1b 100644
--- a/libaom/test/resize_test.cc
+++ b/libaom/test/resize_test.cc
@@ -297,7 +297,7 @@ class ResizeInternalTestLarge : public ResizeTest {
virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0];
- EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.5);
+ EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 3.0);
}
#if WRITE_COMPRESSED_STREAM
@@ -374,6 +374,7 @@ class ResizeRealtimeTest
if (video->frame() == 0) {
encoder->Control(AV1E_SET_AQ_MODE, 3);
encoder->Control(AOME_SET_CPUUSED, set_cpu_used_);
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
}
if (change_bitrate_ && video->frame() == 120) {
diff --git a/libaom/test/rt_end_to_end_test.cc b/libaom/test/rt_end_to_end_test.cc
new file mode 100644
index 0000000..9c3e96b
--- /dev/null
+++ b/libaom/test/rt_end_to_end_test.cc
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/util.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+
+namespace {
+
+const unsigned int kFrames = 10;
+const int kBitrate = 500;
+
+// List of psnr thresholds for speed settings 0-8
+const double kPsnrThreshold[9] = { 36.9, 36.9, 36.85, 36.8, 36.6,
+ 36.4, 36.0, 35.5, 35.0 };
+
+typedef struct {
+ const char *filename;
+ unsigned int input_bit_depth;
+ aom_img_fmt fmt;
+ aom_bit_depth_t bit_depth;
+ unsigned int profile;
+} TestVideoParam;
+
+std::ostream &operator<<(std::ostream &os, const TestVideoParam &test_arg) {
+ return os << "TestVideoParam { filename:" << test_arg.filename
+ << " input_bit_depth:" << test_arg.input_bit_depth
+ << " fmt:" << test_arg.fmt << " bit_depth:" << test_arg.bit_depth
+ << " profile:" << test_arg.profile << "}";
+}
+
+// TODO(kyslov): Add more test vectors
+const TestVideoParam kTestVectors[] = {
+ { "park_joy_90p_8_420.y4m", 8, AOM_IMG_FMT_I420, AOM_BITS_8, 0 },
+};
+
+// Speed settings tested
+const int kCpuUsedVectors[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
+
+class RTEndToEndTest
+ : public ::libaom_test::CodecTestWith2Params<TestVideoParam, int>,
+ public ::libaom_test::EncoderTest {
+ protected:
+ RTEndToEndTest()
+ : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)),
+ cpu_used_(GET_PARAM(2)), psnr_(0.0), nframes_(0) {}
+
+ virtual ~RTEndToEndTest() {}
+
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(::libaom_test::kRealTime);
+
+ cfg_.g_usage = 1; // TODO(kyslov): Move it to encode_test_driver.cc
+ cfg_.rc_end_usage = AOM_CBR;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 600;
+ }
+
+ virtual void BeginPassHook(unsigned int) {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ }
+
+ virtual void PSNRPktHook(const aom_codec_cx_pkt_t *pkt) {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
+ virtual void PreEncodeFrameHook(::libaom_test::VideoSource *video,
+ ::libaom_test::Encoder *encoder) {
+ if (video->frame() == 0) {
+ encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1);
+ encoder->Control(AV1E_SET_TILE_COLUMNS, 1);
+ encoder->Control(AOME_SET_CPUUSED, cpu_used_);
+ encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_DEFAULT);
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ double GetPsnrThreshold() { return kPsnrThreshold[cpu_used_]; }
+
+ void DoTest() {
+ cfg_.rc_target_bitrate = kBitrate;
+ cfg_.g_error_resilient = 0;
+ cfg_.g_profile = test_video_param_.profile;
+ cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+ cfg_.g_bit_depth = test_video_param_.bit_depth;
+ init_flags_ = AOM_CODEC_USE_PSNR;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= AOM_CODEC_USE_HIGHBITDEPTH;
+
+ std::unique_ptr<libaom_test::VideoSource> video;
+ video.reset(new libaom_test::Y4mVideoSource(test_video_param_.filename, 0,
+ kFrames));
+ ASSERT_TRUE(video.get() != NULL);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+ const double psnr = GetAveragePsnr();
+ EXPECT_GT(psnr, GetPsnrThreshold()) << "cpu used = " << cpu_used_;
+ }
+
+ TestVideoParam test_video_param_;
+ int cpu_used_;
+
+ private:
+ double psnr_;
+ unsigned int nframes_;
+};
+
+class RTEndToEndTestLarge : public RTEndToEndTest {};
+
+TEST_P(RTEndToEndTestLarge, EndtoEndPSNRTest) { DoTest(); }
+
+TEST_P(RTEndToEndTest, EndtoEndPSNRTest) { DoTest(); }
+
+AV1_INSTANTIATE_TEST_CASE(RTEndToEndTestLarge,
+ ::testing::ValuesIn(kTestVectors),
+ ::testing::ValuesIn(kCpuUsedVectors));
+
+AV1_INSTANTIATE_TEST_CASE(RTEndToEndTest, ::testing::Values(kTestVectors[0]),
+ ::testing::Values(kCpuUsedVectors[8]));
+} // namespace
diff --git a/libaom/test/sad_test.cc b/libaom/test/sad_test.cc
index 845fe79..87dbb33 100644
--- a/libaom/test/sad_test.cc
+++ b/libaom/test/sad_test.cc
@@ -35,22 +35,25 @@ typedef uint32_t (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
const uint8_t *second_pred);
typedef ::testing::tuple<int, int, SadMxNAvgFunc, int> SadMxNAvgParam;
-typedef void (*JntCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
- int width, int height, const uint8_t *ref,
- int ref_stride,
- const JNT_COMP_PARAMS *jcp_param);
-typedef ::testing::tuple<int, int, JntCompAvgFunc, int> JntCompAvgParam;
-
-typedef unsigned int (*JntSadMxhFunc)(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- int width, int height);
-typedef ::testing::tuple<int, int, JntSadMxhFunc, int> JntSadMxhParam;
-
-typedef uint32_t (*JntSadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- const uint8_t *second_pred,
- const JNT_COMP_PARAMS *jcp_param);
-typedef ::testing::tuple<int, int, JntSadMxNAvgFunc, int> JntSadMxNAvgParam;
+typedef void (*DistWtdCompAvgFunc)(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param);
+typedef ::testing::tuple<int, int, DistWtdCompAvgFunc, int> DistWtdCompAvgParam;
+
+typedef unsigned int (*DistWtdSadMxhFunc)(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int width,
+ int height);
+typedef ::testing::tuple<int, int, DistWtdSadMxhFunc, int> DistWtdSadMxhParam;
+
+typedef uint32_t (*DistWtdSadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param);
+typedef ::testing::tuple<int, int, DistWtdSadMxNAvgFunc, int>
+ DistWtdSadMxNAvgParam;
typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_ptr[], int ref_stride,
@@ -203,7 +206,7 @@ class SADTestBase : public ::testing::Test {
return sad;
}
- void ReferenceJntCompAvg(int block_idx) {
+ void ReferenceDistWtdCompAvg(int block_idx) {
const uint8_t *const reference8 = GetReference(block_idx);
const uint8_t *const second_pred8 = second_pred_;
uint8_t *const comp_pred8 = comp_pred_;
@@ -228,7 +231,7 @@ class SADTestBase : public ::testing::Test {
}
}
- unsigned int ReferenceJntSADavg(int block_idx) {
+ unsigned int ReferenceDistWtdSADavg(int block_idx) {
unsigned int sad = 0;
const uint8_t *const reference8 = GetReference(block_idx);
const uint8_t *const source8 = source_data_;
@@ -305,7 +308,7 @@ class SADTestBase : public ::testing::Test {
static uint8_t *comp_pred_test_;
static uint8_t *comp_pred8_test_;
static uint16_t *comp_pred16_test_;
- JNT_COMP_PARAMS jcp_param_;
+ DIST_WTD_COMP_PARAMS jcp_param_;
ACMRandom rnd_;
};
@@ -391,13 +394,15 @@ class SADavgTest : public ::testing::WithParamInterface<SadMxNAvgParam>,
}
};
-class JntCompAvgTest : public ::testing::WithParamInterface<JntCompAvgParam>,
- public SADTestBase {
+class DistWtdCompAvgTest
+ : public ::testing::WithParamInterface<DistWtdCompAvgParam>,
+ public SADTestBase {
public:
- JntCompAvgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+ DistWtdCompAvgTest()
+ : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
protected:
- void jnt_comp_avg(int block_idx) {
+ void dist_wtd_comp_avg(int block_idx) {
const uint8_t *const reference = GetReference(block_idx);
ASM_REGISTER_STATE_CHECK(GET_PARAM(2)(comp_pred_test_, second_pred_, width_,
@@ -411,8 +416,8 @@ class JntCompAvgTest : public ::testing::WithParamInterface<JntCompAvgParam>,
jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
- ReferenceJntCompAvg(0);
- jnt_comp_avg(0);
+ ReferenceDistWtdCompAvg(0);
+ dist_wtd_comp_avg(0);
for (int y = 0; y < height_; ++y)
for (int x = 0; x < width_; ++x)
@@ -423,10 +428,10 @@ class JntCompAvgTest : public ::testing::WithParamInterface<JntCompAvgParam>,
}
};
-class JntSADTest : public ::testing::WithParamInterface<JntSadMxhParam>,
- public SADTestBase {
+class DistWtdSADTest : public ::testing::WithParamInterface<DistWtdSadMxhParam>,
+ public SADTestBase {
public:
- JntSADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+ DistWtdSADTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
protected:
unsigned int SAD(int block_idx) {
@@ -455,13 +460,14 @@ class JntSADTest : public ::testing::WithParamInterface<JntSadMxhParam>,
}
};
-class JntSADavgTest : public ::testing::WithParamInterface<JntSadMxNAvgParam>,
- public SADTestBase {
+class DistWtdSADavgTest
+ : public ::testing::WithParamInterface<DistWtdSadMxNAvgParam>,
+ public SADTestBase {
public:
- JntSADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
+ DistWtdSADavgTest() : SADTestBase(GET_PARAM(0), GET_PARAM(1), GET_PARAM(3)) {}
protected:
- unsigned int jnt_SAD_avg(int block_idx) {
+ unsigned int dist_wtd_SAD_avg(int block_idx) {
unsigned int ret;
const uint8_t *const reference = GetReference(block_idx);
@@ -477,8 +483,8 @@ class JntSADavgTest : public ::testing::WithParamInterface<JntSadMxNAvgParam>,
jcp_param_.fwd_offset = quant_dist_lookup_table[j][i][0];
jcp_param_.bck_offset = quant_dist_lookup_table[j][i][1];
- const unsigned int reference_sad = ReferenceJntSADavg(0);
- const unsigned int exp_sad = jnt_SAD_avg(0);
+ const unsigned int reference_sad = ReferenceDistWtdSADavg(0);
+ const unsigned int exp_sad = dist_wtd_SAD_avg(0);
ASSERT_EQ(reference_sad, exp_sad);
}
@@ -608,19 +614,19 @@ TEST_P(SADavgTest, ShortSrc) {
source_stride_ = tmp_stride;
}
-TEST_P(JntCompAvgTest, MaxRef) {
+TEST_P(DistWtdCompAvgTest, MaxRef) {
FillConstant(reference_data_, reference_stride_, mask_);
FillConstant(second_pred_, width_, 0);
CheckCompAvg();
}
-TEST_P(JntCompAvgTest, MaxSecondPred) {
+TEST_P(DistWtdCompAvgTest, MaxSecondPred) {
FillConstant(reference_data_, reference_stride_, 0);
FillConstant(second_pred_, width_, mask_);
CheckCompAvg();
}
-TEST_P(JntCompAvgTest, ShortRef) {
+TEST_P(DistWtdCompAvgTest, ShortRef) {
const int tmp_stride = reference_stride_;
reference_stride_ >>= 1;
FillRandom(reference_data_, reference_stride_);
@@ -629,7 +635,7 @@ TEST_P(JntCompAvgTest, ShortRef) {
reference_stride_ = tmp_stride;
}
-TEST_P(JntCompAvgTest, UnalignedRef) {
+TEST_P(DistWtdCompAvgTest, UnalignedRef) {
// The reference frame, but not the source frame, may be unaligned for
// certain types of searches.
const int tmp_stride = reference_stride_;
@@ -640,19 +646,19 @@ TEST_P(JntCompAvgTest, UnalignedRef) {
reference_stride_ = tmp_stride;
}
-TEST_P(JntSADTest, MaxRef) {
+TEST_P(DistWtdSADTest, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
FillConstant(reference_data_, reference_stride_, mask_);
CheckSAD();
}
-TEST_P(JntSADTest, MaxSrc) {
+TEST_P(DistWtdSADTest, MaxSrc) {
FillConstant(source_data_, source_stride_, mask_);
FillConstant(reference_data_, reference_stride_, 0);
CheckSAD();
}
-TEST_P(JntSADTest, ShortRef) {
+TEST_P(DistWtdSADTest, ShortRef) {
const int tmp_stride = reference_stride_;
reference_stride_ >>= 1;
FillRandom(source_data_, source_stride_);
@@ -661,7 +667,7 @@ TEST_P(JntSADTest, ShortRef) {
reference_stride_ = tmp_stride;
}
-TEST_P(JntSADTest, UnalignedRef) {
+TEST_P(DistWtdSADTest, UnalignedRef) {
// The reference frame, but not the source frame, may be unaligned for
// certain types of searches.
const int tmp_stride = reference_stride_;
@@ -672,7 +678,7 @@ TEST_P(JntSADTest, UnalignedRef) {
reference_stride_ = tmp_stride;
}
-TEST_P(JntSADTest, ShortSrc) {
+TEST_P(DistWtdSADTest, ShortSrc) {
const int tmp_stride = source_stride_;
source_stride_ >>= 1;
int test_count = 2000;
@@ -685,20 +691,20 @@ TEST_P(JntSADTest, ShortSrc) {
source_stride_ = tmp_stride;
}
-TEST_P(JntSADavgTest, MaxRef) {
+TEST_P(DistWtdSADavgTest, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
FillConstant(reference_data_, reference_stride_, mask_);
FillConstant(second_pred_, width_, 0);
CheckSAD();
}
-TEST_P(JntSADavgTest, MaxSrc) {
+TEST_P(DistWtdSADavgTest, MaxSrc) {
FillConstant(source_data_, source_stride_, mask_);
FillConstant(reference_data_, reference_stride_, 0);
FillConstant(second_pred_, width_, 0);
CheckSAD();
}
-TEST_P(JntSADavgTest, ShortRef) {
+TEST_P(DistWtdSADavgTest, ShortRef) {
const int tmp_stride = reference_stride_;
reference_stride_ >>= 1;
FillRandom(source_data_, source_stride_);
@@ -708,7 +714,7 @@ TEST_P(JntSADavgTest, ShortRef) {
reference_stride_ = tmp_stride;
}
-TEST_P(JntSADavgTest, UnalignedRef) {
+TEST_P(DistWtdSADavgTest, UnalignedRef) {
// The reference frame, but not the source frame, may be unaligned for
// certain types of searches.
const int tmp_stride = reference_stride_;
@@ -720,7 +726,7 @@ TEST_P(JntSADavgTest, UnalignedRef) {
reference_stride_ = tmp_stride;
}
-TEST_P(JntSADavgTest, ShortSrc) {
+TEST_P(DistWtdSADavgTest, ShortSrc) {
const int tmp_stride = source_stride_;
source_stride_ >>= 1;
int test_count = 2000;
@@ -947,47 +953,48 @@ const SadMxNAvgParam avg_c_tests[] = {
INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
// TODO(chengchen): add highbd tests
-const JntCompAvgParam jnt_comp_avg_c_tests[] = {
- make_tuple(128, 128, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(128, 64, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(64, 128, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(64, 64, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(64, 32, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(32, 64, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(32, 32, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(32, 16, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(16, 32, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(16, 16, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(16, 8, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(8, 16, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(8, 8, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(8, 4, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(4, 8, &aom_jnt_comp_avg_pred_c, -1),
- make_tuple(4, 4, &aom_jnt_comp_avg_pred_c, -1),
+const DistWtdCompAvgParam dist_wtd_comp_avg_c_tests[] = {
+ make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_c, -1),
+ make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_c, -1),
};
-INSTANTIATE_TEST_CASE_P(C, JntCompAvgTest,
- ::testing::ValuesIn(jnt_comp_avg_c_tests));
-
-const JntSadMxNAvgParam jnt_avg_c_tests[] = {
- make_tuple(128, 128, &aom_jnt_sad128x128_avg_c, -1),
- make_tuple(128, 64, &aom_jnt_sad128x64_avg_c, -1),
- make_tuple(64, 128, &aom_jnt_sad64x128_avg_c, -1),
- make_tuple(64, 64, &aom_jnt_sad64x64_avg_c, -1),
- make_tuple(64, 32, &aom_jnt_sad64x32_avg_c, -1),
- make_tuple(32, 64, &aom_jnt_sad32x64_avg_c, -1),
- make_tuple(32, 32, &aom_jnt_sad32x32_avg_c, -1),
- make_tuple(32, 16, &aom_jnt_sad32x16_avg_c, -1),
- make_tuple(16, 32, &aom_jnt_sad16x32_avg_c, -1),
- make_tuple(16, 16, &aom_jnt_sad16x16_avg_c, -1),
- make_tuple(16, 8, &aom_jnt_sad16x8_avg_c, -1),
- make_tuple(8, 16, &aom_jnt_sad8x16_avg_c, -1),
- make_tuple(8, 8, &aom_jnt_sad8x8_avg_c, -1),
- make_tuple(8, 4, &aom_jnt_sad8x4_avg_c, -1),
- make_tuple(4, 8, &aom_jnt_sad4x8_avg_c, -1),
- make_tuple(4, 4, &aom_jnt_sad4x4_avg_c, -1),
+INSTANTIATE_TEST_CASE_P(C, DistWtdCompAvgTest,
+ ::testing::ValuesIn(dist_wtd_comp_avg_c_tests));
+
+const DistWtdSadMxNAvgParam dist_wtd_avg_c_tests[] = {
+ make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_c, -1),
+ make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_c, -1),
+ make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_c, -1),
+ make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_c, -1),
+ make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_c, -1),
+ make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_c, -1),
+ make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_c, -1),
+ make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_c, -1),
+ make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_c, -1),
+ make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_c, -1),
+ make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_c, -1),
+ make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_c, -1),
+ make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_c, -1),
+ make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_c, -1),
+ make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_c, -1),
+ make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_c, -1),
};
-INSTANTIATE_TEST_CASE_P(C, JntSADavgTest, ::testing::ValuesIn(jnt_avg_c_tests));
+INSTANTIATE_TEST_CASE_P(C, DistWtdSADavgTest,
+ ::testing::ValuesIn(dist_wtd_avg_c_tests));
const SadMxNx4Param x4d_c_tests[] = {
make_tuple(128, 128, &aom_sad128x128x4d_c, -1),
@@ -1251,7 +1258,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests));
#if HAVE_SSSE3
// Note: These are named sse2, but they are part of the ssse3 file and are
// only built and linked when ssse3 is enabled.
-const JntSadMxhParam jnt_sad_sse2_tests[] = {
+const DistWtdSadMxhParam dist_wtd_sad_sse2_tests[] = {
make_tuple(4, 4, &aom_sad4xh_sse2, -1),
make_tuple(4, 8, &aom_sad4xh_sse2, -1),
make_tuple(8, 4, &aom_sad8xh_sse2, -1),
@@ -1275,8 +1282,8 @@ const JntSadMxhParam jnt_sad_sse2_tests[] = {
make_tuple(16, 64, &aom_sad16xh_sse2, -1),
make_tuple(64, 16, &aom_sad64xh_sse2, -1),
};
-INSTANTIATE_TEST_CASE_P(SSE2, JntSADTest,
- ::testing::ValuesIn(jnt_sad_sse2_tests));
+INSTANTIATE_TEST_CASE_P(SSE2, DistWtdSADTest,
+ ::testing::ValuesIn(dist_wtd_sad_sse2_tests));
#endif // HAVE_SSSE3
@@ -1285,49 +1292,49 @@ INSTANTIATE_TEST_CASE_P(SSE2, JntSADTest,
#endif // HAVE_SSE3
#if HAVE_SSSE3
-const JntCompAvgParam jnt_comp_avg_ssse3_tests[] = {
- make_tuple(128, 128, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(128, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(64, 128, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(64, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(64, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(32, 64, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(32, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(32, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(16, 32, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(16, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(16, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(8, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(8, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(8, 4, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(4, 8, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(4, 4, &aom_jnt_comp_avg_pred_ssse3, -1),
- make_tuple(16, 16, &aom_jnt_comp_avg_pred_ssse3, -1),
+const DistWtdCompAvgParam dist_wtd_comp_avg_ssse3_tests[] = {
+ make_tuple(128, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(128, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(64, 128, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(64, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(64, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(32, 64, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(32, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(32, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 32, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(8, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(8, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(8, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(4, 8, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(4, 4, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
+ make_tuple(16, 16, &aom_dist_wtd_comp_avg_pred_ssse3, -1),
};
-INSTANTIATE_TEST_CASE_P(SSSE3, JntCompAvgTest,
- ::testing::ValuesIn(jnt_comp_avg_ssse3_tests));
-
-const JntSadMxNAvgParam jnt_avg_ssse3_tests[] = {
- make_tuple(128, 128, &aom_jnt_sad128x128_avg_ssse3, -1),
- make_tuple(128, 64, &aom_jnt_sad128x64_avg_ssse3, -1),
- make_tuple(64, 128, &aom_jnt_sad64x128_avg_ssse3, -1),
- make_tuple(64, 64, &aom_jnt_sad64x64_avg_ssse3, -1),
- make_tuple(64, 32, &aom_jnt_sad64x32_avg_ssse3, -1),
- make_tuple(32, 64, &aom_jnt_sad32x64_avg_ssse3, -1),
- make_tuple(32, 32, &aom_jnt_sad32x32_avg_ssse3, -1),
- make_tuple(32, 16, &aom_jnt_sad32x16_avg_ssse3, -1),
- make_tuple(16, 32, &aom_jnt_sad16x32_avg_ssse3, -1),
- make_tuple(16, 16, &aom_jnt_sad16x16_avg_ssse3, -1),
- make_tuple(16, 8, &aom_jnt_sad16x8_avg_ssse3, -1),
- make_tuple(8, 16, &aom_jnt_sad8x16_avg_ssse3, -1),
- make_tuple(8, 8, &aom_jnt_sad8x8_avg_ssse3, -1),
- make_tuple(8, 4, &aom_jnt_sad8x4_avg_ssse3, -1),
- make_tuple(4, 8, &aom_jnt_sad4x8_avg_ssse3, -1),
- make_tuple(4, 4, &aom_jnt_sad4x4_avg_ssse3, -1),
+INSTANTIATE_TEST_CASE_P(SSSE3, DistWtdCompAvgTest,
+ ::testing::ValuesIn(dist_wtd_comp_avg_ssse3_tests));
+
+const DistWtdSadMxNAvgParam dist_wtd_avg_ssse3_tests[] = {
+ make_tuple(128, 128, &aom_dist_wtd_sad128x128_avg_ssse3, -1),
+ make_tuple(128, 64, &aom_dist_wtd_sad128x64_avg_ssse3, -1),
+ make_tuple(64, 128, &aom_dist_wtd_sad64x128_avg_ssse3, -1),
+ make_tuple(64, 64, &aom_dist_wtd_sad64x64_avg_ssse3, -1),
+ make_tuple(64, 32, &aom_dist_wtd_sad64x32_avg_ssse3, -1),
+ make_tuple(32, 64, &aom_dist_wtd_sad32x64_avg_ssse3, -1),
+ make_tuple(32, 32, &aom_dist_wtd_sad32x32_avg_ssse3, -1),
+ make_tuple(32, 16, &aom_dist_wtd_sad32x16_avg_ssse3, -1),
+ make_tuple(16, 32, &aom_dist_wtd_sad16x32_avg_ssse3, -1),
+ make_tuple(16, 16, &aom_dist_wtd_sad16x16_avg_ssse3, -1),
+ make_tuple(16, 8, &aom_dist_wtd_sad16x8_avg_ssse3, -1),
+ make_tuple(8, 16, &aom_dist_wtd_sad8x16_avg_ssse3, -1),
+ make_tuple(8, 8, &aom_dist_wtd_sad8x8_avg_ssse3, -1),
+ make_tuple(8, 4, &aom_dist_wtd_sad8x4_avg_ssse3, -1),
+ make_tuple(4, 8, &aom_dist_wtd_sad4x8_avg_ssse3, -1),
+ make_tuple(4, 4, &aom_dist_wtd_sad4x4_avg_ssse3, -1),
};
-INSTANTIATE_TEST_CASE_P(SSSE3, JntSADavgTest,
- ::testing::ValuesIn(jnt_avg_ssse3_tests));
+INSTANTIATE_TEST_CASE_P(SSSE3, DistWtdSADavgTest,
+ ::testing::ValuesIn(dist_wtd_avg_ssse3_tests));
#endif // HAVE_SSSE3
#if HAVE_SSE4_1
diff --git a/libaom/test/sum_squares_test.cc b/libaom/test/sum_squares_test.cc
index cb518c8..f26a646 100644
--- a/libaom/test/sum_squares_test.cc
+++ b/libaom/test/sum_squares_test.cc
@@ -255,7 +255,7 @@ class SSETest : public ::testing::TestWithParam<SSETestParam> {
aom_free(src_);
aom_free(ref_);
}
- void RunTest(int isRandom, int width, int height);
+ void RunTest(int isRandom, int width, int height, int run_times);
void GenRandomData(int width, int height, int stride) {
uint16_t *pSrc = (uint16_t *)src_;
@@ -298,8 +298,9 @@ class SSETest : public ::testing::TestWithParam<SSETestParam> {
ACMRandom rnd_;
};
-void SSETest::RunTest(int isRandom, int width, int height) {
+void SSETest::RunTest(int isRandom, int width, int height, int run_times) {
int failed = 0;
+ aom_usec_timer ref_timer, test_timer;
for (int k = 0; k < 3; k++) {
int stride = 4 << rnd_(7); // Up to 256 stride
while (stride < width) { // Make sure it's valid
@@ -326,31 +327,58 @@ void SSETest::RunTest(int isRandom, int width, int height) {
pRef = CONVERT_TO_BYTEPTR(ref_);
}
res_ref = params_.ref_func(pSrc, stride, pRef, stride, width, height);
+ res_tst = params_.tst_func(pSrc, stride, pRef, stride, width, height);
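+    // In speed mode (run_times > 1), time run_times back-to-back calls of the
+    // C reference and the optimized function and print both elapsed times.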
+ if (run_times > 1) {
+ aom_usec_timer_start(&ref_timer);
+ for (int j = 0; j < run_times; j++) {
+ params_.ref_func(pSrc, stride, pRef, stride, width, height);
+ }
+ aom_usec_timer_mark(&ref_timer);
+ const int elapsed_time_c =
+ static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
- ASM_REGISTER_STATE_CHECK(
- res_tst = params_.tst_func(pSrc, stride, pRef, stride, width, height));
-
- if (!failed) {
- failed = res_ref != res_tst;
- EXPECT_EQ(res_ref, res_tst)
- << "Error:" << (isHbd_ ? "hbd " : " ") << k << " SSE Test [" << width
- << "x" << height << "] C output does not match optimized output.";
+ aom_usec_timer_start(&test_timer);
+ for (int j = 0; j < run_times; j++) {
+ params_.tst_func(pSrc, stride, pRef, stride, width, height);
+ }
+ aom_usec_timer_mark(&test_timer);
+ const int elapsed_time_simd =
+ static_cast<int>(aom_usec_timer_elapsed(&test_timer));
+
+ printf(
+ "c_time=%d \t simd_time=%d \t "
+ "gain=%d\n",
+ elapsed_time_c, elapsed_time_simd,
+ (elapsed_time_c / elapsed_time_simd));
+ } else {
+ if (!failed) {
+ failed = res_ref != res_tst;
+ EXPECT_EQ(res_ref, res_tst)
+ << "Error:" << (isHbd_ ? "hbd " : " ") << k << " SSE Test ["
+ << width << "x" << height
+ << "] C output does not match optimized output.";
+ }
}
}
}
TEST_P(SSETest, OperationCheck) {
for (int height = 4; height <= 128; height += 4) {
- RunTest(1, width_, height); // GenRandomData
+ RunTest(1, width_, height, 1); // GenRandomData
}
}
TEST_P(SSETest, ExtremeValues) {
for (int height = 4; height <= 128; height += 4) {
- RunTest(0, width_, height);
+ RunTest(0, width_, height, 1);
}
}
+TEST_P(SSETest, DISABLED_Speed) {
+ for (int height = 4; height <= 128; height += 4) {
+ RunTest(1, width_, height, 100);
+ }
+}
#if HAVE_SSE4_1
TestSSEFuncs sse_sse4[] = { TestSSEFuncs(&aom_sse_c, &aom_sse_sse4_1),
TestSSEFuncs(&aom_highbd_sse_c,
diff --git a/libaom/test/test-data.sha1 b/libaom/test/test-data.sha1
index 95342a8..bd63206 100644
--- a/libaom/test/test-data.sha1
+++ b/libaom/test/test-data.sha1
@@ -532,3 +532,9 @@ e94687eb0e90179b3800b6d5e11eb7e9bfb34eec *av1-1-b8-22-svc-L1T2.ivf
2bc12b16385ea14323bc79607fb8dfbd7edaf8ef *av1-1-b8-22-svc-L1T2.ivf.md5
32ef2f14ee9cb11a24a22934f4c065e926e5d236 *av1-1-b8-22-svc-L2T2.ivf
f476a10ff06d750129f8229755d51e17ff141b2a *av1-1-b8-22-svc-L2T2.ivf.md5
+afca5502a489692b0a3c120370b0f43b8fc572a1 *av1-1-b8-04-cdfupdate.ivf
+13b9423155a08d5e3a2fd9ae4a973bb046718cdf *av1-1-b8-04-cdfupdate.ivf.md5
+f064290d7fcd3b3de19020e8aec6c43c88d3a505 *av1-1-b8-05-mv.ivf
+bff316e63ded5559116bdc2fa4aa97ad7b1a1761 *av1-1-b8-05-mv.ivf.md5
+b48a717c7c003b8dd23c3c2caed1ac673380fdb3 *av1-1-b8-06-mfmv.ivf
+1424e3cb53e00eb56b94f4c725826274212c42b6 *av1-1-b8-06-mfmv.ivf.md5
diff --git a/libaom/test/test.cmake b/libaom/test/test.cmake
index 12f2319..a44737a 100644
--- a/libaom/test/test.cmake
+++ b/libaom/test/test.cmake
@@ -64,10 +64,14 @@ list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES
"${AOM_ROOT}/test/encode_test_driver.cc"
"${AOM_ROOT}/test/encode_test_driver.h"
"${AOM_ROOT}/test/end_to_end_test.cc"
+ "${AOM_ROOT}/test/fwd_kf_test.cc"
+ "${AOM_ROOT}/test/gf_max_pyr_height_test.cc"
+ "${AOM_ROOT}/test/rt_end_to_end_test.cc"
"${AOM_ROOT}/test/error_resilience_test.cc"
"${AOM_ROOT}/test/frame_size_tests.cc"
"${AOM_ROOT}/test/horz_superres_test.cc"
"${AOM_ROOT}/test/i420_video_source.h"
+ "${AOM_ROOT}/test/level_test.cc"
"${AOM_ROOT}/test/lossless_test.cc"
"${AOM_ROOT}/test/monochrome_test.cc"
"${AOM_ROOT}/test/qm_test.cc"
@@ -120,7 +124,8 @@ if(NOT BUILD_SHARED_LIBS)
"${AOM_ROOT}/test/film_grain_table_test.cc"
"${AOM_ROOT}/test/segment_binarization_sync.cc"
"${AOM_ROOT}/test/superframe_test.cc"
- "${AOM_ROOT}/test/tile_independence_test.cc")
+ "${AOM_ROOT}/test/tile_independence_test.cc"
+ "${AOM_ROOT}/test/yuv_temporal_filter_test.cc")
endif()
list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON
@@ -233,13 +238,6 @@ if(ENABLE_TESTS)
"make sure it's in your PATH.")
endif()
- if(MSVC) # Force static run time to avoid collisions with googletest.
- include("${AOM_ROOT}/build/cmake/msvc_runtime.cmake")
- if(BUILD_SHARED_LIBS)
- set(AOM_DISABLE_GTEST_CMAKE 1)
- endif()
- endif()
-
if(BUILD_SHARED_LIBS AND APPLE) # Silence an RPATH warning.
set(CMAKE_MACOSX_RPATH 1)
endif()
@@ -247,15 +245,16 @@ if(ENABLE_TESTS)
include_directories(
"${AOM_ROOT}/third_party/googletest/src/googletest/include")
- if(AOM_DISABLE_GTEST_CMAKE)
- include_directories("${AOM_ROOT}/third_party/googletest/src/googletest")
- add_library(
- gtest
- STATIC
- "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
+ include_directories("${AOM_ROOT}/third_party/googletest/src/googletest")
+ add_library(
+ aom_gtest
+ STATIC "${AOM_ROOT}/third_party/googletest/src/googletest/src/gtest-all.cc")
+ if(MSVC OR WIN32)
+ target_compile_definitions(aom_gtest PRIVATE GTEST_OS_WINDOWS=1)
+ elseif(CONFIG_MULTITHREAD AND CMAKE_USE_PTHREADS_INIT)
+ target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=1)
else()
- add_subdirectory("${AOM_ROOT}/third_party/googletest/src/googletest"
- EXCLUDE_FROM_ALL)
+ target_compile_definitions(aom_gtest PRIVATE GTEST_HAS_PTHREAD=0)
endif()
endif()
@@ -307,12 +306,12 @@ function(setup_aom_test_targets)
add_executable(test_intra_pred_speed ${AOM_TEST_INTRA_PRED_SPEED_SOURCES}
$<TARGET_OBJECTS:aom_common_app_util>)
target_link_libraries(test_intra_pred_speed ${AOM_LIB_LINK_TYPE} aom
- gtest)
+ aom_gtest)
list(APPEND AOM_APP_TARGETS test_intra_pred_speed)
endif()
endif()
- target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom gtest)
+ target_link_libraries(test_libaom ${AOM_LIB_LINK_TYPE} aom aom_gtest)
if(CONFIG_LIBYUV)
target_sources(test_libaom PRIVATE $<TARGET_OBJECTS:yuv>)
diff --git a/libaom/test/test_data_util.cmake b/libaom/test/test_data_util.cmake
index 6d684cb..c3c86aa 100644
--- a/libaom/test/test_data_util.cmake
+++ b/libaom/test/test_data_util.cmake
@@ -500,6 +500,12 @@ if(CONFIG_AV1_DECODER)
"av1-1-b8-03-sizeup.mkv.md5"
"av1-1-b8-03-sizedown.mkv"
"av1-1-b8-03-sizedown.mkv.md5"
+ "av1-1-b8-04-cdfupdate.ivf"
+ "av1-1-b8-04-cdfupdate.ivf.md5"
+ "av1-1-b8-05-mv.ivf"
+ "av1-1-b8-05-mv.ivf.md5"
+ "av1-1-b8-06-mfmv.ivf"
+ "av1-1-b8-06-mfmv.ivf.md5"
"av1-1-b8-22-svc-L2T1.ivf"
"av1-1-b8-22-svc-L2T1.ivf.md5"
"av1-1-b8-22-svc-L1T2.ivf"
diff --git a/libaom/test/test_vectors.cc b/libaom/test/test_vectors.cc
index d2f333f..d2cd901 100644
--- a/libaom/test/test_vectors.cc
+++ b/libaom/test/test_vectors.cc
@@ -16,125 +16,243 @@ namespace libaom_test {
#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
#if CONFIG_AV1_DECODER
-const char *const kAV1TestVectors[] = {
- "av1-1-b8-00-quantizer-00.ivf", "av1-1-b8-00-quantizer-01.ivf",
- "av1-1-b8-00-quantizer-02.ivf", "av1-1-b8-00-quantizer-03.ivf",
- "av1-1-b8-00-quantizer-04.ivf", "av1-1-b8-00-quantizer-05.ivf",
- "av1-1-b8-00-quantizer-06.ivf", "av1-1-b8-00-quantizer-07.ivf",
- "av1-1-b8-00-quantizer-08.ivf", "av1-1-b8-00-quantizer-09.ivf",
- "av1-1-b8-00-quantizer-10.ivf", "av1-1-b8-00-quantizer-11.ivf",
- "av1-1-b8-00-quantizer-12.ivf", "av1-1-b8-00-quantizer-13.ivf",
- "av1-1-b8-00-quantizer-14.ivf", "av1-1-b8-00-quantizer-15.ivf",
- "av1-1-b8-00-quantizer-16.ivf", "av1-1-b8-00-quantizer-17.ivf",
- "av1-1-b8-00-quantizer-18.ivf", "av1-1-b8-00-quantizer-19.ivf",
- "av1-1-b8-00-quantizer-20.ivf", "av1-1-b8-00-quantizer-21.ivf",
- "av1-1-b8-00-quantizer-22.ivf", "av1-1-b8-00-quantizer-23.ivf",
- "av1-1-b8-00-quantizer-24.ivf", "av1-1-b8-00-quantizer-25.ivf",
- "av1-1-b8-00-quantizer-26.ivf", "av1-1-b8-00-quantizer-27.ivf",
- "av1-1-b8-00-quantizer-28.ivf", "av1-1-b8-00-quantizer-29.ivf",
- "av1-1-b8-00-quantizer-30.ivf", "av1-1-b8-00-quantizer-31.ivf",
- "av1-1-b8-00-quantizer-32.ivf", "av1-1-b8-00-quantizer-33.ivf",
- "av1-1-b8-00-quantizer-34.ivf", "av1-1-b8-00-quantizer-35.ivf",
- "av1-1-b8-00-quantizer-36.ivf", "av1-1-b8-00-quantizer-37.ivf",
- "av1-1-b8-00-quantizer-38.ivf", "av1-1-b8-00-quantizer-39.ivf",
- "av1-1-b8-00-quantizer-40.ivf", "av1-1-b8-00-quantizer-41.ivf",
- "av1-1-b8-00-quantizer-42.ivf", "av1-1-b8-00-quantizer-43.ivf",
- "av1-1-b8-00-quantizer-44.ivf", "av1-1-b8-00-quantizer-45.ivf",
- "av1-1-b8-00-quantizer-46.ivf", "av1-1-b8-00-quantizer-47.ivf",
- "av1-1-b8-00-quantizer-48.ivf", "av1-1-b8-00-quantizer-49.ivf",
- "av1-1-b8-00-quantizer-50.ivf", "av1-1-b8-00-quantizer-51.ivf",
- "av1-1-b8-00-quantizer-52.ivf", "av1-1-b8-00-quantizer-53.ivf",
- "av1-1-b8-00-quantizer-54.ivf", "av1-1-b8-00-quantizer-55.ivf",
- "av1-1-b8-00-quantizer-56.ivf", "av1-1-b8-00-quantizer-57.ivf",
- "av1-1-b8-00-quantizer-58.ivf", "av1-1-b8-00-quantizer-59.ivf",
- "av1-1-b8-00-quantizer-60.ivf", "av1-1-b8-00-quantizer-61.ivf",
- "av1-1-b8-00-quantizer-62.ivf", "av1-1-b8-00-quantizer-63.ivf",
- "av1-1-b10-00-quantizer-00.ivf", "av1-1-b10-00-quantizer-01.ivf",
- "av1-1-b10-00-quantizer-02.ivf", "av1-1-b10-00-quantizer-03.ivf",
- "av1-1-b10-00-quantizer-04.ivf", "av1-1-b10-00-quantizer-05.ivf",
- "av1-1-b10-00-quantizer-06.ivf", "av1-1-b10-00-quantizer-07.ivf",
- "av1-1-b10-00-quantizer-08.ivf", "av1-1-b10-00-quantizer-09.ivf",
- "av1-1-b10-00-quantizer-10.ivf", "av1-1-b10-00-quantizer-11.ivf",
- "av1-1-b10-00-quantizer-12.ivf", "av1-1-b10-00-quantizer-13.ivf",
- "av1-1-b10-00-quantizer-14.ivf", "av1-1-b10-00-quantizer-15.ivf",
- "av1-1-b10-00-quantizer-16.ivf", "av1-1-b10-00-quantizer-17.ivf",
- "av1-1-b10-00-quantizer-18.ivf", "av1-1-b10-00-quantizer-19.ivf",
- "av1-1-b10-00-quantizer-20.ivf", "av1-1-b10-00-quantizer-21.ivf",
- "av1-1-b10-00-quantizer-22.ivf", "av1-1-b10-00-quantizer-23.ivf",
- "av1-1-b10-00-quantizer-24.ivf", "av1-1-b10-00-quantizer-25.ivf",
- "av1-1-b10-00-quantizer-26.ivf", "av1-1-b10-00-quantizer-27.ivf",
- "av1-1-b10-00-quantizer-28.ivf", "av1-1-b10-00-quantizer-29.ivf",
- "av1-1-b10-00-quantizer-30.ivf", "av1-1-b10-00-quantizer-31.ivf",
- "av1-1-b10-00-quantizer-32.ivf", "av1-1-b10-00-quantizer-33.ivf",
- "av1-1-b10-00-quantizer-34.ivf", "av1-1-b10-00-quantizer-35.ivf",
- "av1-1-b10-00-quantizer-36.ivf", "av1-1-b10-00-quantizer-37.ivf",
- "av1-1-b10-00-quantizer-38.ivf", "av1-1-b10-00-quantizer-39.ivf",
- "av1-1-b10-00-quantizer-40.ivf", "av1-1-b10-00-quantizer-41.ivf",
- "av1-1-b10-00-quantizer-42.ivf", "av1-1-b10-00-quantizer-43.ivf",
- "av1-1-b10-00-quantizer-44.ivf", "av1-1-b10-00-quantizer-45.ivf",
- "av1-1-b10-00-quantizer-46.ivf", "av1-1-b10-00-quantizer-47.ivf",
- "av1-1-b10-00-quantizer-48.ivf", "av1-1-b10-00-quantizer-49.ivf",
- "av1-1-b10-00-quantizer-50.ivf", "av1-1-b10-00-quantizer-51.ivf",
- "av1-1-b10-00-quantizer-52.ivf", "av1-1-b10-00-quantizer-53.ivf",
- "av1-1-b10-00-quantizer-54.ivf", "av1-1-b10-00-quantizer-55.ivf",
- "av1-1-b10-00-quantizer-56.ivf", "av1-1-b10-00-quantizer-57.ivf",
- "av1-1-b10-00-quantizer-58.ivf", "av1-1-b10-00-quantizer-59.ivf",
- "av1-1-b10-00-quantizer-60.ivf", "av1-1-b10-00-quantizer-61.ivf",
- "av1-1-b10-00-quantizer-62.ivf", "av1-1-b10-00-quantizer-63.ivf",
- "av1-1-b8-01-size-16x16.ivf", "av1-1-b8-01-size-16x18.ivf",
- "av1-1-b8-01-size-16x32.ivf", "av1-1-b8-01-size-16x34.ivf",
- "av1-1-b8-01-size-16x64.ivf", "av1-1-b8-01-size-16x66.ivf",
- "av1-1-b8-01-size-18x16.ivf", "av1-1-b8-01-size-18x18.ivf",
- "av1-1-b8-01-size-18x32.ivf", "av1-1-b8-01-size-18x34.ivf",
- "av1-1-b8-01-size-18x64.ivf", "av1-1-b8-01-size-18x66.ivf",
- "av1-1-b8-01-size-196x196.ivf", "av1-1-b8-01-size-196x198.ivf",
- "av1-1-b8-01-size-196x200.ivf", "av1-1-b8-01-size-196x202.ivf",
- "av1-1-b8-01-size-196x208.ivf", "av1-1-b8-01-size-196x210.ivf",
- "av1-1-b8-01-size-196x224.ivf", "av1-1-b8-01-size-196x226.ivf",
- "av1-1-b8-01-size-198x196.ivf", "av1-1-b8-01-size-198x198.ivf",
- "av1-1-b8-01-size-198x200.ivf", "av1-1-b8-01-size-198x202.ivf",
- "av1-1-b8-01-size-198x208.ivf", "av1-1-b8-01-size-198x210.ivf",
- "av1-1-b8-01-size-198x224.ivf", "av1-1-b8-01-size-198x226.ivf",
- "av1-1-b8-01-size-200x196.ivf", "av1-1-b8-01-size-200x198.ivf",
- "av1-1-b8-01-size-200x200.ivf", "av1-1-b8-01-size-200x202.ivf",
- "av1-1-b8-01-size-200x208.ivf", "av1-1-b8-01-size-200x210.ivf",
- "av1-1-b8-01-size-200x224.ivf", "av1-1-b8-01-size-200x226.ivf",
- "av1-1-b8-01-size-202x196.ivf", "av1-1-b8-01-size-202x198.ivf",
- "av1-1-b8-01-size-202x200.ivf", "av1-1-b8-01-size-202x202.ivf",
- "av1-1-b8-01-size-202x208.ivf", "av1-1-b8-01-size-202x210.ivf",
- "av1-1-b8-01-size-202x224.ivf", "av1-1-b8-01-size-202x226.ivf",
- "av1-1-b8-01-size-208x196.ivf", "av1-1-b8-01-size-208x198.ivf",
- "av1-1-b8-01-size-208x200.ivf", "av1-1-b8-01-size-208x202.ivf",
- "av1-1-b8-01-size-208x208.ivf", "av1-1-b8-01-size-208x210.ivf",
- "av1-1-b8-01-size-208x224.ivf", "av1-1-b8-01-size-208x226.ivf",
- "av1-1-b8-01-size-210x196.ivf", "av1-1-b8-01-size-210x198.ivf",
- "av1-1-b8-01-size-210x200.ivf", "av1-1-b8-01-size-210x202.ivf",
- "av1-1-b8-01-size-210x208.ivf", "av1-1-b8-01-size-210x210.ivf",
- "av1-1-b8-01-size-210x224.ivf", "av1-1-b8-01-size-210x226.ivf",
- "av1-1-b8-01-size-224x196.ivf", "av1-1-b8-01-size-224x198.ivf",
- "av1-1-b8-01-size-224x200.ivf", "av1-1-b8-01-size-224x202.ivf",
- "av1-1-b8-01-size-224x208.ivf", "av1-1-b8-01-size-224x210.ivf",
- "av1-1-b8-01-size-224x224.ivf", "av1-1-b8-01-size-224x226.ivf",
- "av1-1-b8-01-size-226x196.ivf", "av1-1-b8-01-size-226x198.ivf",
- "av1-1-b8-01-size-226x200.ivf", "av1-1-b8-01-size-226x202.ivf",
- "av1-1-b8-01-size-226x208.ivf", "av1-1-b8-01-size-226x210.ivf",
- "av1-1-b8-01-size-226x224.ivf", "av1-1-b8-01-size-226x226.ivf",
- "av1-1-b8-01-size-32x16.ivf", "av1-1-b8-01-size-32x18.ivf",
- "av1-1-b8-01-size-32x32.ivf", "av1-1-b8-01-size-32x34.ivf",
- "av1-1-b8-01-size-32x64.ivf", "av1-1-b8-01-size-32x66.ivf",
- "av1-1-b8-01-size-34x16.ivf", "av1-1-b8-01-size-34x18.ivf",
- "av1-1-b8-01-size-34x32.ivf", "av1-1-b8-01-size-34x34.ivf",
- "av1-1-b8-01-size-34x64.ivf", "av1-1-b8-01-size-34x66.ivf",
- "av1-1-b8-01-size-64x16.ivf", "av1-1-b8-01-size-64x18.ivf",
- "av1-1-b8-01-size-64x32.ivf", "av1-1-b8-01-size-64x34.ivf",
- "av1-1-b8-01-size-64x64.ivf", "av1-1-b8-01-size-64x66.ivf",
- "av1-1-b8-01-size-66x16.ivf", "av1-1-b8-01-size-66x18.ivf",
- "av1-1-b8-01-size-66x32.ivf", "av1-1-b8-01-size-66x34.ivf",
- "av1-1-b8-01-size-66x64.ivf", "av1-1-b8-01-size-66x66.ivf",
- "av1-1-b8-02-allintra.ivf", "av1-1-b8-03-sizedown.mkv",
- "av1-1-b8-03-sizeup.mkv", "av1-1-b8-22-svc-L1T2.ivf",
- "av1-1-b8-22-svc-L2T1.ivf", "av1-1-b8-22-svc-L2T2.ivf"
-};
+const char *const kAV1TestVectors[] = { "av1-1-b8-00-quantizer-00.ivf",
+ "av1-1-b8-00-quantizer-01.ivf",
+ "av1-1-b8-00-quantizer-02.ivf",
+ "av1-1-b8-00-quantizer-03.ivf",
+ "av1-1-b8-00-quantizer-04.ivf",
+ "av1-1-b8-00-quantizer-05.ivf",
+ "av1-1-b8-00-quantizer-06.ivf",
+ "av1-1-b8-00-quantizer-07.ivf",
+ "av1-1-b8-00-quantizer-08.ivf",
+ "av1-1-b8-00-quantizer-09.ivf",
+ "av1-1-b8-00-quantizer-10.ivf",
+ "av1-1-b8-00-quantizer-11.ivf",
+ "av1-1-b8-00-quantizer-12.ivf",
+ "av1-1-b8-00-quantizer-13.ivf",
+ "av1-1-b8-00-quantizer-14.ivf",
+ "av1-1-b8-00-quantizer-15.ivf",
+ "av1-1-b8-00-quantizer-16.ivf",
+ "av1-1-b8-00-quantizer-17.ivf",
+ "av1-1-b8-00-quantizer-18.ivf",
+ "av1-1-b8-00-quantizer-19.ivf",
+ "av1-1-b8-00-quantizer-20.ivf",
+ "av1-1-b8-00-quantizer-21.ivf",
+ "av1-1-b8-00-quantizer-22.ivf",
+ "av1-1-b8-00-quantizer-23.ivf",
+ "av1-1-b8-00-quantizer-24.ivf",
+ "av1-1-b8-00-quantizer-25.ivf",
+ "av1-1-b8-00-quantizer-26.ivf",
+ "av1-1-b8-00-quantizer-27.ivf",
+ "av1-1-b8-00-quantizer-28.ivf",
+ "av1-1-b8-00-quantizer-29.ivf",
+ "av1-1-b8-00-quantizer-30.ivf",
+ "av1-1-b8-00-quantizer-31.ivf",
+ "av1-1-b8-00-quantizer-32.ivf",
+ "av1-1-b8-00-quantizer-33.ivf",
+ "av1-1-b8-00-quantizer-34.ivf",
+ "av1-1-b8-00-quantizer-35.ivf",
+ "av1-1-b8-00-quantizer-36.ivf",
+ "av1-1-b8-00-quantizer-37.ivf",
+ "av1-1-b8-00-quantizer-38.ivf",
+ "av1-1-b8-00-quantizer-39.ivf",
+ "av1-1-b8-00-quantizer-40.ivf",
+ "av1-1-b8-00-quantizer-41.ivf",
+ "av1-1-b8-00-quantizer-42.ivf",
+ "av1-1-b8-00-quantizer-43.ivf",
+ "av1-1-b8-00-quantizer-44.ivf",
+ "av1-1-b8-00-quantizer-45.ivf",
+ "av1-1-b8-00-quantizer-46.ivf",
+ "av1-1-b8-00-quantizer-47.ivf",
+ "av1-1-b8-00-quantizer-48.ivf",
+ "av1-1-b8-00-quantizer-49.ivf",
+ "av1-1-b8-00-quantizer-50.ivf",
+ "av1-1-b8-00-quantizer-51.ivf",
+ "av1-1-b8-00-quantizer-52.ivf",
+ "av1-1-b8-00-quantizer-53.ivf",
+ "av1-1-b8-00-quantizer-54.ivf",
+ "av1-1-b8-00-quantizer-55.ivf",
+ "av1-1-b8-00-quantizer-56.ivf",
+ "av1-1-b8-00-quantizer-57.ivf",
+ "av1-1-b8-00-quantizer-58.ivf",
+ "av1-1-b8-00-quantizer-59.ivf",
+ "av1-1-b8-00-quantizer-60.ivf",
+ "av1-1-b8-00-quantizer-61.ivf",
+ "av1-1-b8-00-quantizer-62.ivf",
+ "av1-1-b8-00-quantizer-63.ivf",
+ "av1-1-b10-00-quantizer-00.ivf",
+ "av1-1-b10-00-quantizer-01.ivf",
+ "av1-1-b10-00-quantizer-02.ivf",
+ "av1-1-b10-00-quantizer-03.ivf",
+ "av1-1-b10-00-quantizer-04.ivf",
+ "av1-1-b10-00-quantizer-05.ivf",
+ "av1-1-b10-00-quantizer-06.ivf",
+ "av1-1-b10-00-quantizer-07.ivf",
+ "av1-1-b10-00-quantizer-08.ivf",
+ "av1-1-b10-00-quantizer-09.ivf",
+ "av1-1-b10-00-quantizer-10.ivf",
+ "av1-1-b10-00-quantizer-11.ivf",
+ "av1-1-b10-00-quantizer-12.ivf",
+ "av1-1-b10-00-quantizer-13.ivf",
+ "av1-1-b10-00-quantizer-14.ivf",
+ "av1-1-b10-00-quantizer-15.ivf",
+ "av1-1-b10-00-quantizer-16.ivf",
+ "av1-1-b10-00-quantizer-17.ivf",
+ "av1-1-b10-00-quantizer-18.ivf",
+ "av1-1-b10-00-quantizer-19.ivf",
+ "av1-1-b10-00-quantizer-20.ivf",
+ "av1-1-b10-00-quantizer-21.ivf",
+ "av1-1-b10-00-quantizer-22.ivf",
+ "av1-1-b10-00-quantizer-23.ivf",
+ "av1-1-b10-00-quantizer-24.ivf",
+ "av1-1-b10-00-quantizer-25.ivf",
+ "av1-1-b10-00-quantizer-26.ivf",
+ "av1-1-b10-00-quantizer-27.ivf",
+ "av1-1-b10-00-quantizer-28.ivf",
+ "av1-1-b10-00-quantizer-29.ivf",
+ "av1-1-b10-00-quantizer-30.ivf",
+ "av1-1-b10-00-quantizer-31.ivf",
+ "av1-1-b10-00-quantizer-32.ivf",
+ "av1-1-b10-00-quantizer-33.ivf",
+ "av1-1-b10-00-quantizer-34.ivf",
+ "av1-1-b10-00-quantizer-35.ivf",
+ "av1-1-b10-00-quantizer-36.ivf",
+ "av1-1-b10-00-quantizer-37.ivf",
+ "av1-1-b10-00-quantizer-38.ivf",
+ "av1-1-b10-00-quantizer-39.ivf",
+ "av1-1-b10-00-quantizer-40.ivf",
+ "av1-1-b10-00-quantizer-41.ivf",
+ "av1-1-b10-00-quantizer-42.ivf",
+ "av1-1-b10-00-quantizer-43.ivf",
+ "av1-1-b10-00-quantizer-44.ivf",
+ "av1-1-b10-00-quantizer-45.ivf",
+ "av1-1-b10-00-quantizer-46.ivf",
+ "av1-1-b10-00-quantizer-47.ivf",
+ "av1-1-b10-00-quantizer-48.ivf",
+ "av1-1-b10-00-quantizer-49.ivf",
+ "av1-1-b10-00-quantizer-50.ivf",
+ "av1-1-b10-00-quantizer-51.ivf",
+ "av1-1-b10-00-quantizer-52.ivf",
+ "av1-1-b10-00-quantizer-53.ivf",
+ "av1-1-b10-00-quantizer-54.ivf",
+ "av1-1-b10-00-quantizer-55.ivf",
+ "av1-1-b10-00-quantizer-56.ivf",
+ "av1-1-b10-00-quantizer-57.ivf",
+ "av1-1-b10-00-quantizer-58.ivf",
+ "av1-1-b10-00-quantizer-59.ivf",
+ "av1-1-b10-00-quantizer-60.ivf",
+ "av1-1-b10-00-quantizer-61.ivf",
+ "av1-1-b10-00-quantizer-62.ivf",
+ "av1-1-b10-00-quantizer-63.ivf",
+ "av1-1-b8-01-size-16x16.ivf",
+ "av1-1-b8-01-size-16x18.ivf",
+ "av1-1-b8-01-size-16x32.ivf",
+ "av1-1-b8-01-size-16x34.ivf",
+ "av1-1-b8-01-size-16x64.ivf",
+ "av1-1-b8-01-size-16x66.ivf",
+ "av1-1-b8-01-size-18x16.ivf",
+ "av1-1-b8-01-size-18x18.ivf",
+ "av1-1-b8-01-size-18x32.ivf",
+ "av1-1-b8-01-size-18x34.ivf",
+ "av1-1-b8-01-size-18x64.ivf",
+ "av1-1-b8-01-size-18x66.ivf",
+ "av1-1-b8-01-size-196x196.ivf",
+ "av1-1-b8-01-size-196x198.ivf",
+ "av1-1-b8-01-size-196x200.ivf",
+ "av1-1-b8-01-size-196x202.ivf",
+ "av1-1-b8-01-size-196x208.ivf",
+ "av1-1-b8-01-size-196x210.ivf",
+ "av1-1-b8-01-size-196x224.ivf",
+ "av1-1-b8-01-size-196x226.ivf",
+ "av1-1-b8-01-size-198x196.ivf",
+ "av1-1-b8-01-size-198x198.ivf",
+ "av1-1-b8-01-size-198x200.ivf",
+ "av1-1-b8-01-size-198x202.ivf",
+ "av1-1-b8-01-size-198x208.ivf",
+ "av1-1-b8-01-size-198x210.ivf",
+ "av1-1-b8-01-size-198x224.ivf",
+ "av1-1-b8-01-size-198x226.ivf",
+ "av1-1-b8-01-size-200x196.ivf",
+ "av1-1-b8-01-size-200x198.ivf",
+ "av1-1-b8-01-size-200x200.ivf",
+ "av1-1-b8-01-size-200x202.ivf",
+ "av1-1-b8-01-size-200x208.ivf",
+ "av1-1-b8-01-size-200x210.ivf",
+ "av1-1-b8-01-size-200x224.ivf",
+ "av1-1-b8-01-size-200x226.ivf",
+ "av1-1-b8-01-size-202x196.ivf",
+ "av1-1-b8-01-size-202x198.ivf",
+ "av1-1-b8-01-size-202x200.ivf",
+ "av1-1-b8-01-size-202x202.ivf",
+ "av1-1-b8-01-size-202x208.ivf",
+ "av1-1-b8-01-size-202x210.ivf",
+ "av1-1-b8-01-size-202x224.ivf",
+ "av1-1-b8-01-size-202x226.ivf",
+ "av1-1-b8-01-size-208x196.ivf",
+ "av1-1-b8-01-size-208x198.ivf",
+ "av1-1-b8-01-size-208x200.ivf",
+ "av1-1-b8-01-size-208x202.ivf",
+ "av1-1-b8-01-size-208x208.ivf",
+ "av1-1-b8-01-size-208x210.ivf",
+ "av1-1-b8-01-size-208x224.ivf",
+ "av1-1-b8-01-size-208x226.ivf",
+ "av1-1-b8-01-size-210x196.ivf",
+ "av1-1-b8-01-size-210x198.ivf",
+ "av1-1-b8-01-size-210x200.ivf",
+ "av1-1-b8-01-size-210x202.ivf",
+ "av1-1-b8-01-size-210x208.ivf",
+ "av1-1-b8-01-size-210x210.ivf",
+ "av1-1-b8-01-size-210x224.ivf",
+ "av1-1-b8-01-size-210x226.ivf",
+ "av1-1-b8-01-size-224x196.ivf",
+ "av1-1-b8-01-size-224x198.ivf",
+ "av1-1-b8-01-size-224x200.ivf",
+ "av1-1-b8-01-size-224x202.ivf",
+ "av1-1-b8-01-size-224x208.ivf",
+ "av1-1-b8-01-size-224x210.ivf",
+ "av1-1-b8-01-size-224x224.ivf",
+ "av1-1-b8-01-size-224x226.ivf",
+ "av1-1-b8-01-size-226x196.ivf",
+ "av1-1-b8-01-size-226x198.ivf",
+ "av1-1-b8-01-size-226x200.ivf",
+ "av1-1-b8-01-size-226x202.ivf",
+ "av1-1-b8-01-size-226x208.ivf",
+ "av1-1-b8-01-size-226x210.ivf",
+ "av1-1-b8-01-size-226x224.ivf",
+ "av1-1-b8-01-size-226x226.ivf",
+ "av1-1-b8-01-size-32x16.ivf",
+ "av1-1-b8-01-size-32x18.ivf",
+ "av1-1-b8-01-size-32x32.ivf",
+ "av1-1-b8-01-size-32x34.ivf",
+ "av1-1-b8-01-size-32x64.ivf",
+ "av1-1-b8-01-size-32x66.ivf",
+ "av1-1-b8-01-size-34x16.ivf",
+ "av1-1-b8-01-size-34x18.ivf",
+ "av1-1-b8-01-size-34x32.ivf",
+ "av1-1-b8-01-size-34x34.ivf",
+ "av1-1-b8-01-size-34x64.ivf",
+ "av1-1-b8-01-size-34x66.ivf",
+ "av1-1-b8-01-size-64x16.ivf",
+ "av1-1-b8-01-size-64x18.ivf",
+ "av1-1-b8-01-size-64x32.ivf",
+ "av1-1-b8-01-size-64x34.ivf",
+ "av1-1-b8-01-size-64x64.ivf",
+ "av1-1-b8-01-size-64x66.ivf",
+ "av1-1-b8-01-size-66x16.ivf",
+ "av1-1-b8-01-size-66x18.ivf",
+ "av1-1-b8-01-size-66x32.ivf",
+ "av1-1-b8-01-size-66x34.ivf",
+ "av1-1-b8-01-size-66x64.ivf",
+ "av1-1-b8-01-size-66x66.ivf",
+ "av1-1-b8-02-allintra.ivf",
+ "av1-1-b8-03-sizedown.mkv",
+ "av1-1-b8-03-sizeup.mkv",
+ "av1-1-b8-04-cdfupdate.ivf",
+ "av1-1-b8-05-mv.ivf",
+ "av1-1-b8-06-mfmv.ivf",
+ "av1-1-b8-22-svc-L1T2.ivf",
+ "av1-1-b8-22-svc-L2T1.ivf",
+ "av1-1-b8-22-svc-L2T2.ivf" };
const int kNumAV1TestVectors = NELEMENTS(kAV1TestVectors);
#endif // CONFIG_AV1_DECODER
diff --git a/libaom/test/variance_test.cc b/libaom/test/variance_test.cc
index 0df314b..1942de0 100644
--- a/libaom/test/variance_test.cc
+++ b/libaom/test/variance_test.cc
@@ -43,10 +43,10 @@ typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride);
typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
-typedef unsigned int (*JntSubpixAvgVarMxNFunc)(
+typedef unsigned int (*DistWtdSubpixAvgVarMxNFunc)(
const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
int b_stride, uint32_t *sse, const uint8_t *second_pred,
- const JNT_COMP_PARAMS *jcp_param);
+ const DIST_WTD_COMP_PARAMS *jcp_param);
typedef uint32_t (*ObmcSubpelVarFunc)(const uint8_t *pre, int pre_stride,
int xoffset, int yoffset,
const int32_t *wsrc, const int32_t *mask,
@@ -216,10 +216,10 @@ static uint32_t subpel_avg_variance_ref(const uint8_t *ref, const uint8_t *src,
return static_cast<uint32_t>(sse - ((se * se) >> (l2w + l2h)));
}
-static uint32_t jnt_subpel_avg_variance_ref(
+static uint32_t dist_wtd_subpel_avg_variance_ref(
const uint8_t *ref, const uint8_t *src, const uint8_t *second_pred, int l2w,
int l2h, int xoff, int yoff, uint32_t *sse_ptr, bool use_high_bit_depth,
- aom_bit_depth_t bit_depth, JNT_COMP_PARAMS *jcp_param) {
+ aom_bit_depth_t bit_depth, DIST_WTD_COMP_PARAMS *jcp_param) {
int64_t se = 0;
uint64_t sse = 0;
const int w = 1 << l2w;
@@ -703,13 +703,14 @@ class SubpelVarianceTest
protected:
void RefTest();
void ExtremeRefTest();
+ void SpeedTest();
ACMRandom rnd_;
uint8_t *src_;
uint8_t *ref_;
uint8_t *sec_;
TestParams<FunctionType> params_;
- JNT_COMP_PARAMS jcp_param_;
+ DIST_WTD_COMP_PARAMS jcp_param_;
// some relay helpers
bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
@@ -785,6 +786,41 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
}
}
+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::SpeedTest() {
+ if (!use_high_bit_depth()) {
+ for (int j = 0; j < block_size(); j++) {
+ src_[j] = rnd_.Rand8();
+ }
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ ref_[j] = rnd_.Rand8();
+ }
+ } else {
+ for (int j = 0; j < block_size(); j++) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ }
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+ }
+ }
+
+ unsigned int sse1;
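+  // Scale the iteration count inversely with block size so each block shape
+  // performs a comparable amount of pixel work.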
+ int run_time = 1000000000 / block_size();
+ aom_usec_timer timer;
+
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < run_time; ++i) {
+ int x = rnd_(8);
+ int y = rnd_(8);
+ params_.func(ref_, width() + 1, x, y, src_, width(), &sse1);
+ }
+ aom_usec_timer_mark(&timer);
+
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ printf("sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
+ params_.bit_depth, elapsed_time);
+}
+
template <>
void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
for (int x = 0; x < 8; ++x) {
@@ -820,7 +856,7 @@ void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
}
template <>
-void SubpelVarianceTest<JntSubpixAvgVarMxNFunc>::RefTest() {
+void SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>::RefTest() {
for (int x = 0; x < 8; ++x) {
for (int y = 0; y < 8; ++y) {
if (!use_high_bit_depth()) {
@@ -849,7 +885,7 @@ void SubpelVarianceTest<JntSubpixAvgVarMxNFunc>::RefTest() {
ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 0, x, y,
src_, width(), &sse1,
sec_, &jcp_param_));
- var2 = jnt_subpel_avg_variance_ref(
+ var2 = dist_wtd_subpel_avg_variance_ref(
ref_, src_, sec_, params_.log2width, params_.log2height, x, y,
&sse2, use_high_bit_depth(), params_.bit_depth, &jcp_param_);
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
@@ -1022,7 +1058,8 @@ typedef MainTestClass<VarianceMxNFunc> AvxMseTest;
typedef MainTestClass<VarianceMxNFunc> AvxVarianceTest;
typedef SubpelVarianceTest<SubpixVarMxNFunc> AvxSubpelVarianceTest;
typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> AvxSubpelAvgVarianceTest;
-typedef SubpelVarianceTest<JntSubpixAvgVarMxNFunc> AvxJntSubpelAvgVarianceTest;
+typedef SubpelVarianceTest<DistWtdSubpixAvgVarMxNFunc>
+ AvxDistWtdSubpelAvgVarianceTest;
typedef ObmcVarianceTest<ObmcSubpelVarFunc> AvxObmcSubpelVarianceTest;
TEST_P(AvxSseTest, RefSse) { RefTestSse(); }
@@ -1039,7 +1076,7 @@ TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
-TEST_P(AvxJntSubpelAvgVarianceTest, Ref) { RefTest(); }
+TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(AvxObmcSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(AvxObmcSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
@@ -1121,36 +1158,35 @@ INSTANTIATE_TEST_CASE_P(
SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_c, 0),
SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_c, 0)));
-typedef TestParams<JntSubpixAvgVarMxNFunc> JntSubpelAvgVarianceParams;
+typedef TestParams<DistWtdSubpixAvgVarMxNFunc> DistWtdSubpelAvgVarianceParams;
INSTANTIATE_TEST_CASE_P(
- C, AvxJntSubpelAvgVarianceTest,
- ::testing::Values(
- JntSubpelAvgVarianceParams(6, 6, &aom_jnt_sub_pixel_avg_variance64x64_c,
- 0),
- JntSubpelAvgVarianceParams(6, 5, &aom_jnt_sub_pixel_avg_variance64x32_c,
- 0),
- JntSubpelAvgVarianceParams(5, 6, &aom_jnt_sub_pixel_avg_variance32x64_c,
- 0),
- JntSubpelAvgVarianceParams(5, 5, &aom_jnt_sub_pixel_avg_variance32x32_c,
- 0),
- JntSubpelAvgVarianceParams(5, 4, &aom_jnt_sub_pixel_avg_variance32x16_c,
- 0),
- JntSubpelAvgVarianceParams(4, 5, &aom_jnt_sub_pixel_avg_variance16x32_c,
- 0),
- JntSubpelAvgVarianceParams(4, 4, &aom_jnt_sub_pixel_avg_variance16x16_c,
- 0),
- JntSubpelAvgVarianceParams(4, 3, &aom_jnt_sub_pixel_avg_variance16x8_c,
- 0),
- JntSubpelAvgVarianceParams(3, 4, &aom_jnt_sub_pixel_avg_variance8x16_c,
- 0),
- JntSubpelAvgVarianceParams(3, 3, &aom_jnt_sub_pixel_avg_variance8x8_c,
- 0),
- JntSubpelAvgVarianceParams(3, 2, &aom_jnt_sub_pixel_avg_variance8x4_c,
- 0),
- JntSubpelAvgVarianceParams(2, 3, &aom_jnt_sub_pixel_avg_variance4x8_c,
- 0),
- JntSubpelAvgVarianceParams(2, 2, &aom_jnt_sub_pixel_avg_variance4x4_c,
- 0)));
+ C, AvxDistWtdSubpelAvgVarianceTest,
+ ::testing::Values(DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_c, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_c, 0)));
INSTANTIATE_TEST_CASE_P(
C, AvxObmcSubpelVarianceTest,
@@ -1188,6 +1224,7 @@ TEST_P(AvxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
TEST_P(AvxHBDVarianceTest, DISABLED_Speed) { SpeedTest(); }
TEST_P(AvxHBDSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(AvxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxHBDSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
TEST_P(AvxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
/* TODO(debargha): This test does not support the highbd version
@@ -1677,6 +1714,9 @@ INSTANTIATE_TEST_CASE_P(AVX2, AvxHBDVarianceTest,
#endif // HAVE_AVX2
const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
+ SubpelVarianceParams(7, 7, &aom_highbd_12_sub_pixel_variance128x128_sse2, 12),
+ SubpelVarianceParams(7, 6, &aom_highbd_12_sub_pixel_variance128x64_sse2, 12),
+ SubpelVarianceParams(6, 7, &aom_highbd_12_sub_pixel_variance64x128_sse2, 12),
SubpelVarianceParams(6, 6, &aom_highbd_12_sub_pixel_variance64x64_sse2, 12),
SubpelVarianceParams(6, 5, &aom_highbd_12_sub_pixel_variance64x32_sse2, 12),
SubpelVarianceParams(5, 6, &aom_highbd_12_sub_pixel_variance32x64_sse2, 12),
@@ -1688,6 +1728,9 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
SubpelVarianceParams(3, 4, &aom_highbd_12_sub_pixel_variance8x16_sse2, 12),
SubpelVarianceParams(3, 3, &aom_highbd_12_sub_pixel_variance8x8_sse2, 12),
SubpelVarianceParams(3, 2, &aom_highbd_12_sub_pixel_variance8x4_sse2, 12),
+ SubpelVarianceParams(7, 7, &aom_highbd_10_sub_pixel_variance128x128_sse2, 10),
+ SubpelVarianceParams(7, 6, &aom_highbd_10_sub_pixel_variance128x64_sse2, 10),
+ SubpelVarianceParams(6, 7, &aom_highbd_10_sub_pixel_variance64x128_sse2, 10),
SubpelVarianceParams(6, 6, &aom_highbd_10_sub_pixel_variance64x64_sse2, 10),
SubpelVarianceParams(6, 5, &aom_highbd_10_sub_pixel_variance64x32_sse2, 10),
SubpelVarianceParams(5, 6, &aom_highbd_10_sub_pixel_variance32x64_sse2, 10),
@@ -1699,6 +1742,9 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
SubpelVarianceParams(3, 4, &aom_highbd_10_sub_pixel_variance8x16_sse2, 10),
SubpelVarianceParams(3, 3, &aom_highbd_10_sub_pixel_variance8x8_sse2, 10),
SubpelVarianceParams(3, 2, &aom_highbd_10_sub_pixel_variance8x4_sse2, 10),
+ SubpelVarianceParams(7, 7, &aom_highbd_8_sub_pixel_variance128x128_sse2, 8),
+ SubpelVarianceParams(7, 6, &aom_highbd_8_sub_pixel_variance128x64_sse2, 8),
+ SubpelVarianceParams(6, 7, &aom_highbd_8_sub_pixel_variance64x128_sse2, 8),
SubpelVarianceParams(6, 6, &aom_highbd_8_sub_pixel_variance64x64_sse2, 8),
SubpelVarianceParams(6, 5, &aom_highbd_8_sub_pixel_variance64x32_sse2, 8),
SubpelVarianceParams(5, 6, &aom_highbd_8_sub_pixel_variance32x64_sse2, 8),
@@ -1711,7 +1757,6 @@ const SubpelVarianceParams kArrayHBDSubpelVariance_sse2[] = {
SubpelVarianceParams(3, 3, &aom_highbd_8_sub_pixel_variance8x8_sse2, 8),
SubpelVarianceParams(3, 2, &aom_highbd_8_sub_pixel_variance8x4_sse2, 8)
};
-
INSTANTIATE_TEST_CASE_P(SSE2, AvxHBDSubpelVarianceTest,
::testing::ValuesIn(kArrayHBDSubpelVariance_sse2));
@@ -1840,44 +1885,34 @@ INSTANTIATE_TEST_CASE_P(
0)));
INSTANTIATE_TEST_CASE_P(
- SSSE3, AvxJntSubpelAvgVarianceTest,
+ SSSE3, AvxDistWtdSubpelAvgVarianceTest,
::testing::Values(
- JntSubpelAvgVarianceParams(6, 6,
- &aom_jnt_sub_pixel_avg_variance64x64_ssse3,
- 0),
- JntSubpelAvgVarianceParams(6, 5,
- &aom_jnt_sub_pixel_avg_variance64x32_ssse3,
- 0),
- JntSubpelAvgVarianceParams(5, 6,
- &aom_jnt_sub_pixel_avg_variance32x64_ssse3,
- 0),
- JntSubpelAvgVarianceParams(5, 5,
- &aom_jnt_sub_pixel_avg_variance32x32_ssse3,
- 0),
- JntSubpelAvgVarianceParams(5, 4,
- &aom_jnt_sub_pixel_avg_variance32x16_ssse3,
- 0),
- JntSubpelAvgVarianceParams(4, 5,
- &aom_jnt_sub_pixel_avg_variance16x32_ssse3,
- 0),
- JntSubpelAvgVarianceParams(4, 4,
- &aom_jnt_sub_pixel_avg_variance16x16_ssse3,
- 0),
- JntSubpelAvgVarianceParams(4, 3,
- &aom_jnt_sub_pixel_avg_variance16x8_ssse3,
- 0),
- JntSubpelAvgVarianceParams(3, 4,
- &aom_jnt_sub_pixel_avg_variance8x16_ssse3,
- 0),
- JntSubpelAvgVarianceParams(3, 3,
- &aom_jnt_sub_pixel_avg_variance8x8_ssse3, 0),
- JntSubpelAvgVarianceParams(3, 2,
- &aom_jnt_sub_pixel_avg_variance8x4_ssse3, 0),
- JntSubpelAvgVarianceParams(2, 3,
- &aom_jnt_sub_pixel_avg_variance4x8_ssse3, 0),
- JntSubpelAvgVarianceParams(2, 2,
- &aom_jnt_sub_pixel_avg_variance4x4_ssse3,
- 0)));
+ DistWtdSubpelAvgVarianceParams(
+ 6, 6, &aom_dist_wtd_sub_pixel_avg_variance64x64_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 6, 5, &aom_dist_wtd_sub_pixel_avg_variance64x32_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 6, &aom_dist_wtd_sub_pixel_avg_variance32x64_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 5, &aom_dist_wtd_sub_pixel_avg_variance32x32_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 5, 4, &aom_dist_wtd_sub_pixel_avg_variance32x16_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 5, &aom_dist_wtd_sub_pixel_avg_variance16x32_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 4, &aom_dist_wtd_sub_pixel_avg_variance16x16_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 4, 3, &aom_dist_wtd_sub_pixel_avg_variance16x8_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 4, &aom_dist_wtd_sub_pixel_avg_variance8x16_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 3, &aom_dist_wtd_sub_pixel_avg_variance8x8_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 3, 2, &aom_dist_wtd_sub_pixel_avg_variance8x4_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 3, &aom_dist_wtd_sub_pixel_avg_variance4x8_ssse3, 0),
+ DistWtdSubpelAvgVarianceParams(
+ 2, 2, &aom_dist_wtd_sub_pixel_avg_variance4x4_ssse3, 0)));
#endif // HAVE_SSSE3
#if HAVE_SSE4_1
diff --git a/libaom/test/warp_filter_test_util.cc b/libaom/test/warp_filter_test_util.cc
index 69b2ed4..9208af8 100644
--- a/libaom/test/warp_filter_test_util.cc
+++ b/libaom/test/warp_filter_test_util.cc
@@ -149,7 +149,7 @@ void AV1WarpFilterTest::RunSpeedTest(warp_affine_func test_impl) {
int do_average = 0;
conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
- conv_params.use_jnt_comp_avg = 0;
+ conv_params.use_dist_wtd_comp_avg = 0;
const int num_loops = 1000000000 / (out_w + out_h);
aom_usec_timer timer;
@@ -222,9 +222,9 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
conv_params = get_conv_params(0, 0, bd);
}
if (jj >= 4) {
- conv_params.use_jnt_comp_avg = 0;
+ conv_params.use_dist_wtd_comp_avg = 0;
} else {
- conv_params.use_jnt_comp_avg = 1;
+ conv_params.use_dist_wtd_comp_avg = 1;
conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
}
@@ -236,9 +236,9 @@ void AV1WarpFilterTest::RunCheckOutput(warp_affine_func test_impl) {
get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd);
}
if (jj >= 4) {
- conv_params.use_jnt_comp_avg = 0;
+ conv_params.use_dist_wtd_comp_avg = 0;
} else {
- conv_params.use_jnt_comp_avg = 1;
+ conv_params.use_dist_wtd_comp_avg = 1;
conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
}
@@ -342,7 +342,7 @@ void AV1HighbdWarpFilterTest::RunSpeedTest(highbd_warp_affine_func test_impl) {
sub_x = 0;
sub_y = 0;
int do_average = 0;
-      conv_params.use_jnt_comp_avg = 0;
conv_params = get_conv_params_no_round(do_average, 0, dsta, out_w, 1, bd);
+      conv_params.use_dist_wtd_comp_avg = 0;
const int num_loops = 1000000000 / (out_w + out_h);
@@ -419,9 +419,9 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
conv_params = get_conv_params(0, 0, bd);
}
if (jj >= 4) {
- conv_params.use_jnt_comp_avg = 0;
+ conv_params.use_dist_wtd_comp_avg = 0;
} else {
- conv_params.use_jnt_comp_avg = 1;
+ conv_params.use_dist_wtd_comp_avg = 1;
conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
}
@@ -436,9 +436,9 @@ void AV1HighbdWarpFilterTest::RunCheckOutput(
get_conv_params_no_round(do_average, 0, dstb, out_w, 1, bd);
}
if (jj >= 4) {
- conv_params.use_jnt_comp_avg = 0;
+ conv_params.use_dist_wtd_comp_avg = 0;
} else {
- conv_params.use_jnt_comp_avg = 1;
+ conv_params.use_dist_wtd_comp_avg = 1;
conv_params.fwd_offset = quant_dist_lookup_table[ii][jj][0];
conv_params.bck_offset = quant_dist_lookup_table[ii][jj][1];
}
diff --git a/libaom/test/yuv_temporal_filter_test.cc b/libaom/test/yuv_temporal_filter_test.cc
new file mode 100644
index 0000000..fcaf0df
--- /dev/null
+++ b/libaom/test/yuv_temporal_filter_test.cc
@@ -0,0 +1,726 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "config/av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/register_state_check.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+
+namespace {
+
+using ::libaom_test::ACMRandom;
+
+const int MAX_WIDTH = 32;
+const int MAX_HEIGHT = 32;
+
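+// Signature of the function under test: it filters the Y/U/V source planes
+// against their temporal predictions and accumulates the weighted output into
+// the per-plane accumulator and count buffers.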
+typedef void (*YUVTemporalFilterFunc)(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
+ uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
+ uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
+struct TemporalFilterWithBd {
+ TemporalFilterWithBd(YUVTemporalFilterFunc func, int bitdepth)
+ : temporal_filter(func), bd(bitdepth) {}
+
+ YUVTemporalFilterFunc temporal_filter;
+ int bd;
+};
+
+std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) {
+ return os << "Bitdepth: " << tf.bd;
+}
+
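+// Returns the filter weight for the pixel at (row, col): a single weight for
+// the whole block when use_32x32 is set, otherwise one of the four
+// per-quadrant weights in blk_fw.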
+int GetFilterWeight(unsigned int row, unsigned int col,
+ unsigned int block_height, unsigned int block_width,
+ const int *const blk_fw, int use_32x32) {
+ if (use_32x32) {
+ return blk_fw[0];
+ }
+
+ return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
+}
+
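+// Computes the filter modifier from the accumulated pixel distortion: scale
+// the distortion by 3 / index, apply rounding and the strength shift, clamp
+// to [0, 16], invert so larger distortion yields a smaller weight, and scale
+// by filter_weight.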
+template <typename PixelType>
+int GetModIndex(int sum_dist, int index, int rounding, int strength,
+ int filter_weight) {
+ int mod = sum_dist * 3 / index;
+ mod += rounding;
+ mod >>= strength;
+
+ mod = AOMMIN(16, mod);
+
+ mod = 16 - mod;
+ mod *= filter_weight;
+
+ return mod;
+}
+
+// Lowbitdepth version
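+// Replaces the division above with a fixed-point multiply: index_mult[i] is
+// approximately 3 * 2^16 / i, so (sum_dist * index_mult[index]) >> 16 matches
+// sum_dist * 3 / index. For example, index_mult[6] = 32768 = 3 * 65536 / 6.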
+template <>
+int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
+ int filter_weight) {
+  unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
+                                  39322, 32768, 28087, 24576, 21846,
+                                  19661, 17874, 0,     15124 };
+
+ assert(index >= 0 && index <= 13);
+ assert(index_mult[index] != 0);
+
+ int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+ mod += rounding;
+ mod >>= strength;
+
+ mod = AOMMIN(16, mod);
+
+ mod = 16 - mod;
+ mod *= filter_weight;
+
+ return mod;
+}
+
+// Highbitdepth version
+template <>
+int GetModIndex<uint16_t>(int sum_dist, int index, int rounding, int strength,
+ int filter_weight) {
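+ // Q32 fixed-point approximation of 3 / index, giving enough headroom for
+ // the larger highbitdepth squared-error sums; e.g. index == 6 maps to
+ // 2147483648 == 2^31, i.e. 0.5 in Q32.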
+ int64_t index_mult[14] = { 0U, 0U, 0U, 0U,
+ 3221225472U, 2576980378U, 2147483648U, 1840700270U,
+ 1610612736U, 1431655766U, 1288490189U, 1171354718U,
+ 0U, 991146300U };
+
+ assert(index >= 0 && index <= 13);
+ assert(index_mult[index] != 0);
+
+ int mod = static_cast<int>((sum_dist * index_mult[index]) >> 32);
+ mod += rounding;
+ mod >>= strength;
+
+ mod = AOMMIN(16, mod);
+
+ mod = 16 - mod;
+ mod *= filter_weight;
+
+ return mod;
+}
+
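+// Helpers to fill a |stride|-strided width x height region with either a
+// constant value or random values between low_val and high_val.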
+template <typename PixelType>
+void SetArray(PixelType *pixel_array, int width, int height, int stride,
+ int val) {
+ for (int row = 0; row < height; row++) {
+ for (int col = 0; col < width; col++) {
+ pixel_array[col] = val;
+ }
+ pixel_array += stride;
+ }
+}
+
+template <typename PixelType>
+void SetArray(PixelType *pixel_array, int width, int height, int stride,
+ ACMRandom *rnd, int low_val, int high_val) {
+ EXPECT_LE(low_val, high_val);
+
+ for (int row = 0; row < height; row++) {
+ for (int col = 0; col < width; col++) {
+ const int val =
+ static_cast<int>((*rnd).PseudoUniform(high_val - low_val));
+ pixel_array[col] = low_val + val;
+ }
+ pixel_array += stride;
+ }
+}
+
+template <typename ValueType>
+bool CheckArrayEqual(const ValueType *arr_1, const ValueType *arr_2, int width,
+ int height, int stride_1, int stride_2) {
+ for (int row = 0; row < height; row++) {
+ for (int col = 0; col < width; col++) {
+ if (arr_1[col] != arr_2[col]) {
+ return false;
+ }
+ }
+ arr_1 += stride_1;
+ arr_2 += stride_2;
+ }
+ return true;
+}
+
+template <typename ValueType>
+void PrintArrayDiff(const ValueType *arr_1, const ValueType *arr_2, int width,
+ int height, int stride_1, int stride_2) {
+ const ValueType *arr_1_start = arr_1, *arr_2_start = arr_2;
+
+ printf("Array 1:\n");
+ for (int row = 0; row < height; ++row) {
+ for (int col = 0; col < width; ++col) {
+ if (arr_1[col] != arr_2[col]) {
+ printf("*%3d", arr_1[col]);
+ } else {
+ printf("%4d", arr_1[col]);
+ }
+ }
+ printf("\n");
+ arr_1 += stride_1;
+ arr_2 += stride_2;
+ }
+
+ arr_1 = arr_1_start;
+ arr_2 = arr_2_start;
+
+ printf("Array 2:\n");
+ for (int row = 0; row < height; ++row) {
+ for (int col = 0; col < width; ++col) {
+ if (arr_1[col] != arr_2[col]) {
+ printf("*%3d", arr_2[col]);
+ } else {
+ printf("%4d", arr_2[col]);
+ }
+ }
+ printf("\n");
+ arr_1 += stride_1;
+ arr_2 += stride_2;
+ }
+
+ arr_1 = arr_1_start;
+ arr_2 = arr_2_start;
+ printf("Difference:\n");
+ for (int row = 0; row < height; ++row) {
+ for (int col = 0; col < width; ++col) {
+ printf("%4d", arr_1[col] - arr_2[col]);
+ }
+ printf("\n");
+ arr_1 += stride_1;
+ arr_2 += stride_2;
+ }
+}
+
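+// Straightforward C model of the YUV temporal filter; the functions under
+// test are compared against its output.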
+template <typename PixelType>
+void ApplyReferenceFilter(const PixelType *y_src, const PixelType *y_pre,
+ const PixelType *u_src, const PixelType *v_src,
+ const PixelType *u_pre, const PixelType *v_pre,
+ unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength,
+ const int *const blk_fw, int use_32x32,
+ uint32_t *y_accum, uint16_t *y_count,
+ uint32_t *u_accum, uint16_t *u_count,
+ uint32_t *v_accum, uint16_t *v_count) {
+ const int uv_block_width = block_width >> ss_x,
+ uv_block_height = block_height >> ss_y;
+ const int y_src_stride = block_width, y_pre_stride = block_width;
+ const int uv_src_stride = uv_block_width, uv_pre_stride = uv_block_width;
+ const int y_diff_stride = block_width, uv_diff_stride = uv_block_width;
+ const int y_count_stride = block_width, u_count_stride = uv_block_width,
+ v_count_stride = uv_block_width;
+ const int y_accum_stride = block_width, u_accum_stride = uv_block_width,
+ v_accum_stride = uv_block_width;
+
+ int y_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+ int u_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+ int v_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
+
+ const int rounding = (1 << strength) >> 1;
+
+ // Get the square diffs
+ for (int row = 0; row < (int)block_height; row++) {
+ for (int col = 0; col < (int)block_width; col++) {
+ const int diff =
+ y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col];
+ y_dif[row * y_diff_stride + col] = diff * diff;
+ }
+ }
+
+ for (int row = 0; row < (int)uv_block_height; row++) {
+ for (int col = 0; col < (int)uv_block_width; col++) {
+ const int u_diff =
+ u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col];
+ const int v_diff =
+ v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col];
+ u_dif[row * uv_diff_stride + col] = u_diff * u_diff;
+ v_dif[row * uv_diff_stride + col] = v_diff * v_diff;
+ }
+ }
+
+ // Apply the filter to luma
+ for (int row = 0; row < (int)block_height; row++) {
+ for (int col = 0; col < (int)block_width; col++) {
+ const int uv_row = row >> ss_y;
+ const int uv_col = col >> ss_x;
+ const int filter_weight = GetFilterWeight(row, col, block_height,
+ block_width, blk_fw, use_32x32);
+
+ // First we get the modifier for the current y pixel
+ const int y_pixel = y_pre[row * y_pre_stride + col];
+ int y_num_used = 0;
+ int y_mod = 0;
+
+ // Sum the neighboring 3x3 y pixels
+ for (int row_step = -1; row_step <= 1; row_step++) {
+ for (int col_step = -1; col_step <= 1; col_step++) {
+ const int sub_row = row + row_step;
+ const int sub_col = col + col_step;
+
+ if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 &&
+ sub_col < (int)block_width) {
+ y_mod += y_dif[sub_row * y_diff_stride + sub_col];
+ y_num_used++;
+ }
+ }
+ }
+
+ // Sum the corresponding uv pixels to the current y modifier
+ // Note we are rounding down instead of rounding to the nearest pixel.
+ y_mod += u_dif[uv_row * uv_diff_stride + uv_col];
+ y_mod += v_dif[uv_row * uv_diff_stride + uv_col];
+
+ y_num_used += 2;
+
+ // Set the modifier
+ y_mod = GetModIndex<PixelType>(y_mod, y_num_used, rounding, strength,
+ filter_weight);
+
+ // Accumulate the result
+ y_count[row * y_count_stride + col] += y_mod;
+ y_accum[row * y_accum_stride + col] += y_mod * y_pixel;
+ }
+ }
+
+ // Apply the filter to chroma
+ for (int uv_row = 0; uv_row < (int)uv_block_height; uv_row++) {
+ for (int uv_col = 0; uv_col < (int)uv_block_width; uv_col++) {
+ const int y_row = uv_row << ss_y;
+ const int y_col = uv_col << ss_x;
+ const int filter_weight = GetFilterWeight(
+ uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
+
+ const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col];
+ const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col];
+
+ int uv_num_used = 0;
+ int u_mod = 0, v_mod = 0;
+
+ // Sum the neighboring 3x3 chroma pixels to the chroma modifier
+ for (int row_step = -1; row_step <= 1; row_step++) {
+ for (int col_step = -1; col_step <= 1; col_step++) {
+ const int sub_row = uv_row + row_step;
+ const int sub_col = uv_col + col_step;
+
+ if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
+ sub_col < uv_block_width) {
+ u_mod += u_dif[sub_row * uv_diff_stride + sub_col];
+ v_mod += v_dif[sub_row * uv_diff_stride + sub_col];
+ uv_num_used++;
+ }
+ }
+ }
+
+ // Sum all the luma pixels associated with the current luma pixel
+ for (int row_step = 0; row_step < 1 + ss_y; row_step++) {
+ for (int col_step = 0; col_step < 1 + ss_x; col_step++) {
+ const int sub_row = y_row + row_step;
+ const int sub_col = y_col + col_step;
+ const int y_diff = y_dif[sub_row * y_diff_stride + sub_col];
+
+ u_mod += y_diff;
+ v_mod += y_diff;
+ uv_num_used++;
+ }
+ }
+
+ // Set the modifier
+ u_mod = GetModIndex<PixelType>(u_mod, uv_num_used, rounding, strength,
+ filter_weight);
+ v_mod = GetModIndex<PixelType>(v_mod, uv_num_used, rounding, strength,
+ filter_weight);
+
+ // Accumulate the result
+ u_count[uv_row * u_count_stride + uv_col] += u_mod;
+ u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel;
+ v_count[uv_row * v_count_stride + uv_col] += v_mod;
+ v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel;
+ }
+ }
+}
+
+class YUVTemporalFilterTest
+ : public ::testing::TestWithParam<TemporalFilterWithBd> {
+ public:
+ virtual void SetUp() {
+ filter_func_ = GetParam().temporal_filter;
+ bd_ = GetParam().bd;
+ use_highbd_ = (bd_ != 8);
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ saturate_test_ = 0;
+ num_repeats_ = 10;
+
+ ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12);
+ }
+
+ protected:
+ template <typename PixelType>
+ void CompareTestWithParam(int width, int height, int ss_x, int ss_y,
+ int filter_strength, int use_32x32,
+ const int *filter_weight);
+ template <typename PixelType>
+ void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y,
+ int filter_strength, int use_32x32,
+ const int *filter_weight);
+ template <typename PixelType>
+ void ApplyTestFilter(const PixelType *y_src, int y_src_stride,
+ const PixelType *y_pre, int y_pre_stride,
+ const PixelType *u_src, const PixelType *v_src,
+ int uv_src_stride, const PixelType *u_pre,
+ const PixelType *v_pre, int uv_pre_stride,
+ unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw,
+ int use_32x32, uint32_t *y_accum, uint16_t *y_count,
+ uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum,
+ uint16_t *v_count);
+
+ YUVTemporalFilterFunc filter_func_;
+ ACMRandom rnd_;
+ int saturate_test_;
+ int num_repeats_;
+ int use_highbd_;
+ int bd_;
+};
+
+template <>
+void YUVTemporalFilterTest::ApplyTestFilter<uint8_t>(
+ const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+ int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+ int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
+ uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
+ uint32_t *v_accum, uint16_t *v_count) {
+ ASM_REGISTER_STATE_CHECK(
+ filter_func_(y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src,
+ uv_src_stride, u_pre, v_pre, uv_pre_stride, block_width,
+ block_height, ss_x, ss_y, strength, blk_fw, use_32x32,
+ y_accum, y_count, u_accum, u_count, v_accum, v_count));
+}
+
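+// The highbitdepth filter takes uint8_t pointers, so the uint16_t buffers
+// are wrapped with CONVERT_TO_BYTEPTR before the call.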
+template <>
+void YUVTemporalFilterTest::ApplyTestFilter<uint16_t>(
+ const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
+ int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
+ int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
+ int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+ int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
+ uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
+ uint32_t *v_accum, uint16_t *v_count) {
+ ASM_REGISTER_STATE_CHECK(filter_func_(
+ CONVERT_TO_BYTEPTR(y_src), y_src_stride, CONVERT_TO_BYTEPTR(y_pre),
+ y_pre_stride, CONVERT_TO_BYTEPTR(u_src), CONVERT_TO_BYTEPTR(v_src),
+ uv_src_stride, CONVERT_TO_BYTEPTR(u_pre), CONVERT_TO_BYTEPTR(v_pre),
+ uv_pre_stride, block_width, block_height, ss_x, ss_y, strength, blk_fw,
+ use_32x32, y_accum, y_count, u_accum, u_count, v_accum, v_count));
+}
+
+template <typename PixelType>
+void YUVTemporalFilterTest::CompareTestWithParam(int width, int height,
+ int ss_x, int ss_y,
+ int filter_strength,
+ int use_32x32,
+ const int *filter_weight) {
+ const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+ const int y_stride = width, uv_stride = uv_width;
+
+ DECLARE_ALIGNED(16, PixelType, y_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, PixelType, y_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, y_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, y_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, y_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, y_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+
+ DECLARE_ALIGNED(16, PixelType, u_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, PixelType, u_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, u_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, u_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, u_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, u_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+
+ DECLARE_ALIGNED(16, PixelType, v_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, PixelType, v_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, v_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, v_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, v_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, v_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+
+ for (int repeats = 0; repeats < num_repeats_; repeats++) {
+ if (saturate_test_) {
+ const int max_val = (1 << bd_) - 1;
+ SetArray(y_src, width, height, y_stride, max_val);
+ SetArray(y_pre, width, height, y_stride, 0);
+ SetArray(u_src, uv_width, uv_height, uv_stride, max_val);
+ SetArray(u_pre, uv_width, uv_height, uv_stride, 0);
+ SetArray(v_src, uv_width, uv_height, uv_stride, max_val);
+ SetArray(v_pre, uv_width, uv_height, uv_stride, 0);
+ } else {
+ const int max_val = 7 << (bd_ - 8);
+ SetArray(y_src, width, height, y_stride, &rnd_, 0, max_val);
+ SetArray(y_pre, width, height, y_stride, &rnd_, 0, max_val);
+ SetArray(u_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
+ SetArray(u_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
+ SetArray(v_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
+ SetArray(v_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
+ }
+
+ ApplyReferenceFilter<PixelType>(
+ y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y,
+ filter_strength, filter_weight, use_32x32, y_accum_ref, y_count_ref,
+ u_accum_ref, u_count_ref, v_accum_ref, v_count_ref);
+
+ ApplyTestFilter(y_src, y_stride, y_pre, y_stride, u_src, v_src, uv_stride,
+ u_pre, v_pre, uv_stride, width, height, ss_x, ss_y,
+ filter_strength, filter_weight, use_32x32, y_accum_tst,
+ y_count_tst, u_accum_tst, u_count_tst, v_accum_tst,
+ v_count_tst);
+
+ EXPECT_TRUE(CheckArrayEqual(y_accum_tst, y_accum_ref, width, height,
+ y_stride, y_stride));
+ EXPECT_TRUE(CheckArrayEqual(y_count_tst, y_count_ref, width, height,
+ y_stride, y_stride));
+ EXPECT_TRUE(CheckArrayEqual(u_accum_tst, u_accum_ref, uv_width, uv_height,
+ uv_stride, uv_stride));
+ EXPECT_TRUE(CheckArrayEqual(u_count_tst, u_count_ref, uv_width, uv_height,
+ uv_stride, uv_stride));
+ EXPECT_TRUE(CheckArrayEqual(v_accum_tst, v_accum_ref, uv_width, uv_height,
+ uv_stride, uv_stride));
+ EXPECT_TRUE(CheckArrayEqual(v_count_tst, v_count_ref, uv_width, uv_height,
+ uv_stride, uv_stride));
+
+ if (HasFailure()) {
+ if (use_32x32) {
+ printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y,
+ filter_strength, *filter_weight);
+ } else {
+ printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x,
+ ss_y, filter_strength, filter_weight[0], filter_weight[1],
+ filter_weight[2], filter_weight[3]);
+ }
+
+ PrintArrayDiff(y_accum_ref, y_accum_tst, width, height, y_stride,
+ y_stride);
+ PrintArrayDiff(y_count_ref, y_count_tst, width, height, y_stride,
+ y_stride);
+ PrintArrayDiff(u_accum_ref, u_accum_tst, uv_width, uv_height, uv_stride,
+ uv_stride);
+ PrintArrayDiff(u_count_ref, u_count_tst, uv_width, uv_height, uv_stride,
+ uv_stride);
+ PrintArrayDiff(v_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride,
+ uv_stride);
+ PrintArrayDiff(v_count_ref, v_count_tst, uv_width, uv_height, uv_stride,
+ uv_stride);
+
+ return;
+ }
+ }
+}
+
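+// Applies only the test filter (no reference comparison); used by the
+// DISABLED_Speed test below to time repeated invocations.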
+template <typename PixelType>
+void YUVTemporalFilterTest::RunTestFilterWithParam(int width, int height,
+ int ss_x, int ss_y,
+ int filter_strength,
+ int use_32x32,
+ const int *filter_weight) {
+ DECLARE_ALIGNED(16, PixelType, y_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, PixelType, y_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, y_count[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, y_accum[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+
+ DECLARE_ALIGNED(16, PixelType, u_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, PixelType, u_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, u_count[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, u_accum[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+
+ DECLARE_ALIGNED(16, PixelType, v_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, PixelType, v_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint16_t, v_count[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+ DECLARE_ALIGNED(16, uint32_t, v_accum[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
+
+ SetArray(y_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8));
+ SetArray(y_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8));
+ SetArray(u_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8));
+ SetArray(u_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8));
+ SetArray(v_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8));
+ SetArray(v_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ - 8));
+
+ for (int repeats = 0; repeats < num_repeats_; repeats++) {
+ ApplyTestFilter(y_src, MAX_WIDTH, y_pre, MAX_WIDTH, u_src, v_src, MAX_WIDTH,
+ u_pre, v_pre, MAX_WIDTH, width, height, ss_x, ss_y,
+ filter_strength, filter_weight, use_32x32, y_accum, y_count,
+ u_accum, u_count, v_accum, v_count);
+ }
+}
+
+TEST_P(YUVTemporalFilterTest, Use32x32) {
+ const int width = 32, height = 32;
+ const int use_32x32 = 1;
+
+ for (int ss_x = 0; ss_x <= 1; ss_x++) {
+ for (int ss_y = 0; ss_y <= 1; ss_y++) {
+ for (int filter_strength = 0; filter_strength <= 6;
+ filter_strength += 2) {
+ for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
+ if (use_highbd_) {
+ const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+ CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+ adjusted_strength, use_32x32,
+ &filter_weight);
+ } else {
+ CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+ filter_strength, use_32x32,
+ &filter_weight);
+ }
+ ASSERT_FALSE(HasFailure());
+ }
+ }
+ }
+ }
+}
+
+TEST_P(YUVTemporalFilterTest, Use16x16) {
+ const int width = 32, height = 32;
+ const int use_32x32 = 0;
+
+ for (int ss_x = 0; ss_x <= 1; ss_x++) {
+ for (int ss_y = 0; ss_y <= 1; ss_y++) {
+ for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) {
+ // Set up the filter
+ int filter_weight[4];
+ int filter_idx_cp = filter_idx;
+ for (int idx = 0; idx < 4; idx++) {
+ filter_weight[idx] = filter_idx_cp % 3;
+ filter_idx_cp /= 3;
+ }
+
+ // Test each parameter
+ for (int filter_strength = 0; filter_strength <= 6;
+ filter_strength += 2) {
+ if (use_highbd_) {
+ const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+ CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+ adjusted_strength, use_32x32,
+ filter_weight);
+ } else {
+ CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+ filter_strength, use_32x32,
+ filter_weight);
+ }
+
+ ASSERT_FALSE(HasFailure());
+ }
+ }
+ }
+ }
+}
+
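+// Feeds maximal pixel differences through the filter to exercise the upper
+// range of the accumulators.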
+TEST_P(YUVTemporalFilterTest, SaturationTest) {
+ const int width = 32, height = 32;
+ const int use_32x32 = 1;
+ const int filter_weight = 1;
+ saturate_test_ = 1;
+
+ for (int ss_x = 0; ss_x <= 1; ss_x++) {
+ for (int ss_y = 0; ss_y <= 1; ss_y++) {
+ for (int filter_strength = 0; filter_strength <= 6;
+ filter_strength += 2) {
+ if (use_highbd_) {
+ const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+ CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+ adjusted_strength, use_32x32,
+ &filter_weight);
+ } else {
+ CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+ filter_strength, use_32x32,
+ &filter_weight);
+ }
+
+ ASSERT_FALSE(HasFailure());
+ }
+ }
+ }
+}
+
+TEST_P(YUVTemporalFilterTest, DISABLED_Speed) {
+ const int width = 32, height = 32;
+ num_repeats_ = 1000;
+
+ for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) {
+ const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3;
+ for (int ss_x = 0; ss_x <= 1; ss_x++) {
+ for (int ss_y = 0; ss_y <= 1; ss_y++) {
+ for (int filter_idx = 0; filter_idx < num_filter_weights;
+ filter_idx++) {
+ // Set up the filter
+ int filter_weight[4];
+ int filter_idx_cp = filter_idx;
+ for (int idx = 0; idx < 4; idx++) {
+ filter_weight[idx] = filter_idx_cp % 3;
+ filter_idx_cp /= 3;
+ }
+
+ // Test each parameter
+ for (int filter_strength = 0; filter_strength <= 6;
+ filter_strength += 2) {
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+
+ if (use_highbd_) {
+ RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y,
+ filter_strength, use_32x32,
+ filter_weight);
+ } else {
+ RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y,
+ filter_strength, use_32x32,
+ filter_weight);
+ }
+
+ aom_usec_timer_mark(&timer);
+ const int elapsed_time =
+ static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+ printf(
+ "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: "
+ "%d, Strength: %d, Time: %5d\n",
+ bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength,
+ elapsed_time);
+ }
+ }
+ }
+ }
+ }
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, YUVTemporalFilterTest,
+ ::testing::Values(
+ TemporalFilterWithBd(&av1_apply_temporal_filter_c, 8),
+ TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_c, 10),
+ TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_c, 12)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, YUVTemporalFilterTest,
+ ::testing::Values(
+ TemporalFilterWithBd(&av1_apply_temporal_filter_sse4_1, 8),
+ TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_sse4_1, 10),
+ TemporalFilterWithBd(&av1_highbd_apply_temporal_filter_sse4_1, 12)));
+#endif // HAVE_SSE4_1
+
+} // namespace
diff --git a/libaom/third_party/libwebm/AUTHORS.TXT b/libaom/third_party/libwebm/AUTHORS.TXT
index 8ab6f79..9686ac1 100644
--- a/libaom/third_party/libwebm/AUTHORS.TXT
+++ b/libaom/third_party/libwebm/AUTHORS.TXT
@@ -1,4 +1,4 @@
-# Names should be added to this file like so:
-# Name or Organization <email address>
-
-Google Inc.
+# Names should be added to this file like so:
+# Name or Organization <email address>
+
+Google Inc.
diff --git a/libaom/third_party/libwebm/README.libaom b/libaom/third_party/libwebm/README.libaom
index bd288d2..17b2f47 100644
--- a/libaom/third_party/libwebm/README.libaom
+++ b/libaom/third_party/libwebm/README.libaom
@@ -1,5 +1,5 @@
URL: https://chromium.googlesource.com/webm/libwebm
-Version: af81f26025b7435fa9a14ad07c58b44cf9280430
+Version: 9f23fbc50e7a76c815b1d3f0309abe1066301331
License: BSD
License File: LICENSE.txt
@@ -7,8 +7,6 @@ Description:
libwebm is used to handle WebM container I/O.
Local Changes:
-Add av1 codec as an eligible codec for webm:
- https://aomedia-review.googlesource.com/c/aom/+/15103
Only keep:
- Android.mk
- AUTHORS.TXT
diff --git a/libaom/third_party/libwebm/common/file_util.cc b/libaom/third_party/libwebm/common/file_util.cc
index 618ffc0..e6109d5 100644
--- a/libaom/third_party/libwebm/common/file_util.cc
+++ b/libaom/third_party/libwebm/common/file_util.cc
@@ -46,7 +46,7 @@ std::string GetTempFileName() {
errno_t err = tmpnam_s(tmp_file_name);
#else
char* fname_pointer = tmpnam(tmp_file_name);
- errno_t err = (fname_pointer == &tmp_file_name[0]) ? 0 : -1;
+ int err = (fname_pointer == &tmp_file_name[0]) ? 0 : -1;
#endif
if (err == 0) {
return std::string(tmp_file_name);
diff --git a/libaom/third_party/libwebm/common/webmids.h b/libaom/third_party/libwebm/common/webmids.h
index 89d722a..fc0c208 100644
--- a/libaom/third_party/libwebm/common/webmids.h
+++ b/libaom/third_party/libwebm/common/webmids.h
@@ -93,6 +93,7 @@ enum MkvId {
kMkvDisplayHeight = 0x54BA,
kMkvDisplayUnit = 0x54B2,
kMkvAspectRatioType = 0x54B3,
+ kMkvColourSpace = 0x2EB524,
kMkvFrameRate = 0x2383E3,
// end video
// colour
diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
index bae2c99..5120312 100644
--- a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
+++ b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -773,6 +773,14 @@ bool Track::Write(IMkvWriter* writer) const {
if (!type_ || !codec_id_)
return false;
+ // AV1 tracks require a CodecPrivate. See
+ // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md
+ // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to
+ // point to a stable version once it is finalized, or our own WebM mappings
+ // page on webmproject.org should we decide to release them.
+ if (!strcmp(codec_id_, Tracks::kAv1CodecId) && !codec_private_)
+ return false;
+
// |size| may be bigger than what is written out in this function because
// derived classes may write out more data in the Track element.
const uint64_t payload_size = PayloadSize();
@@ -1027,19 +1035,16 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const {
!WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) {
return false;
}
- if (r_ &&
- !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX,
- libwebm::kMkvPrimaryRChromaticityY)) {
+ if (r_ && !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX,
+ libwebm::kMkvPrimaryRChromaticityY)) {
return false;
}
- if (g_ &&
- !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX,
- libwebm::kMkvPrimaryGChromaticityY)) {
+ if (g_ && !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX,
+ libwebm::kMkvPrimaryGChromaticityY)) {
return false;
}
- if (b_ &&
- !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX,
- libwebm::kMkvPrimaryBChromaticityY)) {
+ if (b_ && !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX,
+ libwebm::kMkvPrimaryBChromaticityY)) {
return false;
}
if (white_point_ &&
@@ -1421,6 +1426,7 @@ VideoTrack::VideoTrack(unsigned int* seed)
stereo_mode_(0),
alpha_mode_(0),
width_(0),
+ colour_space_(NULL),
colour_(NULL),
projection_(NULL) {}
@@ -1518,6 +1524,10 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
static_cast<uint64>(alpha_mode_)))
return false;
}
+ if (colour_space_) {
+ if (!WriteEbmlElement(writer, libwebm::kMkvColourSpace, colour_space_))
+ return false;
+ }
if (frame_rate_ > 0.0) {
if (!WriteEbmlElement(writer, libwebm::kMkvFrameRate,
static_cast<float>(frame_rate_))) {
@@ -1542,6 +1552,22 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
return true;
}
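+// Replaces any previously set value with a heap-allocated copy of
+// |colour_space|.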
+void VideoTrack::set_colour_space(const char* colour_space) {
+ if (colour_space) {
+ delete[] colour_space_;
+
+ const size_t length = strlen(colour_space) + 1;
+ colour_space_ = new (std::nothrow) char[length]; // NOLINT
+ if (colour_space_) {
+#ifdef _MSC_VER
+ strcpy_s(colour_space_, length, colour_space);
+#else
+ strcpy(colour_space_, colour_space);
+#endif
+ }
+ }
+}
+
bool VideoTrack::SetColour(const Colour& colour) {
std::unique_ptr<Colour> colour_ptr(new Colour());
if (!colour_ptr.get())
@@ -1625,6 +1651,8 @@ uint64_t VideoTrack::VideoPayloadSize() const {
if (frame_rate_ > 0.0)
size += EbmlElementSize(libwebm::kMkvFrameRate,
static_cast<float>(frame_rate_));
+ if (colour_space_)
+ size += EbmlElementSize(libwebm::kMkvColourSpace, colour_space_);
if (colour_)
size += colour_->ColourSize();
if (projection_)
@@ -1702,10 +1730,9 @@ bool AudioTrack::Write(IMkvWriter* writer) const {
const char Tracks::kOpusCodecId[] = "A_OPUS";
const char Tracks::kVorbisCodecId[] = "A_VORBIS";
+const char Tracks::kAv1CodecId[] = "V_AV1";
const char Tracks::kVp8CodecId[] = "V_VP8";
const char Tracks::kVp9CodecId[] = "V_VP9";
-const char Tracks::kVp10CodecId[] = "V_VP10";
-const char Tracks::kAV1CodecId[] = "V_AV1";
const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS";
const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS";
const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA";
@@ -4161,15 +4188,15 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) {
}
bool Segment::DocTypeIsWebm() const {
- const int kNumCodecIds = 10;
+ const int kNumCodecIds = 9;
// TODO(vigneshv): Tweak .clang-format.
const char* kWebmCodecIds[kNumCodecIds] = {
Tracks::kOpusCodecId, Tracks::kVorbisCodecId,
- Tracks::kVp8CodecId, Tracks::kVp9CodecId,
- Tracks::kVp10CodecId, Tracks::kAV1CodecId,
- Tracks::kWebVttCaptionsId, Tracks::kWebVttDescriptionsId,
- Tracks::kWebVttMetadataId, Tracks::kWebVttSubtitlesId};
+ Tracks::kAv1CodecId, Tracks::kVp8CodecId,
+ Tracks::kVp9CodecId, Tracks::kWebVttCaptionsId,
+ Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId,
+ Tracks::kWebVttSubtitlesId};
const int num_tracks = static_cast<int>(tracks_.track_entries_size());
for (int track_index = 0; track_index < num_tracks; ++track_index) {
diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h
index 9e817bc..f2db377 100644
--- a/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h
+++ b/libaom/third_party/libwebm/mkvmuxer/mkvmuxer.h
@@ -795,6 +795,8 @@ class VideoTrack : public Track {
uint64_t alpha_mode() { return alpha_mode_; }
void set_width(uint64_t width) { width_ = width; }
uint64_t width() const { return width_; }
+ void set_colour_space(const char* colour_space);
+ const char* colour_space() const { return colour_space_; }
Colour* colour() { return colour_; }
@@ -824,6 +826,7 @@ class VideoTrack : public Track {
uint64_t stereo_mode_;
uint64_t alpha_mode_;
uint64_t width_;
+ char* colour_space_;
Colour* colour_;
Projection* projection_;
@@ -871,10 +874,9 @@ class Tracks {
static const char kOpusCodecId[];
static const char kVorbisCodecId[];
+ static const char kAv1CodecId[];
static const char kVp8CodecId[];
static const char kVp9CodecId[];
- static const char kVp10CodecId[];
- static const char kAV1CodecId[];
static const char kWebVttCaptionsId[];
static const char kWebVttDescriptionsId[];
static const char kWebVttMetadataId[];
diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
index 355d4e2..3bff7cd 100644
--- a/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
+++ b/libaom/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -136,9 +136,8 @@ uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode,
return false;
}
- if (!frame->is_key() &&
- !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
- reference_block_timestamp)) {
+ if (!frame->is_key() && !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock,
+ reference_block_timestamp)) {
return false;
}
diff --git a/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc
index 84655d8..d668384 100644
--- a/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc
+++ b/libaom/third_party/libwebm/mkvmuxer/mkvwriter.cc
@@ -78,6 +78,8 @@ int32 MkvWriter::Position(int64 position) {
#ifdef _MSC_VER
return _fseeki64(file_, position, SEEK_SET);
+#elif defined(_WIN32)
+ return fseeko64(file_, static_cast<off_t>(position), SEEK_SET);
#else
return fseeko(file_, static_cast<off_t>(position), SEEK_SET);
#endif
diff --git a/libaom/third_party/libwebm/mkvparser/mkvparser.cc b/libaom/third_party/libwebm/mkvparser/mkvparser.cc
index e7b76f7..9c78ead 100644
--- a/libaom/third_party/libwebm/mkvparser/mkvparser.cc
+++ b/libaom/third_party/libwebm/mkvparser/mkvparser.cc
@@ -36,8 +36,6 @@ inline bool isnan(double val) { return std::isnan(val); }
inline bool isinf(double val) { return std::isinf(val); }
#endif // MSC_COMPAT
-IMkvReader::~IMkvReader() {}
-
template <typename Type>
Type* SafeArrayAlloc(unsigned long long num_elements,
unsigned long long element_size) {
@@ -5274,6 +5272,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
long long element_size)
: Track(pSegment, element_start, element_size),
+ m_colour_space(NULL),
m_colour(NULL),
m_projection(NULL) {}
@@ -5299,6 +5298,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
long long stereo_mode = 0;
double rate = 0.0;
+ char* colour_space = NULL;
IMkvReader* const pReader = pSegment->m_pReader;
@@ -5312,7 +5312,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
const long long stop = pos + s.size;
Colour* colour = NULL;
- Projection* projection = NULL;
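+ // Held in a unique_ptr so the early error returns below cannot leak a
+ // successfully parsed projection.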
+ std::unique_ptr<Projection> projection_ptr;
while (pos < stop) {
long long id, size;
@@ -5364,8 +5364,16 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
if (!Colour::Parse(pReader, pos, size, &colour))
return E_FILE_FORMAT_INVALID;
} else if (id == libwebm::kMkvProjection) {
- if (!Projection::Parse(pReader, pos, size, &projection))
+ Projection* projection = NULL;
+ if (!Projection::Parse(pReader, pos, size, &projection)) {
return E_FILE_FORMAT_INVALID;
+ } else {
+ projection_ptr.reset(projection);
+ }
+ } else if (id == libwebm::kMkvColourSpace) {
+ const long status = UnserializeString(pReader, pos, size, colour_space);
+ if (status < 0)
+ return status;
}
pos += size; // consume payload
@@ -5397,7 +5405,8 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
pTrack->m_stereo_mode = stereo_mode;
pTrack->m_rate = rate;
pTrack->m_colour = colour;
- pTrack->m_projection = projection;
+ pTrack->m_colour_space = colour_space;
+ pTrack->m_projection = projection_ptr.release();
pResult = pTrack;
return 0; // success
diff --git a/libaom/third_party/libwebm/mkvparser/mkvparser.h b/libaom/third_party/libwebm/mkvparser/mkvparser.h
index 26c2b7e..848d01f 100644
--- a/libaom/third_party/libwebm/mkvparser/mkvparser.h
+++ b/libaom/third_party/libwebm/mkvparser/mkvparser.h
@@ -22,7 +22,7 @@ class IMkvReader {
virtual int Length(long long* total, long long* available) = 0;
protected:
- virtual ~IMkvReader();
+ virtual ~IMkvReader() {}
};
template <typename Type>
@@ -527,6 +527,8 @@ class VideoTrack : public Track {
Projection* GetProjection() const;
+ const char* GetColourSpace() const { return m_colour_space; }
+
private:
long long m_width;
long long m_height;
@@ -534,7 +536,7 @@ class VideoTrack : public Track {
long long m_display_height;
long long m_display_unit;
long long m_stereo_mode;
-
+ char* m_colour_space;
double m_rate;
Colour* m_colour;
diff --git a/libaom/third_party/libwebm/mkvparser/mkvreader.cc b/libaom/third_party/libwebm/mkvparser/mkvreader.cc
index 23d68f5..9d19c1b 100644
--- a/libaom/third_party/libwebm/mkvparser/mkvreader.cc
+++ b/libaom/third_party/libwebm/mkvparser/mkvreader.cc
@@ -118,6 +118,8 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) {
if (status)
return -1; // error
+#elif defined(_WIN32)
+ fseeko64(m_file, static_cast<off_t>(offset), SEEK_SET);
#else
fseeko(m_file, static_cast<off_t>(offset), SEEK_SET);
#endif
diff --git a/libaom/tools/txfm_analyzer/txfm_graph.h b/libaom/tools/txfm_analyzer/txfm_graph.h
index 2e3c955..8dc3614 100644
--- a/libaom/tools/txfm_analyzer/txfm_graph.h
+++ b/libaom/tools/txfm_analyzer/txfm_graph.h
@@ -23,7 +23,6 @@ struct Node {
int visited;
};
-#define PI (3.141592653589793238462643383279502884)
#define STAGENUM (10)
#define NODENUM (32)
#define COS_MOD (128)